[IBCDPE-983] Updates GX functionality to surface warnings (#161)

* bump synapseclient version * updates logic to surface warnings * updates GX unit tests * updates documentation * bring synapseclient version back to 4.5 * adds mostly to gene_info fields * pre-commit fix * remove allOf test * revert notebook change * updates docstring * adds missing type hints * fix typing * add more specific messages to test * adds gx runner docstring * synapseclient 4.4.1
Sage-Bionetworks · Dec 9, 2024 · 17485f6 · 17485f6
1 parent ae4cf82
commit 17485f6
Show file tree

Hide file tree

Showing 11 changed files with 2,263 additions and 1,986 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -186,6 +186,8 @@ This package uses [Great Expectations](https://greatexpectations.io/) to validat
    - You can prevent Great Expectations from running for a dataset by setting `gx_enabled: false` in the configuration for the dataset.
 1. Test data processing by running `adt test_config.yaml --upload` and ensure that HTML reports with all expectations are generated and uploaded to the proper folder in Synapse.
 
+**Note:** If you are adding a new expectation and want to allow for "fuzzy validation" (e.g. you expect X% of the values in a column to match the expectation, while the remaining Y% are allowed not to match), use the `mostly` [parameter](https://docs.greatexpectations.io/docs/0.18/reference/learn/expectations/standard_arguments/#mostly). When an expectation passes overall but some values still fail to match it, this package surfaces a "warning" so that users can see which expectations had failed values even though the validation passed.
+
 #### Custom Expectations
 
 This repository is currently home to three custom expectations that were created for use on `agora-data-tools` datasets:

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/gx_suite_definitions/gene_info.ipynb b/gx_suite_definitions/gene_info.ipynb
@@ -236,7 +236,7 @@
     "validator.expect_column_values_to_be_of_type(\"median_expression\", \"str\")\n",
     "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/median_expression.json\", \"r\") as file:\n",
     "    median_expression_schema = json.load(file)\n",
-    "validator.expect_column_values_to_match_json_schema(\"median_expression\", json_schema=median_expression_schema)"
+    "validator.expect_column_values_to_match_json_schema(\"median_expression\", json_schema=median_expression_schema, mostly=0.95)"
    ]
   },
   {
@@ -271,7 +271,7 @@
    "source": [
     "# biodomains\n",
     "validator.expect_column_values_to_be_of_type(\"biodomains\", \"list\")\n",
-    "validator.expect_column_values_to_have_list_members_of_type(column=\"biodomains\", member_type=\"str\")\n",
+    "validator.expect_column_values_to_have_list_members_of_type(column=\"biodomains\", member_type=\"str\", mostly=0.95)\n",
     "validator.expect_column_values_to_have_list_members(column=\"biodomains\", list_members={\n",
     "        'Apoptosis',\n",
     "        'Vasculature',\n",
@@ -350,7 +350,7 @@
    "outputs": [],
    "source": [
     "# uniprotkb_accessions\n",
-    "validator.expect_column_values_to_be_of_type(\"uniprotkb_accessions\", \"list\")\n",
+    "validator.expect_column_values_to_be_of_type(\"uniprotkb_accessions\", \"list\", mostly=0.95)\n",
     "validator.expect_column_values_to_have_list_members_of_type(column=\"uniprotkb_accessions\", member_type=\"str\")"
    ]
   },

diff --git a/setup.cfg b/setup.cfg
@@ -36,7 +36,7 @@ install_requires =
     pandas~=2.0.0
     numpy~=1.21
     setuptools~=70.0.0
-    synapseclient~=4.0.0
+    synapseclient==4.4.1
     PyYAML~=6.0
     pyarrow~=14.0.1
     typer~=0.7.0

diff --git a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json
@@ -411,7 +411,8 @@
             "array",
             "null"
           ]
-        }
+        },
+        "mostly": 0.95
       },
       "meta": {}
     },
@@ -505,7 +506,8 @@
       "expectation_type": "expect_column_values_to_have_list_members_of_type",
       "kwargs": {
         "column": "biodomains",
-        "member_type": "str"
+        "member_type": "str",
+        "mostly": 0.95
       },
       "meta": {}
     },
@@ -514,25 +516,25 @@
       "kwargs": {
         "column": "biodomains",
         "list_members": [
-          "Apoptosis",
+          "Myelination",
           "Vasculature",
-          "Lipid Metabolism",
-          "Oxidative Stress",
-          "Mitochondrial Metabolism",
-          "APP Metabolism",
-          "Proteostasis",
-          "DNA Repair",
           "Synapse",
           "Immune Response",
-          "Tau Homeostasis",
+          "DNA Repair",
+          "Autophagy",
+          "Endolysosome",
+          "Proteostasis",
+          "Mitochondrial Metabolism",
           "Cell Cycle",
           "Epigenetic",
+          "Lipid Metabolism",
           "Metal Binding and Homeostasis",
-          "Endolysosome",
-          "Structural Stabilization",
-          "Myelination",
           "RNA Spliceosome",
-          "Autophagy"
+          "Tau Homeostasis",
+          "Apoptosis",
+          "Oxidative Stress",
+          "APP Metabolism",
+          "Structural Stabilization"
         ]
       },
       "meta": {}
@@ -640,6 +642,7 @@
       "expectation_type": "expect_column_values_to_be_of_type",
       "kwargs": {
         "column": "uniprotkb_accessions",
+        "mostly": 0.95,
         "type_": "list"
       },
       "meta": {}

diff --git a/src/agoradatatools/gx.py b/src/agoradatatools/gx.py
@@ -17,10 +17,22 @@
 
 
 class GreatExpectationsRunner:
-    """Class to run great expectations on a dataset and upload the HTML report to Synapse"""
+    """Class to run great expectations on a dataset and upload the HTML report to Synapse
+
+    Attributes:
+        failures (bool): Whether or not the GX run had any failed expectations.
+        failure_message (str): Message of the GX run if any expectations failed.
+        warnings (bool): Whether or not the GX run had any warnings.
+        warning_message (str): Summary message for the GX run if any expectations had warnings.
+        report_file (str): Synapse ID of the GX report file.
+        report_version (int): Version number of the GX report file.
+        report_link (str): URL of the specific version of the GX report file.
+    """
 
     failures: bool = False
     failure_message: Optional[str] = None
+    warnings: bool = False
+    warning_message: Optional[str] = None
     report_file: Optional[str] = None
     report_version: Optional[int] = None
     report_link: Optional[str] = None
@@ -67,7 +79,7 @@ def _get_data_context_location(self) -> str:
         gx_directory = os.path.join(script_dir, "great_expectations")
         return gx_directory
 
-    def _check_if_expectation_suite_exists(self) -> bool:
+    def check_if_expectation_suite_exists(self) -> bool:
         """Checks if the expectation suite exists in the great_expectations workspace"""
         exists = (
             self.expectation_suite_name in self.context.list_expectation_suite_names()
@@ -78,7 +90,7 @@ def _check_if_expectation_suite_exists(self) -> bool:
             )
         return exists
 
-    def _get_results_path(self, checkpoint_result: CheckpointResult) -> str:
+    def get_results_path(self, checkpoint_result: CheckpointResult) -> str:
         """Gets the path to the most recent HTML report for a checkpoint,
         copies it to a Synapse-API friendly name, and returns the new path
 
@@ -106,7 +118,7 @@ def _get_results_path(self, checkpoint_result: CheckpointResult) -> str:
         shutil.copy(original_results_path, new_results_path)
         return new_results_path
 
-    def _upload_results_file_to_synapse(self, results_path: str) -> None:
+    def upload_results_file_to_synapse(self, results_path: str) -> None:
         """Uploads a results file to Synapse. Assigns class attributes associated
         with the report file.
 
@@ -148,43 +160,61 @@ def convert_nested_columns_to_json(
             df[column] = df[column].apply(json.dumps)
         return df
 
-    def get_failed_expectations(self, checkpoint_result: CheckpointResult) -> str:
-        """Gets the failed expectations from a CheckpointResult and returns them as a formatted string
+    def set_warnings_and_failures(self, checkpoint_result: CheckpointResult) -> None:
+        """Sets class attributes for warnings and failures given a CheckpointResult
 
         Args:
             checkpoint_result (CheckpointResult): CheckpointResult object
-
-        Returns:
-            fail_message (str): String with information on which fields and expectations failed
         """
+        warning_dict = {self.expectation_suite_name: {}}
         fail_dict = {self.expectation_suite_name: {}}
         expectation_results = checkpoint_result.list_validation_results()[0]["results"]
+
         for result in expectation_results:
-            if not result["success"]:
-                column = result["expectation_config"]["kwargs"]["column"]
-                failed_expectation = result["expectation_config"]["expectation_type"]
-                if not fail_dict[self.expectation_suite_name].get(column, None):
-                    fail_dict[self.expectation_suite_name][column] = []
-                fail_dict[self.expectation_suite_name][column].append(
-                    failed_expectation
+            column = result["expectation_config"]["kwargs"].get(
+                "column",
+                "/".join(result["expectation_config"]["kwargs"].get("column_list", [])),
+            )
+            expectation = result["expectation_config"]["expectation_type"]
+            if result["success"]:
+                if result["result"].get("partial_unexpected_list", None):
+                    warning_dict[self.expectation_suite_name].setdefault(
+                        column, []
+                    ).append(expectation)
+            else:
+                fail_dict[self.expectation_suite_name].setdefault(column, []).append(
+                    expectation
                 )
+
+        self.warning_message, self.warnings = self._generate_message(
+            warning_dict, "warnings"
+        )
+        self.failure_message, self.failures = self._generate_message(
+            fail_dict, "failures"
+        )
+
+    def _generate_message(
+        self, result_dict: dict, message_type: str
+    ) -> typing.Tuple[str, bool]:
+        """Generate message and status for warnings or failures."""
         messages = []
-        for _, fields_dict in fail_dict.items():
-            for field, failed_expectations in fields_dict.items():
+        for suite_name, fields_dict in result_dict.items():
+            for field, expectations in fields_dict.items():
                 messages.append(
-                    f"{field} has failed expectations {', '.join(failed_expectations)}"
+                    f"In the {suite_name} dataset, '{field}' has failed values for expectations {', '.join(expectations)}"
                 )
-
-        fail_message = ("Great Expectations data validation has failed: ") + "; ".join(
-            messages
+        message = (
+            (f"Great Expectations data validation has the following {message_type}: ")
+            + "; ".join(messages)
+            if messages
+            else None
         )
-
-        return fail_message
+        return message, bool(message)
 
     def run(self) -> None:
         """Run great expectations on a dataset and upload the results to Synapse."""
 
-        if not self._check_if_expectation_suite_exists():
+        if not self.check_if_expectation_suite_exists():
             return
 
         logger.info(f"Running data validation on {self.expectation_suite_name}")
@@ -209,11 +239,9 @@ def run(self) -> None:
         logger.info(
             f"Data validation complete for {self.expectation_suite_name}. Uploading results to Synapse."
         )
-        latest_reults_path = self._get_results_path(checkpoint_result)
+        latest_reults_path = self.get_results_path(checkpoint_result)
 
-        if self.upload_folder:
-            self._upload_results_file_to_synapse(latest_reults_path)
+        self.set_warnings_and_failures(checkpoint_result)
 
-        if not checkpoint_result.success:
-            self.failures = True
-            self.failure_message = self.get_failed_expectations(checkpoint_result)
+        if self.upload_folder:
+            self.upload_results_file_to_synapse(latest_reults_path)
diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py
@@ -160,6 +160,8 @@ def process_dataset(
             ),
             gx_failures=gx_runner.failures,
             gx_failure_message=gx_runner.failure_message,
+            gx_warnings=gx_runner.warnings,
+            gx_warning_message=gx_runner.warning_message,
         )
 
         if upload and not gx_runner.failures:

diff --git a/src/agoradatatools/reporter.py b/src/agoradatatools/reporter.py
@@ -22,6 +22,8 @@ class DatasetReport:
         gx_report_link: URL of the specific version of the GX report file.
         gx_failures: Whether or not the GX run had any failed expectations.
         gx_failure_message: Message of the GX run if any expectations failed.
+        gx_warnings: Whether or not the GX run had any warnings.
+        gx_warning_message: Summary message for the GX run if any expectations had warnings.
         adt_output_file: Synapse ID of the ADT output file.
         adt_output_version: Version number of the ADT output file.
         adt_output_link: URL of the specific version of the ADT output file.
@@ -39,6 +41,8 @@ class DatasetReport:
     gx_report_link: Optional[str] = field(default=None)
     gx_failures: Optional[bool] = field(default=False)
     gx_failure_message: Optional[str] = field(default=None)
+    gx_warnings: Optional[bool] = field(default=False)
+    gx_warning_message: Optional[str] = field(default=None)
     adt_output_file: Optional[str] = field(default=None)
     adt_output_version: Optional[int] = field(default=None)
     adt_output_link: Optional[str] = field(default=None)

diff --git a/tests/test_assets/gx/checkpoint_result_fail.json b/tests/test_assets/gx/checkpoint_result_fail.json
@@ -64,7 +64,7 @@
                         }
                     },
                     {
-                        "success": false,
+                        "success": true,
                         "expectation_config": {
                             "expectation_type": "expect_column_value_lengths_to_equal",
                             "kwargs": {

diff --git a/tests/test_assets/gx/checkpoint_result_pass.json b/tests/test_assets/gx/checkpoint_result_pass.json
@@ -76,11 +76,9 @@
                         },
                         "result": {
                             "element_count": 15991,
-                            "unexpected_count": 1,
-                            "unexpected_percent": 0.006253517603652055,
-                            "partial_unexpected_list": [
-                                "ENSG00"
-                            ],
+                            "unexpected_count": 0,
+                            "unexpected_percent": 0.0,
+                            "partial_unexpected_list": [],
                             "missing_count": 0,
                             "missing_percent": 0.0,
                             "unexpected_percent_total": 0.006253517603652055,
@@ -115,11 +113,9 @@
                         },
                         "result": {
                             "element_count": 15991,
-                            "unexpected_count": 1,
-                            "unexpected_percent": 0.006253517603652055,
-                            "partial_unexpected_list": [
-                                "ENSG00"
-                            ],
+                            "unexpected_count": 0,
+                            "unexpected_percent": 0.0,
+                            "partial_unexpected_list": [],
                             "missing_count": 0,
                             "missing_percent": 0.0,
                             "unexpected_percent_total": 0.006253517603652055,