Sage-Bionetworks · BWMac · Nov 22, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023
diff --git a/great_expectations/gx/.gitignore b/great_expectations/gx/.gitignore
@@ -0,0 +1,3 @@
+
+uncommitted/
+.ge_store_backend_id
diff --git a/great_expectations/gx/great_expectations.yml b/great_expectations/gx/great_expectations.yml
@@ -0,0 +1,102 @@
+
+# Welcome to Great Expectations! Always know what to expect from your data.
+#
+# Here you can define datasources, batch kwargs generators, integrations and
+# more. This file is intended to be committed to your repo. For help with
+# configuration please:
+#   - Read our docs: https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview/#2-configure-your-datasource
+#   - Join our slack channel: http://greatexpectations.io/slack
+
+# config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility
+# It is auto-generated and usually does not need to be changed.
+config_version: 3
+
+# Datasources tell Great Expectations where your data lives and how to get it.
+# Read more at https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview
+datasources: {}
+
+# This config file supports variable substitution which enables: 1) keeping
+# secrets out of source control & 2) environment-based configuration changes
+# such as staging vs prod.
+#
+# When GX encounters substitution syntax (like `my_key: ${my_value}` or
+# `my_key: $my_value`) in the great_expectations.yml file, it will attempt
+# to replace the value of `my_key` with the value from an environment
+# variable `my_value` or a corresponding key read from this config file,
+# which is defined through the `config_variables_file_path`.
+# Environment variables take precedence over variables defined here.
+#
+# Substitution values defined here can be a simple (non-nested) value,
+# nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR})
+#
+#
+# https://docs.greatexpectations.io/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials
+
+
+config_variables_file_path: uncommitted/config_variables.yml
+
+# The plugins_directory will be added to your python path for custom modules
+# used to override and extend Great Expectations.
+plugins_directory: plugins/
+
+stores:
+# Stores are configurable places to store things like Expectations, Validations
+# Data Docs, and more. These are for advanced users only - most users can simply
+# leave this section alone.
+#
+# Three stores are required: expectations, validations, and
+# evaluation_parameters, and must exist with a valid store entry. Additional
+# stores can be configured for uses such as data_docs, etc.
+  expectations_store:
+    class_name: ExpectationsStore
+    store_backend:
+      class_name: TupleFilesystemStoreBackend
+      base_directory: expectations/
+
+  validations_store:
+    class_name: ValidationsStore
+    store_backend:
+      class_name: TupleFilesystemStoreBackend
+      base_directory: uncommitted/validations/
+
+  evaluation_parameter_store:
+    # Evaluation Parameters enable dynamic expectations. Read more here:
+    # https://docs.greatexpectations.io/docs/reference/evaluation_parameters/
+    class_name: EvaluationParameterStore
+
+  checkpoint_store:
+    class_name: CheckpointStore
+    store_backend:
+      class_name: TupleFilesystemStoreBackend
+      suppress_store_backend_id: true
+      base_directory: checkpoints/
+
+  profiler_store:
+    class_name: ProfilerStore
+    store_backend:
+      class_name: TupleFilesystemStoreBackend
+      suppress_store_backend_id: true
+      base_directory: profilers/
+
+expectations_store_name: expectations_store
+validations_store_name: validations_store
+evaluation_parameter_store_name: evaluation_parameter_store
+checkpoint_store_name: checkpoint_store
+
+data_docs_sites:
+  # Data Docs make it simple to visualize data quality in your project. These
+  # include Expectations, Validations & Profiles. They are built for all
+  # Datasources from JSON artifacts in the local repo including validations &
+  # profiles from the uncommitted directory. Read more at https://docs.greatexpectations.io/docs/terms/data_docs
+  local_site:
+    class_name: SiteBuilder
+    # set to false to hide how-to buttons in Data Docs
+    show_how_to_buttons: true
+    store_backend:
+        class_name: TupleFilesystemStoreBackend
+        base_directory: uncommitted/data_docs/local_site/
+    site_index_builder:
+        class_name: DefaultSiteIndexBuilder
+
+anonymous_usage_statistics:
+  enabled: True
diff --git a/great_expectations/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css b/great_expectations/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css
@@ -0,0 +1,22 @@
+/*index page*/
+.ge-index-page-site-name-title {}
+.ge-index-page-table-container {}
+.ge-index-page-table {}
+.ge-index-page-table-profiling-links-header {}
+.ge-index-page-table-expectations-links-header {}
+.ge-index-page-table-validations-links-header {}
+.ge-index-page-table-profiling-links-list {}
+.ge-index-page-table-profiling-links-item {}
+.ge-index-page-table-expectation-suite-link {}
+.ge-index-page-table-validation-links-list {}
+.ge-index-page-table-validation-links-item {}
+
+/*breadcrumbs*/
+.ge-breadcrumbs {}
+.ge-breadcrumbs-item {}
+
+/*navigation sidebar*/
+.ge-navigation-sidebar-container {}
+.ge-navigation-sidebar-content {}
+.ge-navigation-sidebar-title {}
+.ge-navigation-sidebar-link {}
diff --git a/great_expectations/gx/plugins/expectations/expect_column_values_to_have_list_length.py b/great_expectations/gx/plugins/expectations/expect_column_values_to_have_list_length.py
@@ -0,0 +1,142 @@
+import pandas as pd
+from typing import Optional, Any
+
+from great_expectations.core.expectation_configuration import ExpectationConfiguration
+from great_expectations.execution_engine import PandasExecutionEngine
+from great_expectations.expectations.expectation import ColumnMapExpectation
+from great_expectations.expectations.metrics import (
+    ColumnMapMetricProvider,
+    column_condition_partial,
+)
+
+
+# This class defines a Metric to support your Expectation.
+# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
+class ColumnValuesListLength(ColumnMapMetricProvider):
+    """Class definition for list length checking metric."""
+
+    # This is the id string that will be used to reference your metric.
+    condition_metric_name = "column_values.list_length"
+    condition_value_keys = ("list_length",)
+
+    # This method implements the core logic for the PandasExecutionEngine
+    @column_condition_partial(engine=PandasExecutionEngine)
+    def _pandas(cls, column: pd.core.series.Series, list_length: int, **kwargs) -> bool:
+        """Core logic for list length checking metric on a
+        pandas execution engine.
+
+        Args:
+            column (pd.core.series.Series): Pandas column to be evaluated.
+            list_length (int): Expected list length.
+        Returns:
+            bool: Whether or not the column values have the expected list length.
+        """
+        return column.apply(lambda x: cls._check_list_length(x, list_length))
+
+    @staticmethod
+    def _check_list_length(cell: Any, list_length: int) -> bool:
+        """Check if a cell is a list, and if it has the expected length.
+
+        Args:
+            cell (Any): Individual cell to be evaluated.
+            list_length (int): Expected list length.
+
+        Returns:
+            bool: Whether or not the cell is a list with the expected length.
+        """
+        if not isinstance(cell, list):
+            return False
+        if len(cell) != list_length:
+            return False
+        return True
+
+    # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine
+    # @column_condition_partial(engine=SqlAlchemyExecutionEngine)
+    # def _sqlalchemy(cls, column, _dialect, **kwargs):
+    #     raise NotImplementedError
+
+    # This method defines the business logic for evaluating your metric when using a SparkDFExecutionEngine
+    # @column_condition_partial(engine=SparkDFExecutionEngine)
+    # def _spark(cls, column, **kwargs):
+    #     raise NotImplementedError
+
+
+# This class defines the Expectation itself
+class ExpectColumnValuesToHaveListLength(ColumnMapExpectation):
+    """Expect each column value to be a list with exactly `list_length` elements."""
+
+    # These examples will be shown in the public gallery.
+    # They will also be executed as unit tests for your Expectation.
+    examples = [
+        {
+            "data": {
+                "a": [[1, 2, 3, 4, 5]],
+            },
+            "tests": [
+                {
+                    "title": "positive_test_with_list_length_5",
+                    "exact_match_out": False,
+                    "include_in_gallery": True,
+                    "in": {"column": "a", "list_length": 5},
+                    "out": {"success": True},
+                },
+                {
+                    "title": "negative_test_with_list_length_5",
+                    "exact_match_out": False,
+                    "include_in_gallery": True,
+                    "in": {"column": "a", "list_length": 4},
+                    "out": {"success": False},
+                },
+            ],
+        }
+    ]
+
+    # This is the id string of the Metric used by this Expectation.
+    # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
+    map_metric = "column_values.list_length"
+
+    # This is a list of parameter names that can affect whether the Expectation evaluates to True or False
+    success_keys = ("list_length",)
+
+    # This dictionary contains default values for any parameters that should have default values
+    default_kwarg_values = {}
+
+    def validate_configuration(
+        self, configuration: Optional[ExpectationConfiguration] = None
+    ) -> None:
+        """Validate the expectation configuration by delegating to the parent class.
+
+        No expectation-specific checks are implemented yet (see the template below).
+
+        Args:
+            configuration (Optional[ExpectationConfiguration]): Configuration to
+                validate; the local name falls back to `self.configuration` if falsy.
+        Returns:
+            None
+        """
+
+        super().validate_configuration(configuration)
+        configuration = configuration or self.configuration
+
+        # # Check other things in configuration.kwargs and raise Exceptions if needed
+        # try:
+        #     assert (
+        #         ...
+        #     ), "message"
+        #     assert (
+        #         ...
+        #     ), "message"
+        # except AssertionError as e:
+        #     raise InvalidExpectationConfigurationError(str(e))
+
+    # This object contains metadata for display in the public Gallery
+    library_metadata = {
+        "tags": [],  # Tags for this Expectation in the Gallery
+        "contributors": [  # Github handles for all contributors to this Expectation.
+            "@BWMac",  # Don't forget to add your github handle here!
+        ],
+    }
+
+
+if __name__ == "__main__":
+    ExpectColumnValuesToHaveListLength().print_diagnostic_checklist()
diff --git a/gx_metabolomics.py b/gx_metabolomics.py
@@ -0,0 +1,94 @@
+import great_expectations as gx
+from great_expectations.data_context import FileDataContext
+
+context = FileDataContext.create(project_root_dir="great_expectations")
+
+from expectations.expect_column_values_to_have_list_length import (
+    ExpectColumnValuesToHaveListLength,
+)
+
+test_dataset = "./metabolomics.json"
+context = gx.get_context()
+validator = context.sources.pandas_default.read_json(test_dataset)
+
+# ad_diagnosis_p_value
+validator.expect_column_values_to_be_of_type("ad_diagnosis_p_value", "list")
+validator.expect_column_values_to_not_be_null("ad_diagnosis_p_value")
+# for custom and experimental expectations you have to pass args as kwargs
+validator.expect_column_values_to_have_list_length(
+    column="ad_diagnosis_p_value", list_length=1
+)
+
+# associated gene name
+validator.expect_column_values_to_be_of_type("associated_gene_name", "str")
+validator.expect_column_values_to_not_be_null("associated_gene_name")
+validator.expect_column_value_lengths_to_be_between(
+    "associated_gene_name", min_value=1, max_value=25
+)
+# allows all alphanumeric characters, underscores, periods, and dashes
+validator.expect_column_values_to_match_regex(
+    "associated_gene_name", "^[A-Za-z0-9_.-]+$"
+)
+
+# association p
+validator.expect_column_values_to_be_of_type("association_p", "float")
+validator.expect_column_values_to_not_be_null("association_p")
+validator.expect_column_values_to_be_between("association_p", min_value=0, max_value=1)
+
+# ensembl gene id
+validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
+validator.expect_column_values_to_not_be_null("ensembl_gene_id")
+validator.expect_column_value_lengths_to_equal("ensembl_gene_id", 15)
+# checks format and allowed chatacters
+validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")
+validator.expect_column_values_to_be_unique("ensembl_gene_id")
+
+# gene_wide_p_threshold_1kgp
+validator.expect_column_values_to_be_of_type("gene_wide_p_threshold_1kgp", "float")
+validator.expect_column_values_to_not_be_null("gene_wide_p_threshold_1kgp")
+validator.expect_column_values_to_be_between(
+    "gene_wide_p_threshold_1kgp", min_value=0, max_value=0.05
+)
+
+# metabolite full name
+validator.expect_column_values_to_be_of_type("metabolite_full_name", "str")
+validator.expect_column_values_to_not_be_null("metabolite_full_name")
+validator.expect_column_value_lengths_to_be_between(
+    "metabolite_full_name", min_value=1, max_value=25
+)
+# allows all alphanumeric characters, dashes, parentheses, hyphens and spaces
+validator.expect_column_values_to_match_regex(
+    "metabolite_full_name", "^[A-Za-z0-9\s\-:.()+]+$"
+)
+
+# metabolite ID
+validator.expect_column_values_to_be_of_type("metabolite_id", "str")
+validator.expect_column_values_to_not_be_null("metabolite_id")
+validator.expect_column_value_lengths_to_be_between(
+    "metabolite_id", min_value=1, max_value=15
+)
+# allows all alphanumeric characters and periods
+validator.expect_column_values_to_match_regex("metabolite_id", "^[A-Za-z0-9.]+$")
+
+# n_per_group
+validator.expect_column_values_to_be_of_type("n_per_group", "list")
+validator.expect_column_values_to_not_be_null("n_per_group")
+validator.expect_column_values_to_have_list_length(column="n_per_group", list_length=2)
+
+# transposed_boxplot_stats
+validator.expect_column_values_to_be_of_type("transposed_boxplot_stats", "list")
+validator.expect_column_values_to_not_be_null("transposed_boxplot_stats")
+validator.expect_column_values_to_have_list_length(
+    column="transposed_boxplot_stats", list_length=2
+)
+
+# save expectation suite and run checkpoint
+validator.save_expectation_suite()
+checkpoint = context.add_or_update_checkpoint(
+    name="agora-test-checkpoint",
+    validator=validator,
+)
+checkpoint_result = checkpoint.run()
+
+# generate and open report
+context.view_validation_result(checkpoint_result)