diff --git a/config.yaml b/config.yaml index 7e17cd5d..fe4d4c8d 100644 --- a/config.yaml +++ b/config.yaml @@ -62,6 +62,7 @@ datasets: ensembl_id: ensembl_gene_id goterm_name: go_terms destination: *dest + gx_folder: syn53127958 - neuropath_corr: files: diff --git a/gx_suite_definitions/genes_biodomains.ipynb b/gx_suite_definitions/genes_biodomains.ipynb new file mode 100644 index 00000000..23eeea6d --- /dev/null +++ b/gx_suite_definitions/genes_biodomains.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import synapseclient\n", + "\n", + "import great_expectations as gx\n", + "\n", + "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n", + "\n", + "from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength\n", + "from expectations.expect_column_values_to_have_list_length_in_range import ExpectColumnValuesToHaveListLengthInRange\n", + "from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers\n", + "from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType\n", + "from expectations.expect_column_values_to_have_list_of_dict_with_expected_values import ExpectColumnValuesToHaveListOfDictWithExpectedValues\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Expectation Suite for Genes Biodomains Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Example Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "syn = synapseclient.Synapse()\n", + "syn.login()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "genes_biodomains_data_file = syn.get(\"syn51062085\").path\n" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Validator Object on Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator = context.sources.pandas_default.read_json(\n", + " genes_biodomains_data_file\n", + ")\n", + "validator.expectation_suite_name = \"genes_biodomains\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Expectations to Validator Object For Each Column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ensembl_gene_id\n", + "validator.expect_column_values_to_be_of_type(\"ensembl_gene_id\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"ensembl_gene_id\")\n", + "validator.expect_column_value_lengths_to_equal(\"ensembl_gene_id\", 15)\n", + "# checks format and allowed chatacters\n", + "validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", \"^ENSG\\d{11}$\")\n", + "validator.expect_column_values_to_be_unique(\"ensembl_gene_id\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# gene_biodomains\n", + "validator.expect_column_values_to_be_of_type(\"gene_biodomains\", \"list\")\n", + "validator.expect_column_values_to_not_be_null(\"gene_biodomains\")\n", + "validator.expect_column_values_to_have_list_length_in_range(column=\"gene_biodomains\", list_length_range=[1, 19])\n", + "validator.expect_column_values_to_have_list_members_of_type(column=\"gene_biodomains\", member_type=\"dict\")\n", + "biodomain_list = ['Apoptosis', 'Vasculature', 'Lipid Metabolism', 'Proteostasis', 'Immune Response', 'Autophagy', 'Mitochondrial Metabolism', 'Structural Stabilization', 'Synapse', 'Endolysosome', 'Metal Binding and Homeostasis', 'Oxidative Stress', 'Epigenetic', 'APP Metabolism', 'Cell Cycle', 'DNA Repair', 'RNA Spliceosome', 'Tau Homeostasis', 'Myelination']\n", + 
"validator.expect_column_values_to_have_list_of_dict_with_expected_values(column=\"gene_biodomains\", list_dict_values={\"key\": \"biodomain\", \"values\": biodomain_list})\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Expectation Suite" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator.save_expectation_suite(discard_failed_expectations=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Checkpoint and View Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint = context.add_or_update_checkpoint(\n", + " name=\"agora-test-checkpoint\",\n", + " validator=validator,\n", + ")\n", + "checkpoint_result = checkpoint.run()\n", + "context.view_validation_result(checkpoint_result)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build Data Docs - Click on Expectation Suite to View All Expectations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context.build_data_docs()\n", + "context.open_data_docs()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/agoradatatools/great_expectations/gx/checkpoints/genes_biodomains.yml b/src/agoradatatools/great_expectations/gx/checkpoints/genes_biodomains.yml new file mode 100644 index 00000000..f235a388 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/checkpoints/genes_biodomains.yml @@ -0,0 +1,30 @@ 
+name: genes_biodomains +config_version: 1.0 +template_name: +module_name: great_expectations.checkpoint +class_name: Checkpoint +run_name_template: +expectation_suite_name: +batch_request: {} +action_list: + - name: store_validation_result + action: + class_name: StoreValidationResultAction + - name: store_evaluation_params + action: + class_name: StoreEvaluationParametersAction + - name: update_data_docs + action: + class_name: UpdateDataDocsAction +evaluation_parameters: {} +runtime_configuration: {} +validations: + - batch_request: + datasource_name: default_pandas_datasource + data_asset_name: '#ephemeral_pandas_asset' + options: {} + batch_slice: + expectation_suite_name: genes_biodomains +profilers: [] +ge_cloud_id: +expectation_suite_ge_cloud_id: diff --git a/src/agoradatatools/great_expectations/gx/expectations/genes_biodomains.json b/src/agoradatatools/great_expectations/gx/expectations/genes_biodomains.json new file mode 100644 index 00000000..712f0703 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/expectations/genes_biodomains.json @@ -0,0 +1,113 @@ +{ + "data_asset_type": null, + "expectation_suite_name": "genes_biodomains", + "expectations": [ + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "ensembl_gene_id", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "ensembl_gene_id" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_value_lengths_to_equal", + "kwargs": { + "column": "ensembl_gene_id", + "value": 15 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "ensembl_gene_id", + "regex": "^ENSG\\d{11}$" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "ensembl_gene_id" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": 
"gene_biodomains", + "type_": "list" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "gene_biodomains" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_have_list_length_in_range", + "kwargs": { + "column": "gene_biodomains", + "list_length_range": [ + 1, + 19 + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_have_list_members_of_type", + "kwargs": { + "column": "gene_biodomains", + "member_type": "dict" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_have_list_of_dict_with_expected_values", + "kwargs": { + "column": "gene_biodomains", + "list_dict_values": { + "key": "biodomain", + "values": [ + "Apoptosis", + "Vasculature", + "Lipid Metabolism", + "Proteostasis", + "Immune Response", + "Autophagy", + "Mitochondrial Metabolism", + "Structural Stabilization", + "Synapse", + "Endolysosome", + "Metal Binding and Homeostasis", + "Oxidative Stress", + "Epigenetic", + "APP Metabolism", + "Cell Cycle", + "DNA Repair", + "RNA Spliceosome", + "Tau Homeostasis", + "Myelination" + ] + } + }, + "meta": {} + } + ], + "ge_cloud_id": null, + "meta": { + "great_expectations_version": "0.18.1" + } +} \ No newline at end of file diff --git a/src/agoradatatools/great_expectations/gx/plugins/expectations/expect_column_values_to_have_list_length_in_range.py b/src/agoradatatools/great_expectations/gx/plugins/expectations/expect_column_values_to_have_list_length_in_range.py new file mode 100644 index 00000000..c2a84943 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/plugins/expectations/expect_column_values_to_have_list_length_in_range.py @@ -0,0 +1,128 @@ +import pandas as pd +from typing import Optional, Any + +from great_expectations.core.expectation_configuration import ExpectationConfiguration +from great_expectations.execution_engine import PandasExecutionEngine +from great_expectations.expectations.expectation import 
# This class defines a Metric to support your Expectation.
# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
class ColumnValuesListLengthInRange(ColumnMapMetricProvider):
    """Row-wise metric: is the cell a list whose length lies within a given range?"""

    # This is the id string that will be used to reference your metric.
    condition_metric_name = "column_values.list_length_range"
    # Expectation kwargs that are forwarded to the metric implementation below.
    condition_value_keys = ("list_length_range",)

    # This method implements the core logic for the PandasExecutionEngine
    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(
        cls, column: pd.core.series.Series, list_length_range: list, **kwargs
    ) -> pd.core.series.Series:
        """Evaluate the list-length-in-range condition on a pandas column.

        Args:
            column (pd.core.series.Series): Pandas column to be evaluated.
            list_length_range (list): list of length 2 containing the minimum
                and maximum allowed lengths (inclusive, in either order).
        Returns:
            pd.core.series.Series: One boolean per cell, True where the cell is
                a list whose length falls inside the range.
        Raises:
            ValueError: If ``list_length_range`` does not have exactly two items.
        """
        return column.apply(
            lambda x: cls._check_list_length_in_range(x, list_length_range)
        )

    @staticmethod
    def _check_list_length_in_range(cell: Any, list_length_range: list) -> bool:
        """Check if a cell is a list whose length is within the allowed range.

        Args:
            cell (Any): Individual cell to be evaluated.
            list_length_range (list): list of length 2 containing the minimum
                and maximum allowed lengths (inclusive, in either order).
        Returns:
            bool: True if the cell is a list with a length inside the range,
                False otherwise (including when the cell is not a list).
        Raises:
            ValueError: If ``list_length_range`` does not have exactly two items.
        """
        # Validate the configuration before looking at the data so that a
        # malformed range is reported as a configuration error instead of being
        # masked as a row failure when the cell happens not to be a list.
        if len(list_length_range) != 2:
            raise ValueError(
                "list_length_range must be a list of length 2 containing the minimum and maximum allowed lengths."
            )
        if not isinstance(cell, list):
            return False
        # min()/max() make the check independent of the order of the two bounds.
        return min(list_length_range) <= len(cell) <= max(list_length_range)
# This class defines a Metric to support your Expectation.
# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
class ColumnValuesListOfDictWithExpectedValues(ColumnMapMetricProvider):
    """Row-wise metric: is the cell a list of dicts whose value for a given key is allowed?"""

    # This is the id string that will be used to reference your metric.
    condition_metric_name = "column_values.list_dict_values"
    # Expectation kwargs that are forwarded to the metric implementation below.
    condition_value_keys = ("list_dict_values",)

    # This method implements the core logic for the PandasExecutionEngine
    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(
        cls, column: pd.core.series.Series, list_dict_values: dict, **kwargs
    ) -> pd.core.series.Series:
        """Evaluate the list-of-dict expected-values condition on a pandas column.

        Args:
            column (pd.core.series.Series): Pandas column to be evaluated.
            list_dict_values (dict): Dictionary containing the key to check
                (under "key") and a list of the values it is allowed to have
                (under "values").
        Returns:
            pd.core.series.Series: One boolean per cell, True where every dict
                in the cell's list has an allowed value for the key.
        Raises:
            ValueError: If ``list_dict_values`` is not a dict with both "key"
                and "values" entries.
        """
        return column.apply(
            lambda x: cls._check_list_of_dict_has_expected_values(x, list_dict_values)
        )

    @staticmethod
    def _check_list_of_dict_has_expected_values(
        cell: Any, list_dict_values: dict
    ) -> bool:
        """Check if a cell is a list of dicts that all hold an allowed value for a key.

        Args:
            cell (Any): Individual cell to be evaluated.
            list_dict_values (dict): Dictionary containing the key to check
                (under "key") and a list of the values it is allowed to have
                (under "values").
        Returns:
            bool: True if the cell is a list, every member is a dict, and each
                dict contains the key with one of the allowed values. False
                otherwise (non-list cell, non-dict member, missing key, or
                value outside the allowed list).
        Raises:
            ValueError: If ``list_dict_values`` is not a dict with both "key"
                and "values" entries.
        """
        # Any one of these problems makes the configuration unusable, so the
        # conditions are joined with `or`; with `and` the guard never fired for
        # a dict that was merely missing one of the entries.
        if (
            not isinstance(list_dict_values, dict)
            or "key" not in list_dict_values
            or "values" not in list_dict_values
        ):
            raise ValueError(
                "list_dict_values must be a dict which contains 'key' (string) and 'values' (list)."
            )
        if not isinstance(cell, list):
            return False
        key = list_dict_values["key"]
        allowed_values = list_dict_values["values"]
        # A dict lacking the key fails the row instead of raising KeyError, so
        # one malformed record is reported as an unexpected value rather than
        # aborting the whole validation run.
        return all(
            isinstance(item, dict) and key in item and item[key] in allowed_values
            for item in cell
        )
+ # They will also be executed as unit tests for your Expectation. + examples = [ + { + "data": { + "a": [[{"a": "b"}]], + }, + "tests": [ + { + "title": "positive_test", + "exact_match_out": False, + "include_in_gallery": True, + "in": { + "column": "a", + "list_dict_values": {"key": "a", "values": ["b", "c", "d"]}, + }, + "out": {"success": True}, + }, + { + "title": "negative_test", + "exact_match_out": False, + "include_in_gallery": True, + "in": { + "column": "a", + "list_dict_values": {"key": "a", "values": ["c", "d"]}, + }, + "out": {"success": False}, + }, + ], + } + ] + + # This is the id string of the Metric used by this Expectation. + # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above. + map_metric = "column_values.list_dict_values" + + # This is a list of parameter names that can affect whether the Expectation evaluates to True or False + success_keys = ("list_dict_values",) + + # This dictionary contains default values for any parameters that should have default values + default_kwarg_values = {} + + def validate_configuration( + self, configuration: Optional[ExpectationConfiguration] = None + ) -> None: + """ + Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that + necessary configuration arguments have been provided for the validation of the expectation. + + Args: + configuration (OPTIONAL[ExpectationConfiguration]): \ + An optional Expectation Configuration entry that will be used to configure the expectation + Returns: + None. 
Raises InvalidExpectationConfigurationError if the config is not validated successfully + """ + + super().validate_configuration(configuration) + configuration = configuration or self.configuration + + # This object contains metadata for display in the public Gallery + library_metadata = { + "tags": [], # Tags for this Expectation in the Gallery + "contributors": [ # Github handles for all contributors to this Expectation. + "@BWMac", # Don't forget to add your github handle here! + ], + } + + +if __name__ == "__main__": + ExpectColumnValuesToHaveListOfDictWithExpectedValues().print_diagnostic_checklist() diff --git a/src/agoradatatools/gx.py b/src/agoradatatools/gx.py index d9597a22..b9e10d4c 100644 --- a/src/agoradatatools/gx.py +++ b/src/agoradatatools/gx.py @@ -35,9 +35,15 @@ def __init__( from expectations.expect_column_values_to_have_list_members import ( ExpectColumnValuesToHaveListMembers, ) + from expectations.expect_column_values_to_have_list_length_in_range import ( + ExpectColumnValuesToHaveListLengthInRange, + ) from expectations.expect_column_values_to_have_list_members_of_type import ( ExpectColumnValuesToHaveListMembersOfType, ) + from expectations.expect_column_values_to_have_list_of_dict_with_expected_values import ( + ExpectColumnValuesToHaveListOfDictWithExpectedValues, + ) def _get_data_context_location(self) -> str: """Gets the path to the great_expectations directory""" diff --git a/test_config.yaml b/test_config.yaml index 6bee2f43..be97915d 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -62,6 +62,7 @@ datasets: ensembl_id: ensembl_gene_id goterm_name: go_terms destination: *dest + gx_folder: syn53127956 - neuropath_corr: files: