Merge pull request #96 from Sage-Bionetworks/bwmac/IBCDPE-688/gx-meta…

…bolomics [IBCDPE-688] Great Expectations Implementation for Metabolomics Data
Sage-Bionetworks · Nov 22, 2023 · d4d76fa · d4d76fa
2 parents 0ab3803 + 0c680cb
commit d4d76fa
Show file tree

Hide file tree

Showing 27 changed files with 5,039 additions and 677 deletions.
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
@@ -21,8 +21,9 @@ jobs:
         python-version:
           - "3.8"
           - "3.9"
-          - "3.10"
-          - "3.11"
+          # Support for Python 3.10 and 3.11 is temporarily disabled
+          # - "3.10"
+          # - "3.11"
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}

diff --git a/.gitignore b/.gitignore
@@ -75,10 +75,6 @@ docs/_build/
 # PyBuilder
 target/
 
-# Jupyter Notebook
-.ipynb_checkpoints
-*.ipynb
-
 # IPython
 profile_default/
 ipython_config.py

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -139,6 +139,25 @@ This package has a `src/agoradatatools/etl/transform` submodule.  This folder ho
     - Use `pytest.mark.parametrize` to loop through multiple datasets in a single test.
     - The class `TestTransformGenesBiodomains` can be used as an example for future tests contributed.
 
+### Great Expectations
+
+This package uses [Great Expectations](https://greatexpectations.io/) to validate output data.  The `src/agoradatatools/great_expectations` folder houses our file system data context and Great Expectations-specific configuration files. Eventually, our goal is for each `agora-data-tools` output dataset to be covered by an expectation suite. To add data validation for more datasets, follow these steps:
+
+1. Create a new expectation suite by defining the expectations for the new dataset in a Jupyter Notebook inside the `gx_suite_definitions` folder. Use `metabolomics.ipynb` as an example. You can find a catalog of existing expectations [here](https://greatexpectations.io/expectations/).
+1. Run the notebook to generate the new expectation suite. It should populate as a JSON file in the `/great_expectations/expectations` folder.
+1. Add support for running Great Expectations on a dataset by adding the `gx_folder` key to the configuration for the dataset in both `test_config.yaml` and `config.yaml`. The `gx_folder` should be the Synapse ID of the folder where generated HTML reports from Great Expectations for that dataset should be uploaded. If a folder specific to your dataset does not yet exist in the proper locations ([Prod](https://www.synapse.org/#!Synapse:syn52948668), [Testing](https://www.synapse.org/#!Synapse:syn52948670)), create folders with the same name as the dataset itself and copy the new folders' Synapse IDs to the config files.
+1. Test data processing by running `adt test_config.yaml` and ensure that HTML reports with all expectations are generated and uploaded to the proper folder in Synapse.
+
+#### Custom Expectations
+
+This repository is currently home to three custom expectations that were created for use on `agora-data-tools` datasets:
+
+1. `ExpectColumnValuesToHaveListLength`: checks to see if the lists in a particular column are the length that we expect.
+1. `ExpectColumnValuesToHaveListMembers`: checks to see if the lists in a particular column contain only values that we expect.
+1. `ExpectColumnValuesToHaveListMembersOfType`: checks to see if the lists in a particular column contain members of the type we expect.
+
+These expectations are defined in the `/great_expectations/gx/plugins/expectations` folder. To add more custom expectations, follow the instructions [here](https://docs.greatexpectations.io/docs/guides/expectations/custom_expectations_lp).
+
 ### DockerHub
 
 Rather than using GitHub actions to build and push Docker images to DockerHub, the Docker images are automatically built in DockerHub. This requires the `sagebiodockerhub` GitHub user to be an Admin of this repo. You can view the docker build [here](https://hub.docker.com/r/sagebionetworks/agora-data-tools).
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,2 @@
+# Ensure that /great_expectations and all of its contents are included when the package is installed
+graft src/agoradatatools/great_expectations/
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/config.yaml b/config.yaml
@@ -13,7 +13,7 @@
         column_rename:
           biodomain: name
         destination: *dest
-        
+
     - genes_biodomains:
         files:
           - name: genes_biodomains
@@ -102,6 +102,7 @@
         provenance:
           - syn26064497.1
         destination: *dest
+        gx_folder: syn52948669
 
     - gene_info:
         files:

diff --git a/gx_suite_definitions/metabolomics.ipynb b/gx_suite_definitions/metabolomics.ipynb
@@ -0,0 +1,290 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import synapseclient\n",
+    "\n",
+    "import great_expectations as gx\n",
+    "\n",
+    "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n",
+    "\n",
+    "from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength\n",
+    "from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers\n",
+    "from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Expectation Suite for Metabolomics Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get Example Data File"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "syn = synapseclient.Synapse()\n",
+    "syn.login()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metabolomics_data_file = syn.get(\"syn19276330\").path\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create Validator Object on Data File"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "validator = context.sources.pandas_default.read_json(\n",
+    "    metabolomics_data_file\n",
+    ")\n",
+    "validator.expectation_suite_name = \"metabolomics\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Add Expectations to Validator Object For Each Column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ad_diagnosis_p_value\n",
+    "validator.expect_column_values_to_be_of_type(\"ad_diagnosis_p_value\", \"list\")\n",
+    "validator.expect_column_values_to_not_be_null(\"ad_diagnosis_p_value\")\n",
+    "# for custom and experimental expectations you have to pass args as kwargs\n",
+    "validator.expect_column_values_to_have_list_length(column=\"ad_diagnosis_p_value\", list_length=1)\n",
+    "validator.expect_column_values_to_have_list_members_of_type(column=\"ad_diagnosis_p_value\", member_type=\"float\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# associated gene name\n",
+    "validator.expect_column_values_to_be_of_type(\"associated_gene_name\", \"str\")\n",
+    "validator.expect_column_values_to_not_be_null(\"associated_gene_name\")\n",
+    "validator.expect_column_value_lengths_to_be_between(\"associated_gene_name\", min_value=2, max_value=100)\n",
+    "# allows all alphanumeric characters, underscores, periods, and dashes\n",
+    "validator.expect_column_values_to_match_regex(\"associated_gene_name\", \"^[A-Za-z0-9_.-]+$\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# association p\n",
+    "validator.expect_column_values_to_be_of_type(\"association_p\", \"float\")\n",
+    "validator.expect_column_values_to_not_be_null(\"association_p\")\n",
+    "validator.expect_column_values_to_be_between(\"association_p\", strict_min_value=0, max_value=1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# boxplot_group_names\n",
+    "validator.expect_column_values_to_be_of_type(\"boxplot_group_names\", \"list\")\n",
+    "validator.expect_column_values_to_not_be_null(\"boxplot_group_names\")\n",
+    "validator.expect_column_values_to_have_list_length(column=\"boxplot_group_names\", list_length=2)\n",
+    "validator.expect_column_values_to_have_list_members(column=\"boxplot_group_names\", list_members={\"AD\", \"CN\"})\n",
+    "validator.expect_column_values_to_have_list_members_of_type(column=\"boxplot_group_names\", member_type=\"str\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ensembl gene id\n",
+    "validator.expect_column_values_to_be_of_type(\"ensembl_gene_id\", \"str\")\n",
+    "validator.expect_column_values_to_not_be_null(\"ensembl_gene_id\")\n",
+    "validator.expect_column_value_lengths_to_equal(\"ensembl_gene_id\", 15)\n",
+    "# checks format and allowed characters\n",
+    "validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", \"^ENSG\\d{11}$\")\n",
+    "validator.expect_column_values_to_be_unique(\"ensembl_gene_id\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# gene_wide_p_threshold_1kgp\n",
+    "validator.expect_column_values_to_be_of_type(\"gene_wide_p_threshold_1kgp\", \"float\")\n",
+    "validator.expect_column_values_to_not_be_null(\"gene_wide_p_threshold_1kgp\")\n",
+    "validator.expect_column_values_to_be_between(\"gene_wide_p_threshold_1kgp\", strict_min_value=0, max_value=0.05)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# metabolite full name\n",
+    "validator.expect_column_values_to_be_of_type(\"metabolite_full_name\", \"str\")\n",
+    "validator.expect_column_values_to_not_be_null(\"metabolite_full_name\")\n",
+    "validator.expect_column_value_lengths_to_be_between(\"metabolite_full_name\", min_value=2, max_value=100)  \n",
+    "# allows alphanumeric characters, whitespace, hyphens, colons, periods, parentheses, and plus signs\n",
+    "validator.expect_column_values_to_match_regex(\"metabolite_full_name\", \"^[A-Za-z0-9\\s\\-:.()+]+$\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# metabolite ID\n",
+    "validator.expect_column_values_to_be_of_type(\"metabolite_id\", \"str\")\n",
+    "validator.expect_column_values_to_not_be_null(\"metabolite_id\")\n",
+    "validator.expect_column_value_lengths_to_be_between(\"metabolite_id\", min_value=2, max_value=100)\n",
+    "# allows all alphanumeric characters and periods\n",
+    "validator.expect_column_values_to_match_regex(\"metabolite_id\", \"^[A-Za-z0-9.]+$\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# n_per_group\n",
+    "validator.expect_column_values_to_be_of_type(\"n_per_group\", \"list\")\n",
+    "validator.expect_column_values_to_not_be_null(\"n_per_group\")\n",
+    "validator.expect_column_values_to_have_list_length(column=\"n_per_group\", list_length=2)\n",
+    "validator.expect_column_values_to_have_list_members_of_type(column=\"n_per_group\", member_type=\"int\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# transposed_boxplot_stats\n",
+    "validator.expect_column_values_to_be_of_type(\"transposed_boxplot_stats\", \"list\")\n",
+    "validator.expect_column_values_to_not_be_null(\"transposed_boxplot_stats\")\n",
+    "validator.expect_column_values_to_have_list_length(column=\"transposed_boxplot_stats\", list_length=2)\n",
+    "validator.expect_column_values_to_have_list_members_of_type(column=\"transposed_boxplot_stats\", member_type=\"list\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save Expectation Suite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "validator.save_expectation_suite(discard_failed_expectations=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create Checkpoint and View Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "checkpoint = context.add_or_update_checkpoint(\n",
+    "    name=\"agora-test-checkpoint\",\n",
+    "    validator=validator,\n",
+    ")\n",
+    "checkpoint_result = checkpoint.run()\n",
+    "context.view_validation_result(checkpoint_result)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Build Data Docs - Click on Expectation Suite to View All Expectations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "context.build_data_docs()\n",
+    "context.open_data_docs()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [metadata]
-name = agora-data-tools
+name = agoradatatools
 description = "A collection of tools that can be used to work with Agora's data"
 long_description = file: README.md
 long_description_content_type = text/markdown
@@ -20,8 +20,9 @@ classifiers =
     Programming Language :: Python :: 3 :: Only
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
-    Programming Language :: Python :: 3.10
-    Programming Language :: Python :: 3.11
+    # Support for Python versions 3.10 and above is temporarily disabled
+    # Programming Language :: Python :: 3.10 
+    # Programming Language :: Python :: 3.11
     Topic :: Scientific/Engineering
 project_urls =
     Bug Tracker = https://github.com/Sage-Bionetworks/agora-data-tools/issues
@@ -35,10 +36,11 @@ install_requires =
     pandas==1.2.4
     numpy~=1.21
     setuptools~=67.0.0
-    synapseclient~=2.7.0
+    synapseclient~=3.1.1
     PyYAML~=6.0
     pyarrow~=11.0
     typer~=0.7.0
+    great-expectations==0.18.1
 python_requires = >=3.8, <3.12
 include_package_data = True
 zip_safe = False
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Ensure that /great_expectations and all of its contents are included when the package is installed
		graft src/agoradatatools/great_expectations/