Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IBCDPE-712] Implement Great Expectations for the genes_biodomains Dataset #103

Merged
merged 13 commits into from
Dec 13, 2023
1 change: 1 addition & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ datasets:
ensembl_id: ensembl_gene_id
goterm_name: go_terms
destination: *dest
gx_folder: syn53127958

- neuropath_corr:
files:
Expand Down
187 changes: 187 additions & 0 deletions gx_suite_definitions/genes_biodomains.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import synapseclient\n",
"\n",
"import great_expectations as gx\n",
"\n",
"# Create the Great Expectations context rooted at the project's great_expectations directory.\n",
"# NOTE(review): this runs before the custom `expectations` imports below; presumably those imports\n",
"# rely on the context having been initialised first (e.g. plugin path setup) — confirm before reordering.\n",
"context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n",
"\n",
"from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength\n",
"from expectations.expect_column_values_to_have_list_length_in_range import ExpectColumnValuesToHaveListLengthInRange\n",
"from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers\n",
"from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType\n",
"from expectations.expect_column_values_to_have_list_of_dict_with_expected_values import ExpectColumnValuesToHaveListOfDictWithExpectedValues\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create Expectation Suite for Genes Biodomains Data"
BWMac marked this conversation as resolved.
Show resolved Hide resolved
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Example Data File"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a Synapse client and log in. No credentials are passed here, so they are\n",
"# presumably picked up from the local Synapse configuration — confirm for your environment.\n",
"syn = synapseclient.Synapse()\n",
"syn.login()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch the example genes_biodomains entity from Synapse, then keep the path of its file\n",
"genes_biodomains_entity = syn.get(\"syn51062085\")\n",
"genes_biodomains_data_file = genes_biodomains_entity.path\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Validator Object on Data File"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the example JSON file through the default pandas datasource to obtain a validator\n",
"validator = context.sources.pandas_default.read_json(genes_biodomains_data_file)\n",
"# Expectations added to this validator are stored under this suite name\n",
"validator.expectation_suite_name = \"genes_biodomains\"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add Expectations to Validator Object For Each Column"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ensembl_gene_id\n",
"validator.expect_column_values_to_be_of_type(\"ensembl_gene_id\", \"str\")\n",
"validator.expect_column_values_to_not_be_null(\"ensembl_gene_id\")\n",
"validator.expect_column_value_lengths_to_equal(\"ensembl_gene_id\", 15)\n",
"# checks format and allowed characters: the literal prefix ENSG followed by exactly 11 digits\n",
"# (raw string so the backslash in \\d reaches the regex unchanged, without an invalid-escape warning)\n",
"validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", r\"^ENSG\\d{11}$\")\n",
"validator.expect_column_values_to_be_unique(\"ensembl_gene_id\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# gene_biodomains\n",
"# Allowed values for the \"biodomain\" key of each dict in a gene_biodomains list\n",
"biodomain_list = [\n",
"    'Apoptosis',\n",
"    'Vasculature',\n",
"    'Lipid Metabolism',\n",
"    'Proteostasis',\n",
"    'Immune Response',\n",
"    'Autophagy',\n",
"    'Mitochondrial Metabolism',\n",
"    'Structural Stabilization',\n",
"    'Synapse',\n",
"    'Endolysosome',\n",
"    'Metal Binding and Homeostasis',\n",
"    'Oxidative Stress',\n",
"    'Epigenetic',\n",
"    'APP Metabolism',\n",
"    'Cell Cycle',\n",
"    'DNA Repair',\n",
"    'RNA Spliceosome',\n",
"    'Tau Homeostasis',\n",
"    'Myelination',\n",
"]\n",
"validator.expect_column_values_to_be_of_type(\"gene_biodomains\", \"list\")\n",
"validator.expect_column_values_to_not_be_null(\"gene_biodomains\")\n",
"# NOTE(review): the upper bound 19 equals the number of entries in biodomain_list —\n",
"# presumably intentional; keep the two in sync if the list changes.\n",
"validator.expect_column_values_to_have_list_length_in_range(\n",
"    column=\"gene_biodomains\",\n",
"    list_length_range=[0, 19],\n",
")\n",
"validator.expect_column_values_to_have_list_members_of_type(\n",
"    column=\"gene_biodomains\",\n",
"    member_type=\"dict\",\n",
")\n",
"validator.expect_column_values_to_have_list_of_dict_with_expected_values(\n",
"    column=\"gene_biodomains\",\n",
"    list_dict_values={\"key\": \"biodomain\", \"values\": biodomain_list},\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save Expectation Suite"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Persist the suite built above. discard_failed_expectations=False keeps every expectation,\n",
"# including any that did not pass against the example data file.\n",
"validator.save_expectation_suite(discard_failed_expectations=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Checkpoint and View Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Register (or overwrite) the checkpoint for this validator, then run it\n",
"checkpoint = context.add_or_update_checkpoint(name=\"agora-test-checkpoint\", validator=validator)\n",
"checkpoint_result = checkpoint.run()\n",
"# Display the validation result produced by this run\n",
"context.view_validation_result(checkpoint_result)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build Data Docs - Click on Expectation Suite to View All Expectations"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the Data Docs and open them, so the full expectation suite can be inspected\n",
"context.build_data_docs()\n",
"context.open_data_docs()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: agora-test-checkpoint
BWMac marked this conversation as resolved.
Show resolved Hide resolved
config_version: 1.0
template_name:
module_name: great_expectations.checkpoint
class_name: Checkpoint
run_name_template:
expectation_suite_name:
batch_request: {}
action_list:
- name: store_validation_result
action:
class_name: StoreValidationResultAction
- name: store_evaluation_params
action:
class_name: StoreEvaluationParametersAction
- name: update_data_docs
action:
class_name: UpdateDataDocsAction
evaluation_parameters: {}
runtime_configuration: {}
validations:
- batch_request:
datasource_name: default_pandas_datasource
data_asset_name: '#ephemeral_pandas_asset'
options: {}
batch_slice:
expectation_suite_name: genes_biodomains
profilers: []
ge_cloud_id:
expectation_suite_ge_cloud_id:
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: genes_biodomains
config_version: 1.0
template_name:
module_name: great_expectations.checkpoint
class_name: Checkpoint
run_name_template:
expectation_suite_name:
batch_request: {}
action_list:
- name: store_validation_result
action:
class_name: StoreValidationResultAction
- name: store_evaluation_params
action:
class_name: StoreEvaluationParametersAction
- name: update_data_docs
action:
class_name: UpdateDataDocsAction
evaluation_parameters: {}
runtime_configuration: {}
validations:
- batch_request:
datasource_name: default_pandas_datasource
data_asset_name: '#ephemeral_pandas_asset'
options: {}
batch_slice:
expectation_suite_name: genes_biodomains
profilers: []
ge_cloud_id:
expectation_suite_ge_cloud_id:
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
{
"data_asset_type": null,
"expectation_suite_name": "genes_biodomains",
"expectations": [
{
"expectation_type": "expect_column_values_to_be_of_type",
"kwargs": {
"column": "ensembl_gene_id",
"type_": "str"
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_not_be_null",
"kwargs": {
"column": "ensembl_gene_id"
},
"meta": {}
},
{
"expectation_type": "expect_column_value_lengths_to_equal",
"kwargs": {
"column": "ensembl_gene_id",
"value": 15
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_match_regex",
"kwargs": {
"column": "ensembl_gene_id",
"regex": "^ENSG\\d{11}$"
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_be_unique",
"kwargs": {
"column": "ensembl_gene_id"
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_be_of_type",
"kwargs": {
"column": "gene_biodomains",
"type_": "list"
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_not_be_null",
"kwargs": {
"column": "gene_biodomains"
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_have_list_length_in_range",
"kwargs": {
"column": "gene_biodomains",
"list_length_range": [
0,
19
BWMac marked this conversation as resolved.
Show resolved Hide resolved
]
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_have_list_members_of_type",
"kwargs": {
"column": "gene_biodomains",
"member_type": "dict"
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_have_list_of_dict_with_expected_values",
"kwargs": {
"column": "gene_biodomains",
"list_dict_values": {
"key": "biodomain",
"values": [
"Apoptosis",
"Vasculature",
"Lipid Metabolism",
"Proteostasis",
"Immune Response",
"Autophagy",
"Mitochondrial Metabolism",
"Structural Stabilization",
"Synapse",
"Endolysosome",
"Metal Binding and Homeostasis",
"Oxidative Stress",
"Epigenetic",
"APP Metabolism",
"Cell Cycle",
"DNA Repair",
"RNA Spliceosome",
"Tau Homeostasis",
"Myelination"
]
}
},
"meta": {}
}
],
"ge_cloud_id": null,
"meta": {
"great_expectations_version": "0.18.1"
}
}
Loading
Loading