From ae4cf8217049c791eb99405ac47464f446ecf98b Mon Sep 17 00:00:00 2001 From: Beatriz Saldana <37123591+beatrizsaldana@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:34:54 -0800 Subject: [PATCH] Incorporate uniprot id into gene info (#160) * Added uniprotkb_accession column to gene_info dataset * Updated gx validation for gene_info * Added the uniprot mapping file to the gene_info provenance * Updating gx validation * Updated gx validation to include uniprotkb_accessions * Finally got the gx validation to use the plural uniprotkb_accessions * Pytests passing * Removed rename_unknown_column() and related code * gene_info test_add_uniprot_id_to_gene_info_should_pass passing * uniprot fail test is failing * Removing testing yaml file that we do not need * Removing unnecessary changes to utils.py * Updated version number of syn54113663 to syn54113663.3 in config and test_config * Made the ensembl to uniprot test mapping file smaller * Improved uniprot code integration with expected flow of existing codebase * Removed unused regex for uniprot gx validation * Adding duplicate ensembl ID to uniprot testing file. Removing unused testing files. * Updating testing_gene_info docstring * Removed unused testing file * Removed unused variables in gene_info testing script --------- Co-authored-by: Beatriz Saldana --- config.yaml | 6 ++ gx_suite_definitions/gene_info.ipynb | 11 +++ src/agoradatatools/etl/transform/gene_info.py | 10 ++ .../gx/expectations/gene_info.json | 40 +++++--- test_config.yaml | 6 ++ .../input/ensg_to_uniprot_mapping_good.tsv | 20 ++++ .../output/gene_info_good_output_1.json | 91 +++++++++++++++---- .../output/gene_info_good_output_2.json | 54 +++++++---- tests/transform/test_gene_info.py | 9 +- 9 files changed, 198 insertions(+), 49 deletions(-) create mode 100644 tests/test_assets/gene_info/input/ensg_to_uniprot_mapping_good.tsv diff --git a/config.yaml b/config.yaml index 7b8b4f4b..b6ff6073 100644 --- a/config.yaml +++ b/config.yaml @@ -169,6 +169,9 @@ datasets: - name: tep_adi_info id: syn51942280.3 format: csv + - name: ensg_to_uniprot_mapping + id: syn54113663.3 + format: tsv final_format: json custom_transformations: adjusted_p_value_threshold: 0.05 @@ -186,6 +189,8 @@ datasets: maximumlogcpm: max possible_replacement: ensembl_possible_replacements permalink: ensembl_permalink + uniprotkb_accession: uniprotkb_accessions + resource_identifier: ensembl_gene_id provenance: - syn25953363.13 - syn12514826.5 @@ -199,6 +204,7 @@ datasets: - syn13363443.11 - *genes_biodomains_provenance - syn51942280.3 + - syn54113663.3 agora_rename: symbol: hgnc_symbol destination: *dest diff --git a/gx_suite_definitions/gene_info.ipynb b/gx_suite_definitions/gene_info.ipynb index 4d4b327f..1def5a28 100644 --- a/gx_suite_definitions/gene_info.ipynb +++ b/gx_suite_definitions/gene_info.ipynb @@ -343,6 +343,17 @@ "validator.expect_column_values_to_match_json_schema(\"ensembl_info\", ensembl_info_schema)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uniprotkb_accessions\n", + "validator.expect_column_values_to_be_of_type(\"uniprotkb_accessions\", \"list\")\n", + "validator.expect_column_values_to_have_list_members_of_type(column=\"uniprotkb_accessions\", member_type=\"str\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index a812a89e..8c5d588b 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -24,6 +24,7 @@ def transform_gene_info( druggability = datasets["druggability"] biodomains = datasets["genes_biodomains"] tep_info = datasets["tep_adi_info"] + uniprot = datasets["ensg_to_uniprot_mapping"] # Modify the data before merging @@ -129,6 +130,13 @@ def transform_gene_info( axis=1, ) + # Collapse uniprot IDs into a list for each ensembl_gene_id + collapsed_uniprot = ( + uniprot.groupby("ensembl_gene_id")["uniprotkb_accessions"] + .apply(list) + .reset_index() + ) + # Merge all the datasets gene_info = gene_metadata @@ -142,6 +150,7 @@ def transform_gene_info( druggability, biodomains, tep_info, + collapsed_uniprot, ]: gene_info = pd.merge( left=gene_info, @@ -245,6 +254,7 @@ def transform_gene_info( "is_tep", "resource_url", "ensembl_info", + "uniprotkb_accessions", ] ] diff --git a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json index ea351cac..7c913cd9 100644 --- a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json +++ b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json @@ -514,25 +514,25 @@ "kwargs": { "column": "biodomains", "list_members": [ - "Myelination", + "Apoptosis", + "Vasculature", + "Lipid Metabolism", + "Oxidative Stress", "Mitochondrial Metabolism", + "APP Metabolism", "Proteostasis", - "Oxidative Stress", "DNA Repair", - "Metal Binding and Homeostasis", - "Structural Stabilization", - "Endolysosome", + "Synapse", "Immune Response", - "Autophagy", "Tau Homeostasis", "Cell Cycle", - "Vasculature", - "APP Metabolism", - "Lipid Metabolism", - "Apoptosis", - "Synapse", "Epigenetic", - "RNA Spliceosome" + "Metal Binding and Homeostasis", + "Endolysosome", + "Structural Stabilization", + "Myelination", + "RNA Spliceosome", + "Autophagy" ] }, "meta": {} @@ -635,6 +635,22 @@ } }, "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "uniprotkb_accessions", + "type_": "list" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_have_list_members_of_type", + "kwargs": { + "column": "uniprotkb_accessions", + "member_type": "str" + }, + "meta": {} } ], "ge_cloud_id": null, diff --git a/test_config.yaml b/test_config.yaml index 02bd0f18..a76eb704 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -169,6 +169,9 @@ datasets: - name: tep_adi_info id: syn51942280.3 format: csv + - name: ensg_to_uniprot_mapping + id: syn54113663.3 + format: tsv final_format: json custom_transformations: adjusted_p_value_threshold: 0.05 @@ -186,6 +189,8 @@ datasets: maximumlogcpm: max possible_replacement: ensembl_possible_replacements permalink: ensembl_permalink + uniprotkb_accession: uniprotkb_accessions + resource_identifier: ensembl_gene_id provenance: - syn25953363.13 - syn12514826.5 @@ -199,6 +204,7 @@ datasets: - syn13363443.11 - *genes_biodomains_provenance - syn51942280.3 + - syn54113663.3 agora_rename: symbol: hgnc_symbol destination: *dest diff --git a/tests/test_assets/gene_info/input/ensg_to_uniprot_mapping_good.tsv b/tests/test_assets/gene_info/input/ensg_to_uniprot_mapping_good.tsv new file mode 100644 index 00000000..468bb9d6 --- /dev/null +++ b/tests/test_assets/gene_info/input/ensg_to_uniprot_mapping_good.tsv @@ -0,0 +1,20 @@ +uniprotkb_accessions ensembl_gene_id +P08603 ENSG00000000971 +Q9H2S6 ENSG00000000005 +O00522 ENSG00000001631 +Q9P2G1 ENSG00000001629 +Q9BTY2 ENSG00000001036 +Q5TH74 ENSG00000001460 +P23511 ENSG00000001167 +Q6P499 ENSG00000001461 +Q9Y6X5 ENSG00000001561 +P48506 ENSG00000001084 +Q8IZE3 ENSG00000000457 +Q9NSG2 ENSG00000000460 +P09769 ENSG00000000938 +O60762 ENSG00000000419 +Q16850 ENSG00000001630 +Q9Y4W2 ENSG00000001497 +P13569 ENSG00000001626 +Q13275 ENSG00000001617 +Q1TEST ENSG00000001617 diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_1.json b/tests/test_assets/gene_info/output/gene_info_good_output_1.json index 84f16154..c39db39e 100644 --- a/tests/test_assets/gene_info/output/gene_info_good_output_1.json +++ b/tests/test_assets/gene_info/output/gene_info_good_output_1.json @@ -55,7 +55,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000005" - } + }, + "uniprotkb_accessions": [ + "Q9H2S6" + ] }, { "ensembl_gene_id": "ENSG00000000419", @@ -157,7 +160,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000419" - } + }, + "uniprotkb_accessions": [ + "O60762" + ] }, { "ensembl_gene_id": "ENSG00000000457", @@ -213,7 +219,10 @@ "ensembl_release": null, "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000457" - } + }, + "uniprotkb_accessions": [ + "Q8IZE3" + ] }, { "ensembl_gene_id": "ENSG00000000460", @@ -254,7 +263,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000460" - } + }, + "uniprotkb_accessions": [ + "Q9NSG2" + ] }, { "ensembl_gene_id": "ENSG00000000938", @@ -288,7 +300,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000938" - } + }, + "uniprotkb_accessions": [ + "P09769" + ] }, { "ensembl_gene_id": "ENSG00000000971", @@ -343,7 +358,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000971" - } + }, + "uniprotkb_accessions": [ + "P08603" + ] }, { "ensembl_gene_id": "ENSG00000001036", @@ -400,7 +418,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001036" - } + }, + "uniprotkb_accessions": [ + "Q9BTY2" + ] }, { "ensembl_gene_id": "ENSG00000001084", @@ -441,7 +462,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": null - } + }, + "uniprotkb_accessions": [ + "P48506" + ] }, { "ensembl_gene_id": "ENSG00000001167", @@ -472,7 +496,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001167" - } + }, + "uniprotkb_accessions": [ + "P23511" + ] }, { "ensembl_gene_id": "ENSG00000001460", @@ -501,7 +528,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001460" - } + }, + "uniprotkb_accessions": [ + "Q5TH74" + ] }, { "ensembl_gene_id": "ENSG00000001461", @@ -531,7 +561,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001461" - } + }, + "uniprotkb_accessions": [ + "Q6P499" + ] }, { "ensembl_gene_id": "ENSG00000001497", @@ -563,7 +596,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001497" - } + }, + "uniprotkb_accessions": [ + "Q9Y4W2" + ] }, { "ensembl_gene_id": "ENSG00000001561", @@ -591,7 +627,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001561" - } + }, + "uniprotkb_accessions": [ + "Q9Y6X5" + ] }, { "ensembl_gene_id": "ENSG00000001617", @@ -621,7 +660,11 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001617" - } + }, + "uniprotkb_accessions": [ + "Q13275", + "Q1TEST" + ] }, { "ensembl_gene_id": "ENSG00000001626", @@ -655,7 +698,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001626" - } + }, + "uniprotkb_accessions": [ + "P13569" + ] }, { "ensembl_gene_id": "ENSG00000001629", @@ -681,7 +727,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001629" - } + }, + "uniprotkb_accessions": [ + "Q9P2G1" + ] }, { "ensembl_gene_id": "ENSG00000001630", @@ -714,7 +763,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001630" - } + }, + "uniprotkb_accessions": [ + "Q16850" + ] }, { "ensembl_gene_id": "ENSG00000001631", @@ -762,7 +814,10 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001631" - } + }, + "uniprotkb_accessions": [ + "O00522" + ] }, { "ensembl_gene_id": "ENSG00000161149", diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_2.json b/tests/test_assets/gene_info/output/gene_info_good_output_2.json index b57e63aa..dc8ad4c0 100644 --- a/tests/test_assets/gene_info/output/gene_info_good_output_2.json +++ b/tests/test_assets/gene_info/output/gene_info_good_output_2.json @@ -55,7 +55,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000005" - } + }, + "uniprotkb_accessions": ["Q9H2S6"] }, { "ensembl_gene_id": "ENSG00000000419", @@ -157,7 +158,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000419" - } + }, + "uniprotkb_accessions": ["O60762"] }, { "ensembl_gene_id": "ENSG00000000457", @@ -213,7 +215,8 @@ "ensembl_release": null, "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000457" - } + }, + "uniprotkb_accessions": ["Q8IZE3"] }, { "ensembl_gene_id": "ENSG00000000460", @@ -254,7 +257,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000460" - } + }, + "uniprotkb_accessions": ["Q9NSG2"] }, { "ensembl_gene_id": "ENSG00000000938", @@ -288,7 +292,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000938" - } + }, + "uniprotkb_accessions": ["P09769"] }, { "ensembl_gene_id": "ENSG00000000971", @@ -343,7 +348,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000971" - } + }, + "uniprotkb_accessions": ["P08603"] }, { "ensembl_gene_id": "ENSG00000001036", @@ -400,7 +406,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001036" - } + }, + "uniprotkb_accessions": ["Q9BTY2"] }, { "ensembl_gene_id": "ENSG00000001084", @@ -441,7 +448,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": null - } + }, + "uniprotkb_accessions": ["P48506"] }, { "ensembl_gene_id": "ENSG00000001167", @@ -472,7 +480,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001167" - } + }, + "uniprotkb_accessions": ["P23511"] }, { "ensembl_gene_id": "ENSG00000001460", @@ -501,7 +510,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001460" - } + }, + "uniprotkb_accessions": ["Q5TH74"] }, { "ensembl_gene_id": "ENSG00000001461", @@ -531,7 +541,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001461" - } + }, + "uniprotkb_accessions": ["Q6P499"] }, { "ensembl_gene_id": "ENSG00000001497", @@ -563,7 +574,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001497" - } + }, + "uniprotkb_accessions": ["Q9Y4W2"] }, { "ensembl_gene_id": "ENSG00000001561", @@ -591,7 +603,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001561" - } + }, + "uniprotkb_accessions": ["Q9Y6X5"] }, { "ensembl_gene_id": "ENSG00000001617", @@ -621,7 +634,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001617" - } + }, + "uniprotkb_accessions": ["Q13275", "Q1TEST"] }, { "ensembl_gene_id": "ENSG00000001626", @@ -655,7 +669,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001626" - } + }, + "uniprotkb_accessions": ["P13569"] }, { "ensembl_gene_id": "ENSG00000001629", @@ -681,7 +696,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001629" - } + }, + "uniprotkb_accessions": ["Q9P2G1"] }, { "ensembl_gene_id": "ENSG00000001630", @@ -714,7 +730,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001630" - } + }, + "uniprotkb_accessions": ["Q16850"] }, { "ensembl_gene_id": "ENSG00000001631", @@ -762,7 +779,8 @@ "ensembl_release": "111", "ensembl_possible_replacements": [], "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001631" - } + }, + "uniprotkb_accessions": ["O00522"] }, { "ensembl_gene_id": "ENSG00000161149", diff --git a/tests/transform/test_gene_info.py b/tests/transform/test_gene_info.py index c4449eb9..43fe4e57 100644 --- a/tests/transform/test_gene_info.py +++ b/tests/transform/test_gene_info.py @@ -1,6 +1,6 @@ """Integration test for the gene_info transform. -This transform requires 12 different input datasets and tests several conditions in each one. Description and passing +This transform requires 13 different input datasets and tests several conditions in each one. Description and passing or failing input for each dataset: gene_metadata: collection of information like gene symbol, aliases, etc about each Ensembl ID in every dataset. Both the "alias" and "ensembl_possible_replacements" fields are lists of strings, so this dataset @@ -70,6 +70,10 @@ or is_tep values are assumed to mean "False". These two fields must have boolean values if the data isn't missing. Ensembl IDs should be unique. failing input: a missing hgnc_symbol or a string value in is_adi or is_tep should throw a TypeError. + ensg_to_uniprot_mapping: a list of Ensembl IDs and their associated Uniprot accessions. + passing input: Duplicate Ensembl IDs are allowed due to association with multiple Uniprot accessions, + so the test file has rows with the same Ensembl ID but different Uniprot accession values. + failing input: none Other notes about the test files: Missing Ensembl IDs: these are allowed in any dataset, and rows with missing IDs will get dropped in the transform. @@ -122,6 +126,7 @@ class TestTransformGeneInfo: "druggability": "druggability_good_input.csv", "genes_biodomains": "genes_biodomains_good_input.csv", "tep_adi_info": "tep_adi_info_good_input.csv", + "ensg_to_uniprot_mapping": "ensg_to_uniprot_mapping_good.tsv", } pval_error_match_string = "'<=' not supported" @@ -245,6 +250,8 @@ def read_input_files_dict(self, input_files_dict: dict) -> dict: filename = os.path.join(self.data_files_path, "input", value) if key == "gene_metadata": datasets[key] = pd.read_feather(filename) + elif value.endswith("tsv"): + datasets[key] = pd.read_csv(filename, sep="\t") else: datasets[key] = pd.read_csv(filename)