Skip to content

Commit

Permalink
Incorporate uniprot id into gene info (#160)
Browse files: browse the repository at this point in the history
* Added uniprotkb_accession column to gene_info dataset

* Updated gx validation for gene_info

* Added the uniprot mapping file to the gene_info provenance

* Updating gx validation

* Updated gx validation to include uniprotkb_accessions

* Finally got the gx validation to use the plural uniprotkb_accessions

* Pytests passing

* Removed rename_unknown_column() and related code

* gene_info test test_add_uniprot_id_to_gene_info_should_pass is now passing

* uniprot fail test is failing

* Removing testing yaml file that we do not need

* Removing unnecessary changes to utils.py

* Updated version number of syn54113663 to syn54113663.3 in config and test_config

* Made the ensembl to uniprot test mapping file smaller

* Improved uniprot code integration with expected flow of existing codebase

* Removed unused regex for uniprot gx validation

* Added a duplicate Ensembl ID to the UniProt testing file and removed unused testing files

* Updating testing_gene_info docstring

* Removed unused testing file

* Removed unused variables in gene_info testing script

---------

Co-authored-by: Beatriz Saldana <[email protected]>
  • Loading branch information
beatrizsaldana and Beatriz Saldana authored Nov 27, 2024
1 parent 6f41530 commit ae4cf82
Show file tree
Hide file tree
Showing 9 changed files with 198 additions and 49 deletions.
6 changes: 6 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@ datasets:
- name: tep_adi_info
id: syn51942280.3
format: csv
- name: ensg_to_uniprot_mapping
id: syn54113663.3
format: tsv
final_format: json
custom_transformations:
adjusted_p_value_threshold: 0.05
Expand All @@ -186,6 +189,8 @@ datasets:
maximumlogcpm: max
possible_replacement: ensembl_possible_replacements
permalink: ensembl_permalink
uniprotkb_accession: uniprotkb_accessions
resource_identifier: ensembl_gene_id
provenance:
- syn25953363.13
- syn12514826.5
Expand All @@ -199,6 +204,7 @@ datasets:
- syn13363443.11
- *genes_biodomains_provenance
- syn51942280.3
- syn54113663.3
agora_rename:
symbol: hgnc_symbol
destination: *dest
Expand Down
11 changes: 11 additions & 0 deletions gx_suite_definitions/gene_info.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,17 @@
"validator.expect_column_values_to_match_json_schema(\"ensembl_info\", ensembl_info_schema)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# uniprotkb_accessions\n",
"validator.expect_column_values_to_be_of_type(\"uniprotkb_accessions\", \"list\")\n",
"validator.expect_column_values_to_have_list_members_of_type(column=\"uniprotkb_accessions\", member_type=\"str\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
10 changes: 10 additions & 0 deletions src/agoradatatools/etl/transform/gene_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def transform_gene_info(
druggability = datasets["druggability"]
biodomains = datasets["genes_biodomains"]
tep_info = datasets["tep_adi_info"]
uniprot = datasets["ensg_to_uniprot_mapping"]

# Modify the data before merging

Expand Down Expand Up @@ -129,6 +130,13 @@ def transform_gene_info(
axis=1,
)

# Collapse uniprot IDs into a list for each ensembl_gene_id
collapsed_uniprot = (
uniprot.groupby("ensembl_gene_id")["uniprotkb_accessions"]
.apply(list)
.reset_index()
)

# Merge all the datasets
gene_info = gene_metadata

Expand All @@ -142,6 +150,7 @@ def transform_gene_info(
druggability,
biodomains,
tep_info,
collapsed_uniprot,
]:
gene_info = pd.merge(
left=gene_info,
Expand Down Expand Up @@ -245,6 +254,7 @@ def transform_gene_info(
"is_tep",
"resource_url",
"ensembl_info",
"uniprotkb_accessions",
]
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -514,25 +514,25 @@
"kwargs": {
"column": "biodomains",
"list_members": [
"Myelination",
"Apoptosis",
"Vasculature",
"Lipid Metabolism",
"Oxidative Stress",
"Mitochondrial Metabolism",
"APP Metabolism",
"Proteostasis",
"Oxidative Stress",
"DNA Repair",
"Metal Binding and Homeostasis",
"Structural Stabilization",
"Endolysosome",
"Synapse",
"Immune Response",
"Autophagy",
"Tau Homeostasis",
"Cell Cycle",
"Vasculature",
"APP Metabolism",
"Lipid Metabolism",
"Apoptosis",
"Synapse",
"Epigenetic",
"RNA Spliceosome"
"Metal Binding and Homeostasis",
"Endolysosome",
"Structural Stabilization",
"Myelination",
"RNA Spliceosome",
"Autophagy"
]
},
"meta": {}
Expand Down Expand Up @@ -635,6 +635,22 @@
}
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_be_of_type",
"kwargs": {
"column": "uniprotkb_accessions",
"type_": "list"
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_have_list_members_of_type",
"kwargs": {
"column": "uniprotkb_accessions",
"member_type": "str"
},
"meta": {}
}
],
"ge_cloud_id": null,
Expand Down
6 changes: 6 additions & 0 deletions test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@ datasets:
- name: tep_adi_info
id: syn51942280.3
format: csv
- name: ensg_to_uniprot_mapping
id: syn54113663.3
format: tsv
final_format: json
custom_transformations:
adjusted_p_value_threshold: 0.05
Expand All @@ -186,6 +189,8 @@ datasets:
maximumlogcpm: max
possible_replacement: ensembl_possible_replacements
permalink: ensembl_permalink
uniprotkb_accession: uniprotkb_accessions
resource_identifier: ensembl_gene_id
provenance:
- syn25953363.13
- syn12514826.5
Expand All @@ -199,6 +204,7 @@ datasets:
- syn13363443.11
- *genes_biodomains_provenance
- syn51942280.3
- syn54113663.3
agora_rename:
symbol: hgnc_symbol
destination: *dest
Expand Down
20 changes: 20 additions & 0 deletions tests/test_assets/gene_info/input/ensg_to_uniprot_mapping_good.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
uniprotkb_accessions ensembl_gene_id
P08603 ENSG00000000971
Q9H2S6 ENSG00000000005
O00522 ENSG00000001631
Q9P2G1 ENSG00000001629
Q9BTY2 ENSG00000001036
Q5TH74 ENSG00000001460
P23511 ENSG00000001167
Q6P499 ENSG00000001461
Q9Y6X5 ENSG00000001561
P48506 ENSG00000001084
Q8IZE3 ENSG00000000457
Q9NSG2 ENSG00000000460
P09769 ENSG00000000938
O60762 ENSG00000000419
Q16850 ENSG00000001630
Q9Y4W2 ENSG00000001497
P13569 ENSG00000001626
Q13275 ENSG00000001617
Q1TEST ENSG00000001617
Loading

0 comments on commit ae4cf82

Please sign in to comment.