Skip to content

Commit

Permalink
Merge pull request #159 from Sage-Bionetworks/jbeck/AG-1579/gene_metadata_remove_druggability
Browse files Browse the repository at this point in the history

Removed druggability-only genes from the gene_metadata pre-processing
  • Loading branch information
jaclynbeck-sage authored Nov 25, 2024
2 parents 5ec2ca4 + ce9dc5e commit 6f41530
Show file tree
Hide file tree
Showing 5 changed files with 385 additions and 2,341 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ dmypy.json

# local generated files
staging/*
data_analysis/*/output/*

#test staging location
test_staging_dir/
Expand All @@ -141,3 +142,5 @@ test_staging_dir/
dev_config.yaml

.vscode/
.ipynb_checkpoints/
.Rhistory
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,14 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from unipressed import IdMappingClient\n",
"import time\n",
"import pandas as pd\n",
"import numpy as np\n",
"import agoradatatools.etl.utils as utils\n",
"import agoradatatools.etl.extract as extract\n",
"import preprocessing_utils\n",
"\n",
"config_filename = \"../../../../config.yaml\""
]
Expand All @@ -43,157 +41,19 @@
"Loop through all data sets in the config file to get all Ensembl IDs used in every data set. NOTE: In the future, it would be simpler to just load the `gene_metadata` data set once druggability genes are removed from it, rather than looping through all of these files. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'genes_biodomains': ('syn44151254.5', 'csv'),\n",
" 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n",
" 'proteomics': ('syn18689335.3', 'csv'),\n",
" 'proteomics_tmt': ('syn35221005.2', 'csv'),\n",
" 'proteomics_srm': ('syn52579640.4', 'csv'),\n",
" 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n",
" 'metabolomics': ('syn26064497.1', 'feather'),\n",
" 'igap': ('syn12514826.5', 'csv'),\n",
" 'eqtl': ('syn12514912.3', 'csv'),\n",
" 'diff_exp_data': ('syn27211942.1', 'tsv'),\n",
" 'target_list': ('syn12540368.47', 'csv'),\n",
" 'median_expression': ('syn27211878.2', 'csv'),\n",
" 'tep_adi_info': ('syn51942280.2', 'csv'),\n",
" 'team_info': ('syn12615624.18', 'csv'),\n",
" 'team_member_info': ('syn12615633.18', 'csv'),\n",
" 'overall_scores': ('syn25575156.13', 'table'),\n",
" 'networks': ('syn11685347.1', 'csv')}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"config = utils._get_config(config_path=config_filename)\n",
"datasets = config[\"datasets\"]\n",
"\n",
"files = {}\n",
"\n",
"for dataset in datasets:\n",
" dataset_name = list(dataset.keys())[0]\n",
"\n",
" for entity in dataset[dataset_name][\"files\"]:\n",
" entity_id = entity[\"id\"]\n",
" entity_format = entity[\"format\"]\n",
" entity_name = entity[\"name\"]\n",
"\n",
" # Ignore json files, which are post-processed and not what we're interested in.\n",
" # Also ignore \"druggability\" since we want to exclude druggability-only genes, and \n",
" # \"gene_metadata\" which includes druggability genes.\n",
" if entity_format != \"json\" and entity_name not in [\"druggability\", \"gene_metadata\"]:\n",
" files[entity_name] = (entity_id, entity_format)\n",
"\n",
"# There are some duplicate synID's in this list but that doesn't really matter\n",
"files"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### We should now have a list of all raw data files ingested. Get each one and create a list of IDs."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"UPGRADE AVAILABLE\n",
"\n",
"A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n",
" pip install --upgrade synapseclient\n",
"\n",
"Python Synapse Client version 4.6.0 release notes\n",
"\n",
"https://python-docs.synapse.org/news/\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Welcome, Jaclyn Beck!\n",
"\n",
"INFO: 2024-11-15 11:43:36 | synapseclient_default | Welcome, Jaclyn Beck!\n",
"\n",
"genes_biodomains has an NaN Ensembl ID\n",
"WARNING: no Ensembl ID column found for team_info!\n",
"WARNING: no Ensembl ID column found for team_member_info!\n"
]
}
],
"source": [
"syn = utils._login_to_synapse(token=None) # Assumes you have already logged in with a valid token\n",
"\n",
"# The various column names used to store Ensembl IDs in the files\n",
"col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n",
"file_ensembl_list = []\n",
"\n",
"for file in files.keys():\n",
" df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n",
"\n",
" file_ensembl_ids = None\n",
"\n",
" for C in col_names:\n",
" if C in df.columns:\n",
" file_ensembl_ids = df[C]\n",
"\n",
" # networks file is a special case\n",
" if file == \"networks\":\n",
" file_ensembl_ids = pd.melt(\n",
" df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n",
" )[\"value\"]\n",
"\n",
" if file_ensembl_ids is not None:\n",
" file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n",
" if \"n/A\" in file_ensembl_ids.tolist():\n",
" print(file + \" has an n/A Ensembl ID\")\n",
" file_ensembl_list.remove(\"n/A\")\n",
" if np.NaN in file_ensembl_ids.tolist():\n",
" print(file + \" has an NaN Ensembl ID\")\n",
" else:\n",
" print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"35858\n"
]
}
],
"outputs": [],
"source": [
"file_ensembl_list = list(set(file_ensembl_list))\n",
"\n",
"# NaNs will be floats, so this removes them. Using np.isnan() on strings throws an error.\n",
"ensembl_ids = [x for x in file_ensembl_list if isinstance(x, str)]\n",
"\n",
"print(len(ensembl_ids))"
"ensembl_ids = preprocessing_utils.get_all_adt_ensembl_ids(\n",
" config_filename=config_filename,\n",
" exclude_files=[\"gene_metadata\", \"druggability\"],\n",
" token=None,\n",
")\n",
"print(\"\")\n",
"print(str(len(ensembl_ids)) + \" Ensembl IDs found.\")"
]
},
{
Expand Down
Loading

0 comments on commit 6f41530

Please sign in to comment.