Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removed druggability-only genes from the gene_metadata pre-processing #159

Merged
merged 8 commits into from
Nov 25, 2024
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ dmypy.json

# local generated files
staging/*
data_analysis/*/output/*

#test staging location
test_staging_dir/
Expand All @@ -141,3 +142,5 @@ test_staging_dir/
dev_config.yaml

.vscode/
.ipynb_checkpoints/
.Rhistory
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,14 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from unipressed import IdMappingClient\n",
"import time\n",
"import pandas as pd\n",
"import numpy as np\n",
"import agoradatatools.etl.utils as utils\n",
"import agoradatatools.etl.extract as extract\n",
"import preprocessing_utils\n",
"\n",
"config_filename = \"../../../../config.yaml\""
]
Expand All @@ -43,157 +41,19 @@
"Loop through all data sets in the config file to get all Ensembl IDs used in every data set. NOTE: In the future, it would be simpler to just load the `gene_metadata` data set once druggability genes are removed from it, rather than looping through all of these files. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'genes_biodomains': ('syn44151254.5', 'csv'),\n",
" 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n",
" 'proteomics': ('syn18689335.3', 'csv'),\n",
" 'proteomics_tmt': ('syn35221005.2', 'csv'),\n",
" 'proteomics_srm': ('syn52579640.4', 'csv'),\n",
" 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n",
" 'metabolomics': ('syn26064497.1', 'feather'),\n",
" 'igap': ('syn12514826.5', 'csv'),\n",
" 'eqtl': ('syn12514912.3', 'csv'),\n",
" 'diff_exp_data': ('syn27211942.1', 'tsv'),\n",
" 'target_list': ('syn12540368.47', 'csv'),\n",
" 'median_expression': ('syn27211878.2', 'csv'),\n",
" 'tep_adi_info': ('syn51942280.2', 'csv'),\n",
" 'team_info': ('syn12615624.18', 'csv'),\n",
" 'team_member_info': ('syn12615633.18', 'csv'),\n",
" 'overall_scores': ('syn25575156.13', 'table'),\n",
" 'networks': ('syn11685347.1', 'csv')}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"config = utils._get_config(config_path=config_filename)\n",
"datasets = config[\"datasets\"]\n",
"\n",
"files = {}\n",
"\n",
"for dataset in datasets:\n",
" dataset_name = list(dataset.keys())[0]\n",
"\n",
" for entity in dataset[dataset_name][\"files\"]:\n",
" entity_id = entity[\"id\"]\n",
" entity_format = entity[\"format\"]\n",
" entity_name = entity[\"name\"]\n",
"\n",
" # Ignore json files, which are post-processed and not what we're interested in.\n",
" # Also ignore \"druggability\" since we want to exclude druggability-only genes, and \n",
" # \"gene_metadata\" which includes druggability genes.\n",
" if entity_format != \"json\" and entity_name not in [\"druggability\", \"gene_metadata\"]:\n",
" files[entity_name] = (entity_id, entity_format)\n",
"\n",
"# There are some duplicate synID's in this list but that doesn't really matter\n",
"files"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### We should now have a list of all raw data files ingested. Get each one and create a list of IDs."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"UPGRADE AVAILABLE\n",
"\n",
"A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n",
" pip install --upgrade synapseclient\n",
"\n",
"Python Synapse Client version 4.6.0 release notes\n",
"\n",
"https://python-docs.synapse.org/news/\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Welcome, Jaclyn Beck!\n",
"\n",
"INFO: 2024-11-15 11:43:36 | synapseclient_default | Welcome, Jaclyn Beck!\n",
"\n",
"genes_biodomains has an NaN Ensembl ID\n",
"WARNING: no Ensembl ID column found for team_info!\n",
"WARNING: no Ensembl ID column found for team_member_info!\n"
]
}
],
"source": [
"syn = utils._login_to_synapse(token=None) # Assumes you have already logged in with a valid token\n",
"\n",
"# The various column names used to store Ensembl IDs in the files\n",
"col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n",
"file_ensembl_list = []\n",
"\n",
"for file in files.keys():\n",
" df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n",
"\n",
" file_ensembl_ids = None\n",
"\n",
" for C in col_names:\n",
" if C in df.columns:\n",
" file_ensembl_ids = df[C]\n",
"\n",
" # networks file is a special case\n",
" if file == \"networks\":\n",
" file_ensembl_ids = pd.melt(\n",
" df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n",
" )[\"value\"]\n",
"\n",
" if file_ensembl_ids is not None:\n",
" file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n",
" if \"n/A\" in file_ensembl_ids.tolist():\n",
" print(file + \" has an n/A Ensembl ID\")\n",
" file_ensembl_list.remove(\"n/A\")\n",
" if np.NaN in file_ensembl_ids.tolist():\n",
" print(file + \" has an NaN Ensembl ID\")\n",
" else:\n",
" print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"35858\n"
]
}
],
"outputs": [],
"source": [
"file_ensembl_list = list(set(file_ensembl_list))\n",
"\n",
"# NaNs will be floats, so this removes them. Using np.isnan() on strings throws an error.\n",
"ensembl_ids = [x for x in file_ensembl_list if isinstance(x, str)]\n",
"\n",
"print(len(ensembl_ids))"
"ensembl_ids = preprocessing_utils.get_all_adt_ensembl_ids(\n",
" config_filename=config_filename,\n",
" exclude_files=[\"gene_metadata\", \"druggability\"],\n",
" token=None,\n",
")\n",
"print(\"\")\n",
"print(str(len(ensembl_ids)) + \" Ensembl IDs found.\")"
]
},
{
Expand Down
Loading