diff --git a/.gitignore b/.gitignore index 20c38245..909837f7 100644 --- a/.gitignore +++ b/.gitignore @@ -133,6 +133,7 @@ dmypy.json # local generated files staging/* +data_analysis/*/output/* #test staging location test_staging_dir/ @@ -141,3 +142,5 @@ test_staging_dir/ dev_config.yaml .vscode/ +.ipynb_checkpoints/ +.Rhistory diff --git a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb index ba477beb..2b369886 100644 --- a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb +++ b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb @@ -20,16 +20,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from unipressed import IdMappingClient\n", "import time\n", "import pandas as pd\n", - "import numpy as np\n", - "import agoradatatools.etl.utils as utils\n", - "import agoradatatools.etl.extract as extract\n", + "import preprocessing_utils\n", "\n", "config_filename = \"../../../../config.yaml\"" ] @@ -43,157 +41,19 @@ "Loop through all data sets in the config file to get all Ensembl IDs used in every data set. NOTE: In the future, it would be simpler to just load the `gene_metadata` data set once druggability genes are removed from it, rather than looping through all of these files. 
" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'genes_biodomains': ('syn44151254.5', 'csv'),\n", - " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n", - " 'proteomics': ('syn18689335.3', 'csv'),\n", - " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n", - " 'proteomics_srm': ('syn52579640.4', 'csv'),\n", - " 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n", - " 'metabolomics': ('syn26064497.1', 'feather'),\n", - " 'igap': ('syn12514826.5', 'csv'),\n", - " 'eqtl': ('syn12514912.3', 'csv'),\n", - " 'diff_exp_data': ('syn27211942.1', 'tsv'),\n", - " 'target_list': ('syn12540368.47', 'csv'),\n", - " 'median_expression': ('syn27211878.2', 'csv'),\n", - " 'tep_adi_info': ('syn51942280.2', 'csv'),\n", - " 'team_info': ('syn12615624.18', 'csv'),\n", - " 'team_member_info': ('syn12615633.18', 'csv'),\n", - " 'overall_scores': ('syn25575156.13', 'table'),\n", - " 'networks': ('syn11685347.1', 'csv')}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config = utils._get_config(config_path=config_filename)\n", - "datasets = config[\"datasets\"]\n", - "\n", - "files = {}\n", - "\n", - "for dataset in datasets:\n", - " dataset_name = list(dataset.keys())[0]\n", - "\n", - " for entity in dataset[dataset_name][\"files\"]:\n", - " entity_id = entity[\"id\"]\n", - " entity_format = entity[\"format\"]\n", - " entity_name = entity[\"name\"]\n", - "\n", - " # Ignore json files, which are post-processed and not what we're interested in.\n", - " # Also ignore \"druggability\" since we want to exclude druggability-only genes, and \n", - " # \"gene_metadata\" which includes druggability genes.\n", - " if entity_format != \"json\" and entity_name not in [\"druggability\", \"gene_metadata\"]:\n", - " files[entity_name] = (entity_id, entity_format)\n", - "\n", - "# There are some duplicate synID's in this list but that doesn't 
really matter\n", - "files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### We should now have a list of all raw data files ingested. Get each one and create a list of IDs." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "UPGRADE AVAILABLE\n", - "\n", - "A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n", - " pip install --upgrade synapseclient\n", - "\n", - "Python Synapse Client version 4.6.0 release notes\n", - "\n", - "https://python-docs.synapse.org/news/\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Welcome, Jaclyn Beck!\n", - "\n", - "INFO: 2024-11-15 11:43:36 | synapseclient_default | Welcome, Jaclyn Beck!\n", - "\n", - "genes_biodomains has an NaN Ensembl ID\n", - "WARNING: no Ensembl ID column found for team_info!\n", - "WARNING: no Ensembl ID column found for team_member_info!\n" - ] - } - ], - "source": [ - "syn = utils._login_to_synapse(token=None) # Assumes you have already logged in with a valid token\n", - "\n", - "# The various column names used to store Ensembl IDs in the files\n", - "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n", - "file_ensembl_list = []\n", - "\n", - "for file in files.keys():\n", - " df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n", - "\n", - " file_ensembl_ids = None\n", - "\n", - " for C in col_names:\n", - " if C in df.columns:\n", - " file_ensembl_ids = df[C]\n", - "\n", - " # networks file is a special case\n", - " if file == \"networks\":\n", - " file_ensembl_ids = pd.melt(\n", - " df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n", - " )[\"value\"]\n", - "\n", - " if file_ensembl_ids is not None:\n", - " file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n", - " if \"n/A\" in 
file_ensembl_ids.tolist():\n", - " print(file + \" has an n/A Ensembl ID\")\n", - " file_ensembl_list.remove(\"n/A\")\n", - " if np.NaN in file_ensembl_ids.tolist():\n", - " print(file + \" has an NaN Ensembl ID\")\n", - " else:\n", - " print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "35858\n" - ] - } - ], + "outputs": [], "source": [ - "file_ensembl_list = list(set(file_ensembl_list))\n", - "\n", - "# NaNs will be floats, so this removes them. Using np.isnan() on strings throws an error.\n", - "ensembl_ids = [x for x in file_ensembl_list if isinstance(x, str)]\n", - "\n", - "print(len(ensembl_ids))" + "ensembl_ids = preprocessing_utils.get_all_adt_ensembl_ids(\n", + " config_filename=config_filename,\n", + " exclude_files=[\"gene_metadata\", \"druggability\"],\n", + " token=None,\n", + ")\n", + "print(\"\")\n", + "print(str(len(ensembl_ids)) + \" Ensembl IDs found.\")" ] }, { diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb index 9ef8fedf..7550d17c 100644 --- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb +++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb @@ -108,178 +108,43 @@ "source": [ "## Get Ensembl IDs from data sets that will be processed by agora-data-tools\n", "\n", - "Loop through all data sets in the config file to get all Ensembl IDs used in every data set." + "Loop through all data sets in the config file to get all Ensembl IDs used in every data set. Exclude `gene_metadata` since that's the file we are building, and `druggability` since that data is deprecated." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "a3fdbeec", "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'genes_biodomains': ('syn44151254.5', 'csv'),\n", - " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n", - " 'proteomics': ('syn18689335.3', 'csv'),\n", - " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n", - " 'proteomics_srm': ('syn52579640.4', 'csv'),\n", - " 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n", - " 'metabolomics': ('syn26064497.1', 'feather'),\n", - " 'igap': ('syn12514826.5', 'csv'),\n", - " 'eqtl': ('syn12514912.3', 'csv'),\n", - " 'diff_exp_data': ('syn27211942.1', 'tsv'),\n", - " 'target_list': ('syn12540368.47', 'csv'),\n", - " 'median_expression': ('syn27211878.2', 'csv'),\n", - " 'druggability': ('syn13363443.11', 'csv'),\n", - " 'tep_adi_info': ('syn51942280.2', 'csv'),\n", - " 'team_info': ('syn12615624.18', 'csv'),\n", - " 'team_member_info': ('syn12615633.18', 'csv'),\n", - " 'overall_scores': ('syn25575156.13', 'table'),\n", - " 'networks': ('syn11685347.1', 'csv')}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "config = utils._get_config(config_path=config_filename)\n", - "datasets = config[\"datasets\"]\n", - "\n", - "files = {}\n", - "\n", - "for dataset in datasets:\n", - " dataset_name = list(dataset.keys())[0]\n", - "\n", - " for entity in dataset[dataset_name][\"files\"]:\n", - " entity_id = entity[\"id\"]\n", - " entity_format = entity[\"format\"]\n", - " entity_name = entity[\"name\"]\n", - "\n", - " # Ignore json files, which are post-processed and not what we're interested in.\n", - " # Also ignore \"gene_metadata\" since that's the file we're making here.\n", - " if entity_format != \"json\" and entity_name != \"gene_metadata\":\n", - " files[entity_name] = (entity_id, entity_format)\n", - "\n", - "# There are some duplicate synID's in this 
list but that doesn't really matter\n", - "files" + "file_ensembl_list = preprocessing_utils.get_all_adt_ensembl_ids(\n", + " config_filename=config_filename,\n", + " exclude_files=[\"gene_metadata\", \"druggability\"],\n", + " token=None,\n", + ")\n", + "print(\"\")\n", + "print(str(len(file_ensembl_list)) + \" Ensembl IDs found.\")\n", + "print(file_ensembl_list[0:5])" ] }, { "cell_type": "markdown", - "id": "8f1a2120", + "id": "5fa76bfb", "metadata": {}, "source": [ - "### We should now have a list of all raw data files ingested. Get each one and create a list of IDs." + "Create a data frame with these IDs so it can be merged with the MyGene query results below." ] }, { "cell_type": "code", - "execution_count": 4, - "id": "9843689d", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "UPGRADE AVAILABLE\n", - "\n", - "A more recent version of the Synapse Client (4.2.0) is available. Your version (4.0.0) can be upgraded by typing:\n", - " pip install --upgrade synapseclient\n", - "\n", - "Python Synapse Client version 4.2.0 release notes\n", - "\n", - "https://python-docs.synapse.org/news/\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Welcome, Jaclyn Beck!\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:synapseclient_default:Welcome, Jaclyn Beck!\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "genes_biodomains has an NaN Ensembl ID\n", - "WARNING: no Ensembl ID column found for team_info!\n", - "WARNING: no Ensembl ID column found for team_member_info!\n" - ] - } - ], - "source": [ - "syn = utils._login_to_synapse(\n", - " token=None\n", - ") # Assumes you have already logged in with a valid token\n", - "\n", - "# The various column names used to store Ensembl IDs in the files\n", - "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n", - 
"file_ensembl_list = []\n", - "\n", - "for file in files.keys():\n", - " df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n", - "\n", - " file_ensembl_ids = None\n", - "\n", - " for C in col_names:\n", - " if C in df.columns:\n", - " file_ensembl_ids = df[C]\n", - "\n", - " # networks file is a special case\n", - " if file == \"networks\":\n", - " file_ensembl_ids = pd.melt(\n", - " df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n", - " )[\"value\"]\n", - "\n", - " if file_ensembl_ids is not None:\n", - " file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n", - " if \"n/A\" in file_ensembl_ids.tolist():\n", - " print(file + \" has an n/A Ensembl ID\")\n", - " file_ensembl_list.remove(\"n/A\")\n", - " if np.NaN in file_ensembl_ids.tolist():\n", - " print(file + \" has an NaN Ensembl ID\")\n", - " else:\n", - " print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "f1303e5b", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "37452\n" - ] - } - ], + "outputs": [], "source": [ - "file_ensembl_list = list(set(file_ensembl_list))\n", - "\n", "ensembl_ids_df = pd.DataFrame({\"ensembl_gene_id\": file_ensembl_list})\n", "\n", "\"\"\" Removed due to no longer getting genes from BioMart, but saving code\n", @@ -300,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "4e7a37c8", "metadata": {}, "outputs": [], @@ -321,243 +186,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "7ebd03d4", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:biothings.client:querying 1-1000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 1001-2000...\n", - "INFO:biothings.client:done.\n", - 
"INFO:biothings.client:querying 2001-3000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 3001-4000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 4001-5000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 5001-6000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 6001-7000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 7001-8000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 8001-9000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 9001-10000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 10001-11000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 11001-12000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 12001-13000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 13001-14000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 14001-15000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 15001-16000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 16001-17000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 17001-18000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 18001-19000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 19001-20000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 20001-21000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 21001-22000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 22001-23000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 23001-24000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 24001-25000...\n", - 
"INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 25001-26000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 26001-27000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 27001-28000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 28001-29000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 29001-30000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 30001-31000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 31001-32000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 32001-33000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 33001-34000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 34001-35000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 35001-36000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 36001-37000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 37001-37452...\n", - "INFO:biothings.client:done.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
_id_versionaliasnamesummarysymboltype_of_genenotfound
ensembl_gene_id
ENSG00000164972846882.0[C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]sperm microtubule inner protein 6This gene encodes a nuclear- or perinuclear-lo...SPMIP6protein-codingNaN
ENSG000001691051131892.0[ATCS, D4ST1, EDSMC1, HNK1ST]carbohydrate sulfotransferase 14This gene encodes a member of the HNK-1 family...CHST14protein-codingNaN
ENSG00000255136ENSG000002551361.0NaNTPBGL antisense RNA 1NaNTPBGL-AS1NaNNaN
ENSG0000010549986051.0CPLA2-gammaphospholipase A2 group IVCThis gene encodes a protein which is a member ...PLA2G4Cprotein-codingNaN
ENSG00000104611638981.0[PPP1R38, SH2A]SH2 domain containing 4AEnables phosphatase binding activity. Located ...SH2D4Aprotein-codingNaN
\n", - "
" - ], - "text/plain": [ - " _id _version \\\n", - "ensembl_gene_id \n", - "ENSG00000164972 84688 2.0 \n", - "ENSG00000169105 113189 2.0 \n", - "ENSG00000255136 ENSG00000255136 1.0 \n", - "ENSG00000105499 8605 1.0 \n", - "ENSG00000104611 63898 1.0 \n", - "\n", - " alias \\\n", - "ensembl_gene_id \n", - "ENSG00000164972 [C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4] \n", - "ENSG00000169105 [ATCS, D4ST1, EDSMC1, HNK1ST] \n", - "ENSG00000255136 NaN \n", - "ENSG00000105499 CPLA2-gamma \n", - "ENSG00000104611 [PPP1R38, SH2A] \n", - "\n", - " name \\\n", - "ensembl_gene_id \n", - "ENSG00000164972 sperm microtubule inner protein 6 \n", - "ENSG00000169105 carbohydrate sulfotransferase 14 \n", - "ENSG00000255136 TPBGL antisense RNA 1 \n", - "ENSG00000105499 phospholipase A2 group IVC \n", - "ENSG00000104611 SH2 domain containing 4A \n", - "\n", - " summary symbol \\\n", - "ensembl_gene_id \n", - "ENSG00000164972 This gene encodes a nuclear- or perinuclear-lo... SPMIP6 \n", - "ENSG00000169105 This gene encodes a member of the HNK-1 family... CHST14 \n", - "ENSG00000255136 NaN TPBGL-AS1 \n", - "ENSG00000105499 This gene encodes a protein which is a member ... PLA2G4C \n", - "ENSG00000104611 Enables phosphatase binding activity. Located ... 
SH2D4A \n", - "\n", - " type_of_gene notfound \n", - "ensembl_gene_id \n", - "ENSG00000164972 protein-coding NaN \n", - "ENSG00000169105 protein-coding NaN \n", - "ENSG00000255136 NaN NaN \n", - "ENSG00000105499 protein-coding NaN \n", - "ENSG00000104611 protein-coding NaN " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "mg = mygene.MyGeneInfo()\n", "\n", @@ -573,21 +207,12 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "23bb114e", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Annotations found for 36284 genes.\n", - "No annotations found for 1175 genes.\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Annotations found for \" + str(sum(mygene_output[\"notfound\"].isna())) + \" genes.\")\n", "print(\n", @@ -611,158 +236,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "186d8cb8", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(37459, 9)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_id_id_versionaliasnamesummarysymboltype_of_genenotfound
0ENSG00000164972846882.0[C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]sperm microtubule inner protein 6This gene encodes a nuclear- or perinuclear-lo...SPMIP6protein-codingNaN
1ENSG000001691051131892.0[ATCS, D4ST1, EDSMC1, HNK1ST]carbohydrate sulfotransferase 14This gene encodes a member of the HNK-1 family...CHST14protein-codingNaN
2ENSG00000255136ENSG000002551361.0NaNTPBGL antisense RNA 1NaNTPBGL-AS1NaNNaN
3ENSG0000010549986051.0CPLA2-gammaphospholipase A2 group IVCThis gene encodes a protein which is a member ...PLA2G4Cprotein-codingNaN
4ENSG00000104611638981.0[PPP1R38, SH2A]SH2 domain containing 4AEnables phosphatase binding activity. Located ...SH2D4Aprotein-codingNaN
\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id _id _version \\\n", - "0 ENSG00000164972 84688 2.0 \n", - "1 ENSG00000169105 113189 2.0 \n", - "2 ENSG00000255136 ENSG00000255136 1.0 \n", - "3 ENSG00000105499 8605 1.0 \n", - "4 ENSG00000104611 63898 1.0 \n", - "\n", - " alias \\\n", - "0 [C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4] \n", - "1 [ATCS, D4ST1, EDSMC1, HNK1ST] \n", - "2 NaN \n", - "3 CPLA2-gamma \n", - "4 [PPP1R38, SH2A] \n", - "\n", - " name \\\n", - "0 sperm microtubule inner protein 6 \n", - "1 carbohydrate sulfotransferase 14 \n", - "2 TPBGL antisense RNA 1 \n", - "3 phospholipase A2 group IVC \n", - "4 SH2 domain containing 4A \n", - "\n", - " summary symbol \\\n", - "0 This gene encodes a nuclear- or perinuclear-lo... SPMIP6 \n", - "1 This gene encodes a member of the HNK-1 family... CHST14 \n", - "2 NaN TPBGL-AS1 \n", - "3 This gene encodes a protein which is a member ... PLA2G4C \n", - "4 Enables phosphatase binding activity. Located ... SH2D4A \n", - "\n", - " type_of_gene notfound \n", - "0 protein-coding NaN \n", - "1 protein-coding NaN \n", - "2 NaN NaN \n", - "3 protein-coding NaN \n", - "4 protein-coding NaN " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "gene_table_merged = pd.merge(\n", " left=ensembl_ids_df,\n", @@ -791,36 +270,15 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "285c10d2", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "# NaN or NULL alias values become empty lists\n", - "for row in gene_table_merged.loc[gene_table_merged[\"alias\"].isnull(), \"alias\"].index:\n", - " gene_table_merged.at[row, \"alias\"] = []\n", - "\n", - "# Some alias values are a single string, not a list. 
Turn them into lists here.\n", - "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n", - " lambda cell: cell if isinstance(cell, list) else [cell]\n", - ")\n", - "\n", - "\n", - "# Some alias values are lists of lists or have duplicate values\n", - "def flatten(row):\n", - " flattened = []\n", - " for item in row:\n", - " if isinstance(item, list):\n", - " flattened = flattened + item\n", - " else:\n", - " flattened.append(item)\n", - " return flattened\n", - "\n", - "\n", "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n", - " lambda row: list(set(flatten(row)))\n", + " preprocessing_utils.standardize_list_item\n", ")" ] }, @@ -831,542 +289,42 @@ "source": [ "## Remove duplicate Ensembl IDs from the list. \n", "\n", - "Duplicates in the list typically have the same Ensembl ID but different gene symbols. This usually happens when a single Ensembl ID maps to multiple Entrez IDs in the NCBI database. There's not a good way to reconcile this, so we first check for entries whose `symbol` is something other than \"LOC#######\", and designate that entry as the main row. If there are multiple or zero entries meeting that criteria, we just use the first entry in the list for each ensembl ID and discard the rest, which is what the Agora front end does. The gene symbols of duplicate rows are then added as aliases to the matching unique row." + "Duplicates in the list typically have the same Ensembl ID but different gene symbols. This usually happens when a single Ensembl ID maps to multiple Entrez IDs in the NCBI database. For every set of duplicated rows with the same Ensembl ID, we remove all rows but the first row in the set, and the symbols and aliases of the removed rows get added to the \"alias\" field of the first row." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "bc63cc53", "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_id_id_versionaliasnamesummarysymboltype_of_genenotfound
6011ENSG000002765181289667221.0[]putative killer cell immunoglobulin-like recep...NaNLOC128966722protein-codingNaN
6012ENSG000002765181289667321.0[]putative killer cell immunoglobulin-like recep...NaNLOC128966732protein-codingNaN
6013ENSG000002765181289667301.0[]putative killer cell immunoglobulin-like recep...NaNLOC128966730protein-codingNaN
6014ENSG000002765181289667311.0[]putative killer cell immunoglobulin-like recep...NaNLOC128966731protein-codingNaN
6015ENSG000002765181289667331.0[]putative killer cell immunoglobulin-like recep...NaNLOC128966733protein-codingNaN
12139ENSG000002303731001332201.0[GOLGA6L3]golgin A6 family like 3, pseudogeneNaNGOLGA6L3PpseudoNaN
12140ENSG000002303736424021.0[GOLGA6L21P]golgin A6 family like 17, pseudogeneNaNGOLGA6L17PpseudoNaN
23329ENSG000002763871249005711.0[]killer cell immunoglobulin-like receptor 2DS1NaNLOC124900571protein-codingNaN
23330ENSG0000027638738022.0[NKAT1, KIR2DL3, NKAT, KIR221, CD158A, p58.1, ...killer cell immunoglobulin like receptor, two ...Killer cell immunoglobulin-like receptors (KIR...KIR2DL1protein-codingNaN
31304ENSG000002497382856261.0[]uncharacterized LOC285626NaNLOC285626ncRNANaN
31305ENSG000002497381053776831.0[]uncharacterized LOC105377683NaNLOC105377683ncRNANaN
\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id _id _version \\\n", - "6011 ENSG00000276518 128966722 1.0 \n", - "6012 ENSG00000276518 128966732 1.0 \n", - "6013 ENSG00000276518 128966730 1.0 \n", - "6014 ENSG00000276518 128966731 1.0 \n", - "6015 ENSG00000276518 128966733 1.0 \n", - "12139 ENSG00000230373 100133220 1.0 \n", - "12140 ENSG00000230373 642402 1.0 \n", - "23329 ENSG00000276387 124900571 1.0 \n", - "23330 ENSG00000276387 3802 2.0 \n", - "31304 ENSG00000249738 285626 1.0 \n", - "31305 ENSG00000249738 105377683 1.0 \n", - "\n", - " alias \\\n", - "6011 [] \n", - "6012 [] \n", - "6013 [] \n", - "6014 [] \n", - "6015 [] \n", - "12139 [GOLGA6L3] \n", - "12140 [GOLGA6L21P] \n", - "23329 [] \n", - "23330 [NKAT1, KIR2DL3, NKAT, KIR221, CD158A, p58.1, ... \n", - "31304 [] \n", - "31305 [] \n", - "\n", - " name \\\n", - "6011 putative killer cell immunoglobulin-like recep... \n", - "6012 putative killer cell immunoglobulin-like recep... \n", - "6013 putative killer cell immunoglobulin-like recep... \n", - "6014 putative killer cell immunoglobulin-like recep... \n", - "6015 putative killer cell immunoglobulin-like recep... \n", - "12139 golgin A6 family like 3, pseudogene \n", - "12140 golgin A6 family like 17, pseudogene \n", - "23329 killer cell immunoglobulin-like receptor 2DS1 \n", - "23330 killer cell immunoglobulin like receptor, two ... \n", - "31304 uncharacterized LOC285626 \n", - "31305 uncharacterized LOC105377683 \n", - "\n", - " summary symbol \\\n", - "6011 NaN LOC128966722 \n", - "6012 NaN LOC128966732 \n", - "6013 NaN LOC128966730 \n", - "6014 NaN LOC128966731 \n", - "6015 NaN LOC128966733 \n", - "12139 NaN GOLGA6L3P \n", - "12140 NaN GOLGA6L17P \n", - "23329 NaN LOC124900571 \n", - "23330 Killer cell immunoglobulin-like receptors (KIR... 
KIR2DL1 \n", - "31304 NaN LOC285626 \n", - "31305 NaN LOC105377683 \n", - "\n", - " type_of_gene notfound \n", - "6011 protein-coding NaN \n", - "6012 protein-coding NaN \n", - "6013 protein-coding NaN \n", - "6014 protein-coding NaN \n", - "6015 protein-coding NaN \n", - "12139 pseudo NaN \n", - "12140 pseudo NaN \n", - "23329 protein-coding NaN \n", - "23330 protein-coding NaN \n", - "31304 ncRNA NaN \n", - "31305 ncRNA NaN " - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# duplicated() will return true if the ID is a duplicate and is not the first one to appear the list.\n", + "# For printing only\n", "dupes = gene_table_merged[\"ensembl_gene_id\"].duplicated()\n", - "dupe_vals = gene_table_merged[dupes]\n", + "dupe_ids = gene_table_merged.loc[dupes, \"ensembl_gene_id\"]\n", + "print(\n", + " gene_table_merged.loc[\n", + " gene_table_merged[\"ensembl_gene_id\"].isin(dupe_ids),\n", + " [\"ensembl_gene_id\", \"symbol\", \"alias\"],\n", + " ]\n", + ")\n", "\n", - "# Rows with duplicated Ensembl IDs\n", - "all_duplicated = gene_table_merged.loc[\n", - " gene_table_merged[\"ensembl_gene_id\"].isin(dupe_vals[\"ensembl_gene_id\"])\n", - "]\n", - "all_duplicated" + "# Remove duplicates\n", + "gene_table_merged = preprocessing_utils.merge_duplicate_ensembl_ids(gene_table_merged)" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "093a2e98", + "execution_count": null, + "id": "bc76d96e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4 duplicated genes have been processed.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_id_id_versionaliasnamesummarysymboltype_of_genenotfound
37442ENSG00000163811231601.0[NET12, UTP5]WD repeat domain 43Enables RNA binding activity. Involved in posi...WDR43protein-codingNaN
37443ENSG00000226467105541.0[G15, LPLAT1, 1-AGPAT1, LPAATA, LPAAT-alpha]1-acylglycerol-3-phosphate O-acyltransferase 1This gene encodes an enzyme that converts lyso...AGPAT1protein-codingNaN
37444ENSG00000120533569431.0[Sus1, e(y)2, DC6]ENY2 transcription and export complex 2 subunitEnables nuclear receptor coactivator activity....ENY2protein-codingNaN
37445ENSG00000214759ENSG000002147591.0[]ribosomal protein L36a pseudogene 2NaNRPL36AP2NaNNaN
37446ENSG00000253981ENSG000002539811.0[]ALG1 like 13, pseudogeneNaNALG1L13PNaNNaN
37447ENSG000002672061580621.0[hLcn5, LCN5, UNQ643]lipocalin 6Predicted to enable small molecule binding act...LCN6protein-codingNaN
37448ENSG0000027638738022.0[NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...killer cell immunoglobulin like receptor, two ...Killer cell immunoglobulin-like receptors (KIR...KIR2DL1protein-codingNaN
37449ENSG000002765181289667221.0[LOC128966730, LOC128966732, LOC128966731, LOC...putative killer cell immunoglobulin-like recep...NaNLOC128966722protein-codingNaN
37450ENSG000002303731001332201.0[GOLGA6L21P, GOLGA6L17P, GOLGA6L3]golgin A6 family like 3, pseudogeneNaNGOLGA6L3PpseudoNaN
37451ENSG000002497382856261.0[LOC105377683]uncharacterized LOC285626NaNLOC285626ncRNANaN
\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id _id _version \\\n", - "37442 ENSG00000163811 23160 1.0 \n", - "37443 ENSG00000226467 10554 1.0 \n", - "37444 ENSG00000120533 56943 1.0 \n", - "37445 ENSG00000214759 ENSG00000214759 1.0 \n", - "37446 ENSG00000253981 ENSG00000253981 1.0 \n", - "37447 ENSG00000267206 158062 1.0 \n", - "37448 ENSG00000276387 3802 2.0 \n", - "37449 ENSG00000276518 128966722 1.0 \n", - "37450 ENSG00000230373 100133220 1.0 \n", - "37451 ENSG00000249738 285626 1.0 \n", - "\n", - " alias \\\n", - "37442 [NET12, UTP5] \n", - "37443 [G15, LPLAT1, 1-AGPAT1, LPAATA, LPAAT-alpha] \n", - "37444 [Sus1, e(y)2, DC6] \n", - "37445 [] \n", - "37446 [] \n", - "37447 [hLcn5, LCN5, UNQ643] \n", - "37448 [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C... \n", - "37449 [LOC128966730, LOC128966732, LOC128966731, LOC... \n", - "37450 [GOLGA6L21P, GOLGA6L17P, GOLGA6L3] \n", - "37451 [LOC105377683] \n", - "\n", - " name \\\n", - "37442 WD repeat domain 43 \n", - "37443 1-acylglycerol-3-phosphate O-acyltransferase 1 \n", - "37444 ENY2 transcription and export complex 2 subunit \n", - "37445 ribosomal protein L36a pseudogene 2 \n", - "37446 ALG1 like 13, pseudogene \n", - "37447 lipocalin 6 \n", - "37448 killer cell immunoglobulin like receptor, two ... \n", - "37449 putative killer cell immunoglobulin-like recep... \n", - "37450 golgin A6 family like 3, pseudogene \n", - "37451 uncharacterized LOC285626 \n", - "\n", - " summary symbol \\\n", - "37442 Enables RNA binding activity. Involved in posi... WDR43 \n", - "37443 This gene encodes an enzyme that converts lyso... AGPAT1 \n", - "37444 Enables nuclear receptor coactivator activity.... ENY2 \n", - "37445 NaN RPL36AP2 \n", - "37446 NaN ALG1L13P \n", - "37447 Predicted to enable small molecule binding act... LCN6 \n", - "37448 Killer cell immunoglobulin-like receptors (KIR... 
KIR2DL1 \n", - "37449 NaN LOC128966722 \n", - "37450 NaN GOLGA6L3P \n", - "37451 NaN LOC285626 \n", - "\n", - " type_of_gene notfound \n", - "37442 protein-coding NaN \n", - "37443 protein-coding NaN \n", - "37444 protein-coding NaN \n", - "37445 NaN NaN \n", - "37446 NaN NaN \n", - "37447 protein-coding NaN \n", - "37448 protein-coding NaN \n", - "37449 protein-coding NaN \n", - "37450 pseudo NaN \n", - "37451 ncRNA NaN " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "non_dupes = set(gene_table_merged.index) - set(all_duplicated.index)\n", - "keep_df = gene_table_merged.loc[list(non_dupes)].copy(deep=True)\n", - "\n", - "# For each duplicated Ensembl ID, collapse to 1 row and append that row to keep_df\n", - "for ens_id in set(all_duplicated[\"ensembl_gene_id\"]):\n", - " group = all_duplicated.loc[all_duplicated[\"ensembl_gene_id\"] == ens_id].copy(\n", - " deep=True\n", - " )\n", - " # Put any entries with symbols that aren't \"LOC#####\" at the top of the data frame\n", - " matches = group[\"symbol\"].str.startswith(\"LOC\") == False\n", - " group = pd.concat([group.loc[matches], group.loc[matches == False]]).reset_index(\n", - " drop=True\n", - " )\n", - "\n", - " # Add all duplicate symbols and their aliases to the alias field of the first entry\n", - " for row in group.index[1:]:\n", - " group.at[group.index[0], \"alias\"].append(group[\"symbol\"][row])\n", - " if len(group.at[row, \"alias\"]) > 0:\n", - " group.at[group.index[0], \"alias\"] = (\n", - " group.at[group.index[0], \"alias\"] + group[\"alias\"][row]\n", - " )\n", - "\n", - " # Make sure we didn't add duplicate aliases\n", - " group.at[group.index[0], \"alias\"] = list(set(group.at[group.index[0], \"alias\"]))\n", - "\n", - " # Keep the first row only, which now has all the aliases\n", - " keep_df = pd.concat([keep_df, group.iloc[0].to_frame().T], ignore_index=True)\n", - "\n", - "print(\n", - " 
str(len(all_duplicated.drop_duplicates(\"ensembl_gene_id\")))\n", - " + \" duplicated genes have been processed.\"\n", - ")\n", - "gene_table_merged = keep_df.reset_index(drop=True)\n", - "gene_table_merged.tail(n=10)" + "print(str(len(dupe_ids.drop_duplicates())) + \" duplicated genes have been processed.\")\n", + "print(gene_table_merged.shape)\n", + "print(gene_table_merged.loc[gene_table_merged[\"ensembl_gene_id\"].isin(dupe_ids), \"alias\"])" ] }, { @@ -1383,66 +341,12 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "4a1bbdee", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " name date url version\n", - "1 Ensembl GRCh37 Feb 2014 https://grch37.ensembl.org GRCh37\n", - "2 Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org 111\n", - "3 Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org 110\n", - "4 Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org 109\n", - "5 Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org 108\n", - "6 Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org 107\n", - "7 Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org 106\n", - "8 Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org 105\n", - "9 Ensembl 104 May 2021 https://may2021.archive.ensembl.org 104\n", - "10 Ensembl 103 Feb 2021 https://feb2021.archive.ensembl.org 103\n", - "11 Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org 102\n", - "12 Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org 101\n", - "13 Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org 100\n", - "14 Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org 99\n", - "15 Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org 98\n", - "16 Ensembl 97 Jul 2019 https://jul2019.archive.ensembl.org 97\n", - "17 Ensembl 96 Apr 2019 https://apr2019.archive.ensembl.org 96\n", - "18 Ensembl 95 Jan 2019 https://jan2019.archive.ensembl.org 95\n", - "19 Ensembl 80 May 
2015 https://may2015.archive.ensembl.org 80\n", - "20 Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org 77\n", - "21 Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org 75\n", - "22 Ensembl 54 May 2009 https://may2009.archive.ensembl.org 54\n", - " current_release\n", - "1 \n", - "2 *\n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 \n", - "12 \n", - "13 \n", - "14 \n", - "15 \n", - "16 \n", - "17 \n", - "18 \n", - "19 \n", - "20 \n", - "21 \n", - "22 \n", - "\n" - ] - } - ], + "outputs": [], "source": [ "archive_df = r.listEnsemblArchives()\n", "archive_df.to_csvfile(path=archive_filename, row_names=False, quote=False)\n", @@ -1462,291 +366,38 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "9a747309", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "37452\n", - "Querying genes 1 - 1000\n", - "Querying genes 1001 - 2000\n", - "Querying genes 2001 - 3000\n", - "Querying genes 3001 - 4000\n", - "Querying genes 4001 - 5000\n", - "Querying genes 5001 - 6000\n", - "Querying genes 6001 - 7000\n", - "Querying genes 7001 - 8000\n", - "Querying genes 8001 - 9000\n", - "Querying genes 9001 - 10000\n", - "Querying genes 10001 - 11000\n", - "Querying genes 11001 - 12000\n", - "Querying genes 12001 - 13000\n", - "Querying genes 13001 - 14000\n", - "Querying genes 14001 - 15000\n", - "Querying genes 15001 - 16000\n", - "Querying genes 16001 - 17000\n", - "Querying genes 17001 - 18000\n", - "Querying genes 18001 - 19000\n", - "Querying genes 19001 - 20000\n", - "Querying genes 20001 - 21000\n", - "Querying genes 21001 - 22000\n", - "Querying genes 22001 - 23000\n", - "Querying genes 23001 - 24000\n", - "Querying genes 24001 - 25000\n", - "Querying genes 25001 - 26000\n", - "Querying genes 26001 - 27000\n", - "Querying genes 27001 - 28000\n", - "Querying genes 28001 - 29000\n", - "Querying genes 29001 - 30000\n", - "Querying 
genes 30001 - 31000\n", - "Querying genes 31001 - 32000\n", - "Querying genes 32001 - 33000\n", - "Querying genes 33001 - 34000\n", - "Querying genes 34001 - 35000\n", - "Querying genes 35001 - 36000\n", - "Querying genes 36001 - 37000\n", - "Querying genes 37001 - 37452\n", - "37452\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
is_currentassemblyidversiontypepeptidelatestpossible_replacementrelease
374471GRCh38ENSG000002672066GeneNoneENSG00000267206.6[]111
374481GRCh38ENSG000002763874GeneNoneENSG00000276387.4[]111
374491GRCh38ENSG000002765181GeneNoneENSG00000276518.1[]111
374501GRCh38ENSG000002303739GeneNoneENSG00000230373.9[]111
374511GRCh38ENSG0000024973810GeneNoneENSG00000249738.10[]111
\n", - "
" - ], - "text/plain": [ - " is_current assembly id version type peptide \\\n", - "37447 1 GRCh38 ENSG00000267206 6 Gene None \n", - "37448 1 GRCh38 ENSG00000276387 4 Gene None \n", - "37449 1 GRCh38 ENSG00000276518 1 Gene None \n", - "37450 1 GRCh38 ENSG00000230373 9 Gene None \n", - "37451 1 GRCh38 ENSG00000249738 10 Gene None \n", - "\n", - " latest possible_replacement release \n", - "37447 ENSG00000267206.6 [] 111 \n", - "37448 ENSG00000276387.4 [] 111 \n", - "37449 ENSG00000276518.1 [] 111 \n", - "37450 ENSG00000230373.9 [] 111 \n", - "37451 ENSG00000249738.10 [] 111 " - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "url = \"https://rest.ensembl.org/archive/id\"\n", - "headers = {\"Content-Type\": \"application/json\", \"Accept\": \"application/json\"}\n", - "\n", - "ids = gene_table_merged[\"ensembl_gene_id\"].tolist()\n", - "print(len(ids))\n", - "\n", - "# We can only query 1000 genes at a time\n", - "batch_ind = range(0, len(ids), 1000)\n", - "results = []\n", - "\n", - "for B in batch_ind:\n", - " end = min(len(ids), B + 1000)\n", - " print(\"Querying genes \" + str(B + 1) + \" - \" + str(end))\n", - "\n", - " request_data = '{ \"id\" : ' + str(ids[B:end]) + \" }\"\n", - " request_data = request_data.replace(\"'\", '\"')\n", - "\n", - " ok = False\n", - " tries = 0\n", - "\n", - " while tries < 5 and not ok:\n", - " try:\n", - " res = requests.post(url, headers=headers, data=request_data)\n", - " ok = res.ok\n", - " except:\n", - " ok = False\n", - "\n", - " tries = tries + 1\n", - "\n", - " if not ok:\n", - " # res.raise_for_status()\n", - " print(\n", - " \"Error retrieving Ensembl versions for genes \"\n", - " + str(B + 1)\n", - " + \" - \"\n", - " + str(end)\n", - " + \". 
Trying again...\"\n", - " )\n", - " else:\n", - " results = results + res.json()\n", - " break\n", - "\n", - "print(len(results))\n", - "\n", - "versions = pd.json_normalize(results)\n", + "versions = preprocessing_utils.query_ensembl_version_api(\n", + " ensembl_ids=gene_table_merged[\"ensembl_gene_id\"].tolist()\n", + ")\n", "\n", "versions.tail()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "5c108238", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "release\n", - "100 22\n", - "101 8\n", - "102 16\n", - "103 15\n", - "104 19\n", - "105 9\n", - "106 35\n", - "107 10\n", - "108 4\n", - "109 4\n", - "110 11\n", - "111 36286\n", - "80 21\n", - "81 2\n", - "82 10\n", - "84 673\n", - "87 61\n", - "89 20\n", - "91 75\n", - "93 53\n", - "95 33\n", - "96 31\n", - "97 18\n", - "98 9\n", - "99 7\n", - "dtype: int64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions.groupby(\"release\").size()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "bf5aecb1", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "37452\n", - "37452\n", - "True\n" - ] - } - ], + "outputs": [], "source": [ "# Check that all IDs are the same between the result and the gene table\n", "print(len(versions[\"id\"]))\n", @@ -1759,21 +410,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "7fc8bbcd", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Make sure everything is GRCh38, not GRCh37\n", "all(versions[\"assembly\"] == \"GRCh38\")" @@ -1791,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "0d5b5652", "metadata": { "scrolled": 
true @@ -1815,40 +455,10 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "337b2890", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "closest_release\n", - "80 915\n", - "95 33\n", - "96 31\n", - "97 18\n", - "98 9\n", - "99 7\n", - "100 22\n", - "101 8\n", - "102 16\n", - "103 15\n", - "104 19\n", - "105 9\n", - "106 35\n", - "107 10\n", - "108 4\n", - "109 4\n", - "110 11\n", - "111 36286\n", - "dtype: int64" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions[\"closest_release\"] = 0\n", "\n", @@ -1865,149 +475,12 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "343e5006", "metadata": { "scrolled": false }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
is_currentassemblyidversiontypepeptidelatestpossible_replacementreleaseclosest_releasepermalink
01GRCh38ENSG0000016497214GeneNoneENSG00000164972.14[]111111https://jan2024.archive.ensembl.org/Homo_sapie...
11GRCh38ENSG000001691058GeneNoneENSG00000169105.8[]111111https://jan2024.archive.ensembl.org/Homo_sapie...
21GRCh38ENSG000002551363GeneNoneENSG00000255136.3[]111111https://jan2024.archive.ensembl.org/Homo_sapie...
31GRCh38ENSG0000010549914GeneNoneENSG00000105499.14[]111111https://jan2024.archive.ensembl.org/Homo_sapie...
41GRCh38ENSG0000010461112GeneNoneENSG00000104611.12[]111111https://jan2024.archive.ensembl.org/Homo_sapie...
\n", - "
" - ], - "text/plain": [ - " is_current assembly id version type peptide \\\n", - "0 1 GRCh38 ENSG00000164972 14 Gene None \n", - "1 1 GRCh38 ENSG00000169105 8 Gene None \n", - "2 1 GRCh38 ENSG00000255136 3 Gene None \n", - "3 1 GRCh38 ENSG00000105499 14 Gene None \n", - "4 1 GRCh38 ENSG00000104611 12 Gene None \n", - "\n", - " latest possible_replacement release closest_release \\\n", - "0 ENSG00000164972.14 [] 111 111 \n", - "1 ENSG00000169105.8 [] 111 111 \n", - "2 ENSG00000255136.3 [] 111 111 \n", - "3 ENSG00000105499.14 [] 111 111 \n", - "4 ENSG00000104611.12 [] 111 111 \n", - "\n", - " permalink \n", - "0 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "1 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "2 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "3 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "4 https://jan2024.archive.ensembl.org/Homo_sapie... " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions[\"permalink\"] = \"\"\n", "\n", @@ -2024,166 +497,20 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "4b01719d", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
is_currentassemblyidversiontypepeptidelatestpossible_replacementreleaseclosest_releasepermalink
51GRCh38ENSG000002667011GeneNoneENSG00000266701.1[]8480https://may2015.archive.ensembl.org/Homo_sapie...
99GRCh38ENSG000002682252GeneNoneENSG00000268225.2[]9898https://sep2019.archive.ensembl.org/Homo_sapie...
119GRCh38ENSG000002810181GeneNoneENSG00000281018.1[]8480https://may2015.archive.ensembl.org/Homo_sapie...
120GRCh38ENSG000002160112GeneNoneENSG00000216011.2[]8480https://may2015.archive.ensembl.org/Homo_sapie...
135GRCh38ENSG000002641031GeneNoneENSG00000264103.1[]8480https://may2015.archive.ensembl.org/Homo_sapie...
\n", - "
" - ], - "text/plain": [ - " is_current assembly id version type peptide \\\n", - "51 GRCh38 ENSG00000266701 1 Gene None \n", - "99 GRCh38 ENSG00000268225 2 Gene None \n", - "119 GRCh38 ENSG00000281018 1 Gene None \n", - "120 GRCh38 ENSG00000216011 2 Gene None \n", - "135 GRCh38 ENSG00000264103 1 Gene None \n", - "\n", - " latest possible_replacement release closest_release \\\n", - "51 ENSG00000266701.1 [] 84 80 \n", - "99 ENSG00000268225.2 [] 98 98 \n", - "119 ENSG00000281018.1 [] 84 80 \n", - "120 ENSG00000216011.2 [] 84 80 \n", - "135 ENSG00000264103.1 [] 84 80 \n", - "\n", - " permalink \n", - "51 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "99 https://sep2019.archive.ensembl.org/Homo_sapie... \n", - "119 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "120 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "135 https://may2015.archive.ensembl.org/Homo_sapie... " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions[versions[\"closest_release\"] < 100].head()" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "c4128cc9", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000164972\n", - "https://jul2023.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000279049\n" - ] - } - ], + "outputs": [], "source": [ "print(versions[\"permalink\"][0])\n", "print(versions[\"permalink\"][25])" @@ -2191,21 +518,10 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "73791e6c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Does every gene have an associated URL?\n", "url_base_len = len(archive_table[\"url\"][0]) + 
1\n", @@ -2222,181 +538,10 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "f3edfd2f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(37452, 12)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_id_id_versionaliasnamesummarysymboltype_of_genenotfoundensembl_releasepossible_replacementpermalink
0ENSG00000164972846882.0[SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]sperm microtubule inner protein 6This gene encodes a nuclear- or perinuclear-lo...SPMIP6protein-codingNaN111[]https://jan2024.archive.ensembl.org/Homo_sapie...
1ENSG000001691051131892.0[ATCS, EDSMC1, HNK1ST, D4ST1]carbohydrate sulfotransferase 14This gene encodes a member of the HNK-1 family...CHST14protein-codingNaN111[]https://jan2024.archive.ensembl.org/Homo_sapie...
2ENSG00000255136ENSG000002551361.0[]TPBGL antisense RNA 1NaNTPBGL-AS1NaNNaN111[]https://jan2024.archive.ensembl.org/Homo_sapie...
3ENSG0000010549986051.0[CPLA2-gamma]phospholipase A2 group IVCThis gene encodes a protein which is a member ...PLA2G4Cprotein-codingNaN111[]https://jan2024.archive.ensembl.org/Homo_sapie...
4ENSG00000104611638981.0[PPP1R38, SH2A]SH2 domain containing 4AEnables phosphatase binding activity. Located ...SH2D4Aprotein-codingNaN111[]https://jan2024.archive.ensembl.org/Homo_sapie...
\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id _id _version \\\n", - "0 ENSG00000164972 84688 2.0 \n", - "1 ENSG00000169105 113189 2.0 \n", - "2 ENSG00000255136 ENSG00000255136 1.0 \n", - "3 ENSG00000105499 8605 1.0 \n", - "4 ENSG00000104611 63898 1.0 \n", - "\n", - " alias \\\n", - "0 [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22] \n", - "1 [ATCS, EDSMC1, HNK1ST, D4ST1] \n", - "2 [] \n", - "3 [CPLA2-gamma] \n", - "4 [PPP1R38, SH2A] \n", - "\n", - " name \\\n", - "0 sperm microtubule inner protein 6 \n", - "1 carbohydrate sulfotransferase 14 \n", - "2 TPBGL antisense RNA 1 \n", - "3 phospholipase A2 group IVC \n", - "4 SH2 domain containing 4A \n", - "\n", - " summary symbol \\\n", - "0 This gene encodes a nuclear- or perinuclear-lo... SPMIP6 \n", - "1 This gene encodes a member of the HNK-1 family... CHST14 \n", - "2 NaN TPBGL-AS1 \n", - "3 This gene encodes a protein which is a member ... PLA2G4C \n", - "4 Enables phosphatase binding activity. Located ... SH2D4A \n", - "\n", - " type_of_gene notfound ensembl_release possible_replacement \\\n", - "0 protein-coding NaN 111 [] \n", - "1 protein-coding NaN 111 [] \n", - "2 NaN NaN 111 [] \n", - "3 protein-coding NaN 111 [] \n", - "4 protein-coding NaN 111 [] \n", - "\n", - " permalink \n", - "0 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "1 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "2 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "3 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "4 https://jan2024.archive.ensembl.org/Homo_sapie... " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions = versions[[\"id\", \"release\", \"possible_replacement\", \"permalink\"]]\n", "versions.rename(\n", @@ -2421,277 +566,25 @@ "metadata": {}, "source": [ "### Final cleanup\n", - "Unfilled \"possible_replacement\" entries should be changed from NaN to empty lists. 
\n", - "\n", - "\"possible_replacement\" entries that have data in them exist as a list of dicts, and need to have the Ensembl IDs pulled out of them as a list of strings. \n", + "\"possible_replacement\" entries will either be an empty list or a list of dictionaries. Entries that have data in them need to have the Ensembl IDs pulled out of them as a list of strings.\n", "\n", "Remove unneeded columns. " ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "d0c07b7a", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_idnamealiassummarysymboltype_of_geneensembl_releasepossible_replacementpermalink
0ENSG00000164972sperm microtubule inner protein 6[SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]This gene encodes a nuclear- or perinuclear-lo...SPMIP6protein-coding111[]https://jan2024.archive.ensembl.org/Homo_sapie...
1ENSG00000169105carbohydrate sulfotransferase 14[ATCS, EDSMC1, HNK1ST, D4ST1]This gene encodes a member of the HNK-1 family...CHST14protein-coding111[]https://jan2024.archive.ensembl.org/Homo_sapie...
2ENSG00000255136TPBGL antisense RNA 1[]NaNTPBGL-AS1NaN111[]https://jan2024.archive.ensembl.org/Homo_sapie...
3ENSG00000105499phospholipase A2 group IVC[CPLA2-gamma]This gene encodes a protein which is a member ...PLA2G4Cprotein-coding111[]https://jan2024.archive.ensembl.org/Homo_sapie...
4ENSG00000104611SH2 domain containing 4A[PPP1R38, SH2A]Enables phosphatase binding activity. Located ...SH2D4Aprotein-coding111[]https://jan2024.archive.ensembl.org/Homo_sapie...
..............................
37447ENSG00000267206lipocalin 6[hLcn5, LCN5, UNQ643]Predicted to enable small molecule binding act...LCN6protein-coding111[]https://jan2024.archive.ensembl.org/Homo_sapie...
37448ENSG00000276387killer cell immunoglobulin like receptor, two ...[NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...Killer cell immunoglobulin-like receptors (KIR...KIR2DL1protein-coding111[]https://jan2024.archive.ensembl.org/Homo_sapie...
37449ENSG00000276518putative killer cell immunoglobulin-like recep...[LOC128966730, LOC128966732, LOC128966731, LOC...NaNLOC128966722protein-coding111[]https://jan2024.archive.ensembl.org/Homo_sapie...
37450ENSG00000230373golgin A6 family like 3, pseudogene[GOLGA6L21P, GOLGA6L17P, GOLGA6L3]NaNGOLGA6L3Ppseudo111[]https://jan2024.archive.ensembl.org/Homo_sapie...
37451ENSG00000249738uncharacterized LOC285626[LOC105377683]NaNLOC285626ncRNA111[]https://jan2024.archive.ensembl.org/Homo_sapie...
\n", - "

37452 rows × 9 columns

\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id name \\\n", - "0 ENSG00000164972 sperm microtubule inner protein 6 \n", - "1 ENSG00000169105 carbohydrate sulfotransferase 14 \n", - "2 ENSG00000255136 TPBGL antisense RNA 1 \n", - "3 ENSG00000105499 phospholipase A2 group IVC \n", - "4 ENSG00000104611 SH2 domain containing 4A \n", - "... ... ... \n", - "37447 ENSG00000267206 lipocalin 6 \n", - "37448 ENSG00000276387 killer cell immunoglobulin like receptor, two ... \n", - "37449 ENSG00000276518 putative killer cell immunoglobulin-like recep... \n", - "37450 ENSG00000230373 golgin A6 family like 3, pseudogene \n", - "37451 ENSG00000249738 uncharacterized LOC285626 \n", - "\n", - " alias \\\n", - "0 [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22] \n", - "1 [ATCS, EDSMC1, HNK1ST, D4ST1] \n", - "2 [] \n", - "3 [CPLA2-gamma] \n", - "4 [PPP1R38, SH2A] \n", - "... ... \n", - "37447 [hLcn5, LCN5, UNQ643] \n", - "37448 [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C... \n", - "37449 [LOC128966730, LOC128966732, LOC128966731, LOC... \n", - "37450 [GOLGA6L21P, GOLGA6L17P, GOLGA6L3] \n", - "37451 [LOC105377683] \n", - "\n", - " summary symbol \\\n", - "0 This gene encodes a nuclear- or perinuclear-lo... SPMIP6 \n", - "1 This gene encodes a member of the HNK-1 family... CHST14 \n", - "2 NaN TPBGL-AS1 \n", - "3 This gene encodes a protein which is a member ... PLA2G4C \n", - "4 Enables phosphatase binding activity. Located ... SH2D4A \n", - "... ... ... \n", - "37447 Predicted to enable small molecule binding act... LCN6 \n", - "37448 Killer cell immunoglobulin-like receptors (KIR... KIR2DL1 \n", - "37449 NaN LOC128966722 \n", - "37450 NaN GOLGA6L3P \n", - "37451 NaN LOC285626 \n", - "\n", - " type_of_gene ensembl_release possible_replacement \\\n", - "0 protein-coding 111 [] \n", - "1 protein-coding 111 [] \n", - "2 NaN 111 [] \n", - "3 protein-coding 111 [] \n", - "4 protein-coding 111 [] \n", - "... ... ... ... 
\n", - "37447 protein-coding 111 [] \n", - "37448 protein-coding 111 [] \n", - "37449 protein-coding 111 [] \n", - "37450 pseudo 111 [] \n", - "37451 ncRNA 111 [] \n", - "\n", - " permalink \n", - "0 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "1 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "2 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "3 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "4 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "... ... \n", - "37447 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "37448 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "37449 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "37450 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "37451 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "\n", - "[37452 rows x 9 columns]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "for row in gene_table_merged.loc[\n", - " gene_table_merged[\"possible_replacement\"].isnull(), \"possible_replacement\"\n", - "].index:\n", - " gene_table_merged.at[row, \"possible_replacement\"] = []\n", + "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n", + " \"possible_replacement\"\n", + "].apply(lambda pr: pr if pr is np.NaN or len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n", "\n", - "gene_table_merged[\"possible_replacement\"] = gene_table_merged.apply(\n", - " lambda row: (\n", - " row[\"possible_replacement\"]\n", - " if len(row[\"possible_replacement\"]) == 0\n", - " else [x[\"stable_id\"] for x in row[\"possible_replacement\"]]\n", - " ),\n", - " axis=1,\n", - ")\n", + "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n", + " \"possible_replacement\"\n", + "].apply(preprocessing_utils.standardize_list_item)\n", "\n", "gene_table_merged = gene_table_merged[\n", " [\n", @@ -2721,7 +614,7 @@ }, { "cell_type": "code", - "execution_count": 26, 
+ "execution_count": null, "id": "f2287922", "metadata": {}, "outputs": [], @@ -2736,7 +629,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "agora-data-tools-ywFp1Gf9", "language": "python", "name": "python3" }, diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py index fbc1a2dc..e85f441a 100644 --- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py +++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py @@ -1,12 +1,31 @@ +""" +This file includes several helper functions that are called from one or more of the pre-processing +notebooks. This helps avoid code duplication and/or keeps the notebooks cleaner and more straightforward. +Current public-facing functions: + manual_query_biomart - queries Biomart with a GET request + query_ensembl_version_api - queries the Ensembl API for Ensembl ID version info + r_query_biomart - queries Biomart using rpy2 + filter_hasgs - removes human alternative sequence genes from a data frame + get_all_adt_ensembl_ids - gets the Ensembl IDs in all of the files ingested by ADT + standardize_list_item - turn values of varying types into a list. Used for fixing the "alias" and + "possible_replacement" fields of gene_metadata. 
+ merge_duplicate_ensembl_ids - collapse rows with the same Ensembl ID but different gene symbols + or aliases into one row +""" + import pandas as pd +import numpy as np import requests import re +import synapseclient from io import StringIO -from typing import Union +from typing import Union, Dict, List, Set +import agoradatatools.etl.utils as utils +import agoradatatools.etl.extract as extract def manual_query_biomart( - attributes: list[str], filters: dict[Union[list, set]] + attributes: List[str], filters: Dict[str, Union[List[str], Set[str]]] ) -> pd.DataFrame: """Performs a GET request to the Biomart web service and returns the response. There is no canonical Python library to query Biomart and no Python library at all to query on @@ -49,6 +68,63 @@ def manual_query_biomart( return result +def query_ensembl_version_api(ensembl_ids: List[str]) -> pd.DataFrame: + """ + Queries the Ensembl API via POST to get version information for each Ensembl ID. The API can only + process 1000 IDs at a time so the query is broken into batches of 1000. If a request fails, this + function will try again up to 5 times on that batch before quitting and raising an error. 
+ + Args: + ensembl_ids: a list of Ensembl IDs to query + + Returns: + a pandas data frame with Ensembl IDs, version, and release information + """ + url = "https://rest.ensembl.org/archive/id" + headers = {"Content-Type": "application/json", "Accept": "application/json"} + + # We can only query 1000 genes at a time + batch_ind = range(0, len(ensembl_ids), 1000) + results = [] + + for B in batch_ind: + end = min(len(ensembl_ids), B + 1000) + print("Querying genes " + str(B + 1) + " - " + str(end)) + + request_data = '{ "id" : ' + str(ensembl_ids[B:end]) + " }" + request_data = request_data.replace("'", '"') + + ok = False + tries = 0 + + while tries < 5 and not ok: + try: + res = requests.post(url, headers=headers, data=request_data) + ok = res.ok + except requests.RequestException as ex: + print(ex) + ok = False + + tries = tries + 1 + + if not ok and tries == 5: + res.raise_for_status() + elif not ok: + print( + "Error retrieving Ensembl versions for genes " + + str(B + 1) + + " - " + + str(end) + + ". Trying again..." + ) + else: + results = results + res.json() + break + + versions = pd.json_normalize(results) + return versions + + def filter_hasgs(df: pd.DataFrame, chromosome_name_column: str) -> pd.DataFrame: """Filters human alternative sequence genes (HASGs) from a data frame by using a regex to identify them for removal. Valid genes will either have a numerical chromosome name or have @@ -87,6 +163,7 @@ def r_query_biomart() -> pd.DataFrame: "chromosome_name", and "hgnc_symbol" retrived from BioMart """ from rpy2.robjects import r + from rpy2.rinterface_lib.embedded import RRuntimeError r( 'if (!require("BiocManager", character.only = TRUE)) { install.packages("BiocManager") }' @@ -96,7 +173,7 @@ def r_query_biomart() -> pd.DataFrame: r.library("biomaRt") # Sometimes Biomart doesn't respond and the command needs to be sent again. Try up to 5 times. 
- for T in range(5): + for _ in range(5): try: mart = r.useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl") ensembl_ids = r.getBM( @@ -105,7 +182,8 @@ def r_query_biomart() -> pd.DataFrame: useCache=False, ) - except: + except RRuntimeError as ex: + print(ex) print("Trying again...") ensembl_ids = None else: @@ -124,3 +202,213 @@ def r_query_biomart() -> pd.DataFrame: } ) return ensembl_ids_df + + +def get_all_adt_ensembl_ids( + config_filename: str, exclude_files: List[str] = [], token: str = None +) -> List[str]: + """ + Loops through an ADT config file, finds all data files that are ingested by ADT, and returns a + list containing all Ensembl IDs present in those files. Specific files can be excluded from the + list with the exclude_files argument. + + Args: + config_filename: full or relative file path to the ADT config.yaml file + exclude_files: list of file names to exclude when searching files for IDs. These names must + match what is in "name" field of the file specification in the config.yaml + file. Typical values are "gene_metadata" and "druggability". + token: a Synapse auth token, or None if the user has Synapse credentials saved. + + Returns: + a list of unique Ensembl IDs that exist in at least one data set ingested by ADT + """ + syn = utils._login_to_synapse(token=token) + config = utils._get_config(config_path=config_filename) + datasets = config["datasets"] + + # Get all unique files in the config since some files are listed multiple times by being + # included in multiple data sets. 
Also fetch all column rename values for standardizing Ensembl + # ID column names + unique_files = {} + column_renames = {} + + for dataset in datasets: + dataset_name = list(dataset.keys())[0] + + for file in dataset[dataset_name]["files"]: + # Make the Synapse ID the key so that "update" will only add a new item if the ID doesn't + # already exist + unique_files.update({file["id"]: file}) + + # Only some data sets have column rename values + if "column_rename" in dataset[dataset_name].keys(): + column_renames.update(dataset[dataset_name]["column_rename"]) + + # Print all the files we found + print("Found " + str(len(unique_files)) + " files:") + [print(x["name"] + ":\t" + x["id"]) for x in unique_files.values()] + print("") + + # Create a list of all Ensembl IDs in all files + file_ensembl_list = [] + + for entity in unique_files.values(): + # Ignore json files, which are post-processed and not what we're interested in. + # Also ignore any other files specified by 'exclude_files', which likely includes + # "gene_metadata" and "druggability". + if entity["format"] == "json" or entity["name"] in exclude_files: + continue + + file_ensembl_ids = _extract_ensembl_ids(syn, entity, column_renames) + file_ensembl_list = file_ensembl_list + file_ensembl_ids + + # Remove duplicate values + return list(set(file_ensembl_list)) + + +def _extract_ensembl_ids( + syn: synapseclient.Synapse, entity: Dict[str, str], column_renames: Dict[str, str] +) -> List[str]: + """ + Internal function used by get_all_adt_ensembl_ids to extract a list of Ensembl IDs from a file. + The file is downloaded from Synapse and read in as a pandas data frame, column names are renamed + if necessary to ensure that most Ensembl ID columns are renamed to "ensembl_gene_id", and all + Ensembl IDs from relevant columns are put in a list.
+ + Note that the "networks" data set contains two columns with Ensembl IDs (genea_ensembl_gene_id + and geneb_ensembl_gene_id) which are not renamed, so this function searches for columns named + with either of those two names or with "ensembl_gene_id" when finding Ensembl ID columns. + + Note that this function depends on the column_rename specifications in the config to accurately + convert all Ensembl ID-containing columns in all files except networks to "ensembl_gene_id", so + that we don't have to hard-code a list of all possible column names. This assumption is valid + for the current set of data files and will likely remain valid for future data, but a warning + is printed out if no matching column is found, just in case. + + Args: + syn: a synapseclient object which has already been initialized and successfully logged in + entity: a dictionary containing keys "id", "name", and "format" + column_renames: a dictionary containing all column rename pairs from the config file, where + key = old column name, and value = new column name + + Returns: + a list of unique Ensembl IDs in the file, or an empty list if no Ensembl ID column found + """ + df = extract.get_entity_as_df(syn_id=entity["id"], source=entity["format"], syn=syn) + + # Use column_renames from the config to convert most Ensembl ID column names to "ensembl_gene_id".
+ df = utils.standardize_column_names(df=df) + df = utils.rename_columns(df=df, column_map=column_renames) + + # Exception to the above comment: the 'networks' file has two ID columns (genea_ and geneb_ ensembl_gene_id) + # which do not get renamed + possible_col_names = [ + "ensembl_gene_id", + "genea_ensembl_gene_id", + "geneb_ensembl_gene_id", + ] + + file_ensembl_ids = [] + + # The data may have zero, one, or more than one (in the case of 'networks') column of Ensembl IDs + for C in possible_col_names: + if C in df.columns: + file_ensembl_ids = file_ensembl_ids + df[C].tolist() + + # Print any warnings and remove any NA values from the list before returning + if len(file_ensembl_ids) == 0: + print("WARNING: no Ensembl ID column found for " + entity["name"] + "!") + + if "n/A" in file_ensembl_ids: + print(entity["name"] + " has an n/A Ensembl ID") + file_ensembl_ids.remove("n/A") + + if np.NaN in file_ensembl_ids: + print( + entity["name"] + + " has " + + str(file_ensembl_ids.count(np.NaN)) + + " NaN Ensembl IDs" + ) + file_ensembl_ids = [x for x in file_ensembl_ids if x is not np.NaN] + + # Remove duplicate values + return list(set(file_ensembl_ids)) + + +def standardize_list_item(item: Union[str, List[str]]) -> List[str]: + """ + For the gene_metadata data frame, some queries return columns that are a mixture of None/NaN, + a single string, and a list of strings. This function standardizes the column values so that + everything is a list, either empty (if NaN) or a list of strings. The final list is sorted + alphabetically to make comparison between different versions of the file easier. + + This function is intended to be called as part of an apply() statement on a pandas data frame + column. + + Args: + item: either a string, a list of strings, or np.NaN + + Returns: + A list of strings or an empty list. The list is sorted alphabetically. 
+ """ + # Convert NaN to an empty list + if item is np.NaN: + return [] + + # Convert plain strings to a list of one string + if isinstance(item, str): + return [item] + + if isinstance(item, list): + # Get unique values only and sort them + item = list(set(item)) + item.sort() + + # No extra handling necessary for other data types + + return item + + +def merge_duplicate_ensembl_ids(gene_table: pd.DataFrame) -> pd.DataFrame: + """ + MyGene queries sometimes return multiple rows with the same Ensembl ID but different symbols + or other information. This usually happens when a single Ensembl ID maps to multiple Entrez IDs + in the NCBI database. There's not a good way to reconcile this, so for every set of rows with the + same Ensembl ID, we designate the first entry in the set as the main row. The gene symbols of the + remaining rows in the set are then added as aliases to the "main" row, and all of their aliases + are added to the main row alias field as well. All rows in the set except the main row are then + deleted from the data frame, leaving a single row for that Ensembl ID with all symbols and aliases + from the duplicate rows merged into the alias field. + + Args: + gene_table: a pandas DataFrame containing gene metadata results from MyGene + + Returns: + a data frame with duplicate rows removed + """ + dupes = gene_table["ensembl_gene_id"].duplicated() + dupe_ids = gene_table.loc[dupes, "ensembl_gene_id"].drop_duplicates().tolist() + + for ens_id in dupe_ids: + rows = gene_table.loc[gene_table["ensembl_gene_id"] == ens_id] + + # Add duplicate rows' symbols to the alias field of the first row, then add duplicate rows' + # aliases to the first row's alias field. All other information in the duplicate rows is + # discarded.
+ new_alias = rows.iloc[0]["alias"] + + for row in rows.index[1:]: + new_alias.append(rows.loc[row, "symbol"]) + new_alias = new_alias + rows.loc[row, "alias"] + + # Remove any duplicate aliases and sort them + new_alias = list(set(new_alias)) + new_alias.sort() + + # Set the new aliases to the first row in this group and remove all duplicate rows from the + # data frame + gene_table.at[rows.index[0], "alias"] = new_alias + gene_table = gene_table.drop(rows.index[1:]) + + return gene_table diff --git a/tests/test_assets/.DS_Store b/tests/test_assets/.DS_Store deleted file mode 100644 index 46b71f5c..00000000 Binary files a/tests/test_assets/.DS_Store and /dev/null differ