Sage-Bionetworks · jaclynbeck-sage · Nov 25, 2024 · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024
@@ -133,6 +133,7 @@ dmypy.json
 
 # local generated files
 staging/*
+data_analysis/*/output/*
 
 #test staging location
 test_staging_dir/
@@ -141,3 +142,5 @@ test_staging_dir/
 dev_config.yaml
 
 .vscode/
+.ipynb_checkpoints/
+.Rhistory
@@ -20,16 +20,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "from unipressed import IdMappingClient\n",
     "import time\n",
     "import pandas as pd\n",
-    "import numpy as np\n",
-    "import agoradatatools.etl.utils as utils\n",
-    "import agoradatatools.etl.extract as extract\n",
+    "import preprocessing_utils\n",
     "\n",
     "config_filename = \"../../../../config.yaml\""
    ]
@@ -43,157 +41,19 @@
     "Loop through all data sets in the config file to get all Ensembl IDs used in every data set. NOTE: In the future, it would be simpler to just load the `gene_metadata` data set once druggability genes are removed from it, rather than looping through all of these files. "
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'genes_biodomains': ('syn44151254.5', 'csv'),\n",
-       " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n",
-       " 'proteomics': ('syn18689335.3', 'csv'),\n",
-       " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n",
-       " 'proteomics_srm': ('syn52579640.4', 'csv'),\n",
-       " 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n",
-       " 'metabolomics': ('syn26064497.1', 'feather'),\n",
-       " 'igap': ('syn12514826.5', 'csv'),\n",
-       " 'eqtl': ('syn12514912.3', 'csv'),\n",
-       " 'diff_exp_data': ('syn27211942.1', 'tsv'),\n",
-       " 'target_list': ('syn12540368.47', 'csv'),\n",
-       " 'median_expression': ('syn27211878.2', 'csv'),\n",
-       " 'tep_adi_info': ('syn51942280.2', 'csv'),\n",
-       " 'team_info': ('syn12615624.18', 'csv'),\n",
-       " 'team_member_info': ('syn12615633.18', 'csv'),\n",
-       " 'overall_scores': ('syn25575156.13', 'table'),\n",
-       " 'networks': ('syn11685347.1', 'csv')}"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "config = utils._get_config(config_path=config_filename)\n",
-    "datasets = config[\"datasets\"]\n",
-    "\n",
-    "files = {}\n",
-    "\n",
-    "for dataset in datasets:\n",
-    "    dataset_name = list(dataset.keys())[0]\n",
-    "\n",
-    "    for entity in dataset[dataset_name][\"files\"]:\n",
-    "        entity_id = entity[\"id\"]\n",
-    "        entity_format = entity[\"format\"]\n",
-    "        entity_name = entity[\"name\"]\n",
-    "\n",
-    "        # Ignore json files, which are post-processed and not what we're interested in.\n",
-    "        # Also ignore \"druggability\" since we want to exclude druggability-only genes, and \n",
-    "        # \"gene_metadata\" which includes druggability genes.\n",
-    "        if entity_format != \"json\" and entity_name not in [\"druggability\", \"gene_metadata\"]:\n",
-    "            files[entity_name] = (entity_id, entity_format)\n",
-    "\n",
-    "# There are some duplicate synID's in this list but that doesn't really matter\n",
-    "files"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### We should now have a list of all raw data files ingested. Get each one and create a list of IDs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "UPGRADE AVAILABLE\n",
-      "\n",
-      "A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n",
-      "    pip install --upgrade synapseclient\n",
-      "\n",
-      "Python Synapse Client version 4.6.0 release notes\n",
-      "\n",
-      "https://python-docs.synapse.org/news/\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Welcome, Jaclyn Beck!\n",
-      "\n",
-      "INFO: 2024-11-15 11:43:36 | synapseclient_default | Welcome, Jaclyn Beck!\n",
-      "\n",
-      "genes_biodomains has an NaN Ensembl ID\n",
-      "WARNING: no Ensembl ID column found for team_info!\n",
-      "WARNING: no Ensembl ID column found for team_member_info!\n"
-     ]
-    }
-   ],
-   "source": [
-    "syn = utils._login_to_synapse(token=None)  # Assumes you have already logged in with a valid token\n",
-    "\n",
-    "# The various column names used to store Ensembl IDs in the files\n",
-    "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n",
-    "file_ensembl_list = []\n",
-    "\n",
-    "for file in files.keys():\n",
-    "    df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n",
-    "\n",
-    "    file_ensembl_ids = None\n",
-    "\n",
-    "    for C in col_names:\n",
-    "        if C in df.columns:\n",
-    "            file_ensembl_ids = df[C]\n",
-    "\n",
-    "    # networks file is a special case\n",
-    "    if file == \"networks\":\n",
-    "        file_ensembl_ids = pd.melt(\n",
-    "            df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n",
-    "        )[\"value\"]\n",
-    "\n",
-    "    if file_ensembl_ids is not None:\n",
-    "        file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n",
-    "        if \"n/A\" in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an n/A Ensembl ID\")\n",
-    "            file_ensembl_list.remove(\"n/A\")\n",
-    "        if np.NaN in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an NaN Ensembl ID\")\n",
-    "    else:\n",
-    "        print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "35858\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "file_ensembl_list = list(set(file_ensembl_list))\n",
-    "\n",
-    "# NaNs will be floats, so this removes them. Using np.isnan() on strings throws an error.\n",
-    "ensembl_ids = [x for x in file_ensembl_list if isinstance(x, str)]\n",
-    "\n",
-    "print(len(ensembl_ids))"
+    "# Both excluded files bring in druggability-only genes, which are not wanted here.\n",
+    "ensembl_ids = preprocessing_utils.get_all_adt_ensembl_ids(\n",
+    "    config_filename=config_filename,\n",
+    "    exclude_files=[\"gene_metadata\", \"druggability\"],\n",
+    "    token=None,\n",
+    ")\n",
+    "print(f\"\\n{len(ensembl_ids)} Ensembl IDs found.\")"
    ]
   },
   {