diff --git a/.gitignore b/.gitignore
index 20c38245..909837f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,6 +133,7 @@ dmypy.json
 
 # local generated files
 staging/*
+data_analysis/*/output/*
 
 #test staging location
 test_staging_dir/
@@ -141,3 +142,5 @@ test_staging_dir/
 dev_config.yaml
 
 .vscode/
+.ipynb_checkpoints/
+.Rhistory
diff --git a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb
index ba477beb..2b369886 100644
--- a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb
+++ b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb
@@ -20,16 +20,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "from unipressed import IdMappingClient\n",
     "import time\n",
     "import pandas as pd\n",
-    "import numpy as np\n",
-    "import agoradatatools.etl.utils as utils\n",
-    "import agoradatatools.etl.extract as extract\n",
+    "import preprocessing_utils\n",
     "\n",
     "config_filename = \"../../../../config.yaml\""
    ]
@@ -43,157 +41,19 @@
     "Loop through all data sets in the config file to get all Ensembl IDs used in every data set. NOTE: In the future, it would be simpler to just load the `gene_metadata` data set once druggability genes are removed from it, rather than looping through all of these files. "
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'genes_biodomains': ('syn44151254.5', 'csv'),\n",
-       " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n",
-       " 'proteomics': ('syn18689335.3', 'csv'),\n",
-       " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n",
-       " 'proteomics_srm': ('syn52579640.4', 'csv'),\n",
-       " 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n",
-       " 'metabolomics': ('syn26064497.1', 'feather'),\n",
-       " 'igap': ('syn12514826.5', 'csv'),\n",
-       " 'eqtl': ('syn12514912.3', 'csv'),\n",
-       " 'diff_exp_data': ('syn27211942.1', 'tsv'),\n",
-       " 'target_list': ('syn12540368.47', 'csv'),\n",
-       " 'median_expression': ('syn27211878.2', 'csv'),\n",
-       " 'tep_adi_info': ('syn51942280.2', 'csv'),\n",
-       " 'team_info': ('syn12615624.18', 'csv'),\n",
-       " 'team_member_info': ('syn12615633.18', 'csv'),\n",
-       " 'overall_scores': ('syn25575156.13', 'table'),\n",
-       " 'networks': ('syn11685347.1', 'csv')}"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "config = utils._get_config(config_path=config_filename)\n",
-    "datasets = config[\"datasets\"]\n",
-    "\n",
-    "files = {}\n",
-    "\n",
-    "for dataset in datasets:\n",
-    "    dataset_name = list(dataset.keys())[0]\n",
-    "\n",
-    "    for entity in dataset[dataset_name][\"files\"]:\n",
-    "        entity_id = entity[\"id\"]\n",
-    "        entity_format = entity[\"format\"]\n",
-    "        entity_name = entity[\"name\"]\n",
-    "\n",
-    "        # Ignore json files, which are post-processed and not what we're interested in.\n",
-    "        # Also ignore \"druggability\" since we want to exclude druggability-only genes, and \n",
-    "        # \"gene_metadata\" which includes druggability genes.\n",
-    "        if entity_format != \"json\" and entity_name not in [\"druggability\", \"gene_metadata\"]:\n",
-    "            files[entity_name] = (entity_id, entity_format)\n",
-    "\n",
-    "# There are some duplicate synID's in this list but that doesn't really matter\n",
-    "files"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### We should now have a list of all raw data files ingested. Get each one and create a list of IDs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "UPGRADE AVAILABLE\n",
-      "\n",
-      "A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n",
-      "    pip install --upgrade synapseclient\n",
-      "\n",
-      "Python Synapse Client version 4.6.0 release notes\n",
-      "\n",
-      "https://python-docs.synapse.org/news/\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Welcome, Jaclyn Beck!\n",
-      "\n",
-      "INFO: 2024-11-15 11:43:36 | synapseclient_default | Welcome, Jaclyn Beck!\n",
-      "\n",
-      "genes_biodomains has an NaN Ensembl ID\n",
-      "WARNING: no Ensembl ID column found for team_info!\n",
-      "WARNING: no Ensembl ID column found for team_member_info!\n"
-     ]
-    }
-   ],
-   "source": [
-    "syn = utils._login_to_synapse(token=None)  # Assumes you have already logged in with a valid token\n",
-    "\n",
-    "# The various column names used to store Ensembl IDs in the files\n",
-    "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n",
-    "file_ensembl_list = []\n",
-    "\n",
-    "for file in files.keys():\n",
-    "    df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n",
-    "\n",
-    "    file_ensembl_ids = None\n",
-    "\n",
-    "    for C in col_names:\n",
-    "        if C in df.columns:\n",
-    "            file_ensembl_ids = df[C]\n",
-    "\n",
-    "    # networks file is a special case\n",
-    "    if file == \"networks\":\n",
-    "        file_ensembl_ids = pd.melt(\n",
-    "            df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n",
-    "        )[\"value\"]\n",
-    "\n",
-    "    if file_ensembl_ids is not None:\n",
-    "        file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n",
-    "        if \"n/A\" in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an n/A Ensembl ID\")\n",
-    "            file_ensembl_list.remove(\"n/A\")\n",
-    "        if np.NaN in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an NaN Ensembl ID\")\n",
-    "    else:\n",
-    "        print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "35858\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "file_ensembl_list = list(set(file_ensembl_list))\n",
-    "\n",
-    "# NaNs will be floats, so this removes them. Using np.isnan() on strings throws an error.\n",
-    "ensembl_ids = [x for x in file_ensembl_list if isinstance(x, str)]\n",
-    "\n",
-    "print(len(ensembl_ids))"
+    "ensembl_ids = preprocessing_utils.get_all_adt_ensembl_ids(\n",
+    "    config_filename=config_filename,\n",
+    "    exclude_files=[\"gene_metadata\", \"druggability\"],\n",
+    "    token=None,\n",
+    ")\n",
+    "print(\"\")\n",
+    "print(str(len(ensembl_ids)) + \" Ensembl IDs found.\")"
    ]
   },
   {
diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
index 9ef8fedf..7550d17c 100644
--- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
+++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
@@ -108,178 +108,43 @@
    "source": [
     "## Get Ensembl IDs from data sets that will be processed by agora-data-tools\n",
     "\n",
-    "Loop through all data sets in the config file to get all Ensembl IDs used in every data set."
+    "Loop through all data sets in the config file to get all Ensembl IDs used in every data set. Exclude `gene_metadata` since that's the file we are building, and `druggability` since that data is deprecated."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "a3fdbeec",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'genes_biodomains': ('syn44151254.5', 'csv'),\n",
-       " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n",
-       " 'proteomics': ('syn18689335.3', 'csv'),\n",
-       " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n",
-       " 'proteomics_srm': ('syn52579640.4', 'csv'),\n",
-       " 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n",
-       " 'metabolomics': ('syn26064497.1', 'feather'),\n",
-       " 'igap': ('syn12514826.5', 'csv'),\n",
-       " 'eqtl': ('syn12514912.3', 'csv'),\n",
-       " 'diff_exp_data': ('syn27211942.1', 'tsv'),\n",
-       " 'target_list': ('syn12540368.47', 'csv'),\n",
-       " 'median_expression': ('syn27211878.2', 'csv'),\n",
-       " 'druggability': ('syn13363443.11', 'csv'),\n",
-       " 'tep_adi_info': ('syn51942280.2', 'csv'),\n",
-       " 'team_info': ('syn12615624.18', 'csv'),\n",
-       " 'team_member_info': ('syn12615633.18', 'csv'),\n",
-       " 'overall_scores': ('syn25575156.13', 'table'),\n",
-       " 'networks': ('syn11685347.1', 'csv')}"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "config = utils._get_config(config_path=config_filename)\n",
-    "datasets = config[\"datasets\"]\n",
-    "\n",
-    "files = {}\n",
-    "\n",
-    "for dataset in datasets:\n",
-    "    dataset_name = list(dataset.keys())[0]\n",
-    "\n",
-    "    for entity in dataset[dataset_name][\"files\"]:\n",
-    "        entity_id = entity[\"id\"]\n",
-    "        entity_format = entity[\"format\"]\n",
-    "        entity_name = entity[\"name\"]\n",
-    "\n",
-    "        # Ignore json files, which are post-processed and not what we're interested in.\n",
-    "        # Also ignore \"gene_metadata\" since that's the file we're making here.\n",
-    "        if entity_format != \"json\" and entity_name != \"gene_metadata\":\n",
-    "            files[entity_name] = (entity_id, entity_format)\n",
-    "\n",
-    "# There are some duplicate synID's in this list but that doesn't really matter\n",
-    "files"
+    "file_ensembl_list = preprocessing_utils.get_all_adt_ensembl_ids(\n",
+    "    config_filename=config_filename,\n",
+    "    exclude_files=[\"gene_metadata\", \"druggability\"],\n",
+    "    token=None,\n",
+    ")\n",
+    "print(\"\")\n",
+    "print(str(len(file_ensembl_list)) + \" Ensembl IDs found.\")\n",
+    "print(file_ensembl_list[0:5])"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "8f1a2120",
+   "id": "5fa76bfb",
    "metadata": {},
    "source": [
-    "### We should now have a list of all raw data files ingested. Get each one and create a list of IDs."
+    "Create a data frame with these IDs so it can be merged with the MyGene query results below."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "9843689d",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "UPGRADE AVAILABLE\n",
-      "\n",
-      "A more recent version of the Synapse Client (4.2.0) is available. Your version (4.0.0) can be upgraded by typing:\n",
-      "    pip install --upgrade synapseclient\n",
-      "\n",
-      "Python Synapse Client version 4.2.0 release notes\n",
-      "\n",
-      "https://python-docs.synapse.org/news/\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Welcome, Jaclyn Beck!\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:synapseclient_default:Welcome, Jaclyn Beck!\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "genes_biodomains has an NaN Ensembl ID\n",
-      "WARNING: no Ensembl ID column found for team_info!\n",
-      "WARNING: no Ensembl ID column found for team_member_info!\n"
-     ]
-    }
-   ],
-   "source": [
-    "syn = utils._login_to_synapse(\n",
-    "    token=None\n",
-    ")  # Assumes you have already logged in with a valid token\n",
-    "\n",
-    "# The various column names used to store Ensembl IDs in the files\n",
-    "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n",
-    "file_ensembl_list = []\n",
-    "\n",
-    "for file in files.keys():\n",
-    "    df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n",
-    "\n",
-    "    file_ensembl_ids = None\n",
-    "\n",
-    "    for C in col_names:\n",
-    "        if C in df.columns:\n",
-    "            file_ensembl_ids = df[C]\n",
-    "\n",
-    "    # networks file is a special case\n",
-    "    if file == \"networks\":\n",
-    "        file_ensembl_ids = pd.melt(\n",
-    "            df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n",
-    "        )[\"value\"]\n",
-    "\n",
-    "    if file_ensembl_ids is not None:\n",
-    "        file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n",
-    "        if \"n/A\" in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an n/A Ensembl ID\")\n",
-    "            file_ensembl_list.remove(\"n/A\")\n",
-    "        if np.NaN in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an NaN Ensembl ID\")\n",
-    "    else:\n",
-    "        print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "f1303e5b",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "37452\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "file_ensembl_list = list(set(file_ensembl_list))\n",
-    "\n",
     "ensembl_ids_df = pd.DataFrame({\"ensembl_gene_id\": file_ensembl_list})\n",
     "\n",
     "\"\"\" Removed due to no longer getting genes from BioMart, but saving code\n",
@@ -300,7 +165,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "4e7a37c8",
    "metadata": {},
    "outputs": [],
@@ -321,243 +186,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "7ebd03d4",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:biothings.client:querying 1-1000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 1001-2000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 2001-3000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 3001-4000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 4001-5000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 5001-6000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 6001-7000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 7001-8000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 8001-9000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 9001-10000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 10001-11000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 11001-12000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 12001-13000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 13001-14000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 14001-15000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 15001-16000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 16001-17000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 17001-18000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 18001-19000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 19001-20000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 20001-21000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 21001-22000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 22001-23000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 23001-24000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 24001-25000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 25001-26000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 26001-27000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 27001-28000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 28001-29000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 29001-30000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 30001-31000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 31001-32000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 32001-33000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 33001-34000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 34001-35000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 35001-36000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 36001-37000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 37001-37452...\n",
-      "INFO:biothings.client:done.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000164972</th>\n",
-       "      <td>84688</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]</td>\n",
-       "      <td>sperm microtubule inner protein 6</td>\n",
-       "      <td>This gene encodes a nuclear- or perinuclear-lo...</td>\n",
-       "      <td>SPMIP6</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000169105</th>\n",
-       "      <td>113189</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[ATCS, D4ST1, EDSMC1, HNK1ST]</td>\n",
-       "      <td>carbohydrate sulfotransferase 14</td>\n",
-       "      <td>This gene encodes a member of the HNK-1 family...</td>\n",
-       "      <td>CHST14</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000255136</th>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL antisense RNA 1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL-AS1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000105499</th>\n",
-       "      <td>8605</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>CPLA2-gamma</td>\n",
-       "      <td>phospholipase A2 group IVC</td>\n",
-       "      <td>This gene encodes a protein which is a member ...</td>\n",
-       "      <td>PLA2G4C</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000104611</th>\n",
-       "      <td>63898</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[PPP1R38, SH2A]</td>\n",
-       "      <td>SH2 domain containing 4A</td>\n",
-       "      <td>Enables phosphatase binding activity. Located ...</td>\n",
-       "      <td>SH2D4A</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                             _id  _version  \\\n",
-       "ensembl_gene_id                              \n",
-       "ENSG00000164972            84688       2.0   \n",
-       "ENSG00000169105           113189       2.0   \n",
-       "ENSG00000255136  ENSG00000255136       1.0   \n",
-       "ENSG00000105499             8605       1.0   \n",
-       "ENSG00000104611            63898       1.0   \n",
-       "\n",
-       "                                                        alias  \\\n",
-       "ensembl_gene_id                                                 \n",
-       "ENSG00000164972  [C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]   \n",
-       "ENSG00000169105                 [ATCS, D4ST1, EDSMC1, HNK1ST]   \n",
-       "ENSG00000255136                                           NaN   \n",
-       "ENSG00000105499                                   CPLA2-gamma   \n",
-       "ENSG00000104611                               [PPP1R38, SH2A]   \n",
-       "\n",
-       "                                              name  \\\n",
-       "ensembl_gene_id                                      \n",
-       "ENSG00000164972  sperm microtubule inner protein 6   \n",
-       "ENSG00000169105   carbohydrate sulfotransferase 14   \n",
-       "ENSG00000255136              TPBGL antisense RNA 1   \n",
-       "ENSG00000105499         phospholipase A2 group IVC   \n",
-       "ENSG00000104611           SH2 domain containing 4A   \n",
-       "\n",
-       "                                                           summary     symbol  \\\n",
-       "ensembl_gene_id                                                                 \n",
-       "ENSG00000164972  This gene encodes a nuclear- or perinuclear-lo...     SPMIP6   \n",
-       "ENSG00000169105  This gene encodes a member of the HNK-1 family...     CHST14   \n",
-       "ENSG00000255136                                                NaN  TPBGL-AS1   \n",
-       "ENSG00000105499  This gene encodes a protein which is a member ...    PLA2G4C   \n",
-       "ENSG00000104611  Enables phosphatase binding activity. Located ...     SH2D4A   \n",
-       "\n",
-       "                   type_of_gene notfound  \n",
-       "ensembl_gene_id                           \n",
-       "ENSG00000164972  protein-coding      NaN  \n",
-       "ENSG00000169105  protein-coding      NaN  \n",
-       "ENSG00000255136             NaN      NaN  \n",
-       "ENSG00000105499  protein-coding      NaN  \n",
-       "ENSG00000104611  protein-coding      NaN  "
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "mg = mygene.MyGeneInfo()\n",
     "\n",
@@ -573,21 +207,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "23bb114e",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Annotations found for 36284 genes.\n",
-      "No annotations found for 1175 genes.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Annotations found for \" + str(sum(mygene_output[\"notfound\"].isna())) + \" genes.\")\n",
     "print(\n",
@@ -611,158 +236,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "186d8cb8",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(37459, 9)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>ENSG00000164972</td>\n",
-       "      <td>84688</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]</td>\n",
-       "      <td>sperm microtubule inner protein 6</td>\n",
-       "      <td>This gene encodes a nuclear- or perinuclear-lo...</td>\n",
-       "      <td>SPMIP6</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>ENSG00000169105</td>\n",
-       "      <td>113189</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[ATCS, D4ST1, EDSMC1, HNK1ST]</td>\n",
-       "      <td>carbohydrate sulfotransferase 14</td>\n",
-       "      <td>This gene encodes a member of the HNK-1 family...</td>\n",
-       "      <td>CHST14</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL antisense RNA 1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL-AS1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ENSG00000105499</td>\n",
-       "      <td>8605</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>CPLA2-gamma</td>\n",
-       "      <td>phospholipase A2 group IVC</td>\n",
-       "      <td>This gene encodes a protein which is a member ...</td>\n",
-       "      <td>PLA2G4C</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>ENSG00000104611</td>\n",
-       "      <td>63898</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[PPP1R38, SH2A]</td>\n",
-       "      <td>SH2 domain containing 4A</td>\n",
-       "      <td>Enables phosphatase binding activity. Located ...</td>\n",
-       "      <td>SH2D4A</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   ensembl_gene_id              _id  _version  \\\n",
-       "0  ENSG00000164972            84688       2.0   \n",
-       "1  ENSG00000169105           113189       2.0   \n",
-       "2  ENSG00000255136  ENSG00000255136       1.0   \n",
-       "3  ENSG00000105499             8605       1.0   \n",
-       "4  ENSG00000104611            63898       1.0   \n",
-       "\n",
-       "                                          alias  \\\n",
-       "0  [C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]   \n",
-       "1                 [ATCS, D4ST1, EDSMC1, HNK1ST]   \n",
-       "2                                           NaN   \n",
-       "3                                   CPLA2-gamma   \n",
-       "4                               [PPP1R38, SH2A]   \n",
-       "\n",
-       "                                name  \\\n",
-       "0  sperm microtubule inner protein 6   \n",
-       "1   carbohydrate sulfotransferase 14   \n",
-       "2              TPBGL antisense RNA 1   \n",
-       "3         phospholipase A2 group IVC   \n",
-       "4           SH2 domain containing 4A   \n",
-       "\n",
-       "                                             summary     symbol  \\\n",
-       "0  This gene encodes a nuclear- or perinuclear-lo...     SPMIP6   \n",
-       "1  This gene encodes a member of the HNK-1 family...     CHST14   \n",
-       "2                                                NaN  TPBGL-AS1   \n",
-       "3  This gene encodes a protein which is a member ...    PLA2G4C   \n",
-       "4  Enables phosphatase binding activity. Located ...     SH2D4A   \n",
-       "\n",
-       "     type_of_gene notfound  \n",
-       "0  protein-coding      NaN  \n",
-       "1  protein-coding      NaN  \n",
-       "2             NaN      NaN  \n",
-       "3  protein-coding      NaN  \n",
-       "4  protein-coding      NaN  "
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "gene_table_merged = pd.merge(\n",
     "    left=ensembl_ids_df,\n",
@@ -791,36 +270,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "id": "285c10d2",
    "metadata": {
     "scrolled": true
    },
    "outputs": [],
    "source": [
-    "# NaN or NULL alias values become empty lists\n",
-    "for row in gene_table_merged.loc[gene_table_merged[\"alias\"].isnull(), \"alias\"].index:\n",
-    "    gene_table_merged.at[row, \"alias\"] = []\n",
-    "\n",
-    "# Some alias values are a single string, not a list. Turn them into lists here.\n",
-    "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n",
-    "    lambda cell: cell if isinstance(cell, list) else [cell]\n",
-    ")\n",
-    "\n",
-    "\n",
-    "# Some alias values are lists of lists or have duplicate values\n",
-    "def flatten(row):\n",
-    "    flattened = []\n",
-    "    for item in row:\n",
-    "        if isinstance(item, list):\n",
-    "            flattened = flattened + item\n",
-    "        else:\n",
-    "            flattened.append(item)\n",
-    "    return flattened\n",
-    "\n",
-    "\n",
     "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n",
-    "    lambda row: list(set(flatten(row)))\n",
+    "    preprocessing_utils.standardize_list_item\n",
     ")"
    ]
   },
@@ -831,542 +289,42 @@
    "source": [
     "## Remove duplicate Ensembl IDs from the list. \n",
     "\n",
-    "Duplicates in the list typically have the same Ensembl ID but different gene symbols. This usually happens when a single Ensembl ID maps to multiple Entrez IDs in the NCBI database. There's not a good way to reconcile this, so we first check for entries whose `symbol` is something other than \"LOC#######\", and designate that entry as the main row. If there are multiple or zero entries meeting that criteria, we just use the first entry in the list for each ensembl ID and discard the rest, which is what the Agora front end does. The gene symbols of duplicate rows are then added as aliases to the matching unique row."
+    "Duplicates in the list typically have the same Ensembl ID but different gene symbols. This usually happens when a single Ensembl ID maps to multiple Entrez IDs in the NCBI database. For every set of duplicated rows with the same Ensembl ID, we remove all rows but the first row in the set, and the symbols and aliases of the removed rows get added to the \"alias\" field of the first row."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "bc63cc53",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>6011</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966722</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966722</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6012</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966732</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966732</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6013</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966730</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966730</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6014</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966731</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966731</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6015</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966733</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966733</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12139</th>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>100133220</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[GOLGA6L3]</td>\n",
-       "      <td>golgin A6 family like 3, pseudogene</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>GOLGA6L3P</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12140</th>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>642402</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[GOLGA6L21P]</td>\n",
-       "      <td>golgin A6 family like 17, pseudogene</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>GOLGA6L17P</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23329</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>124900571</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>killer cell immunoglobulin-like receptor 2DS1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC124900571</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23330</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>3802</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[NKAT1, KIR2DL3, NKAT, KIR221, CD158A, p58.1, ...</td>\n",
-       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
-       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
-       "      <td>KIR2DL1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>31304</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>285626</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>uncharacterized LOC285626</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC285626</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>31305</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>105377683</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>uncharacterized LOC105377683</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC105377683</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       ensembl_gene_id        _id  _version  \\\n",
-       "6011   ENSG00000276518  128966722       1.0   \n",
-       "6012   ENSG00000276518  128966732       1.0   \n",
-       "6013   ENSG00000276518  128966730       1.0   \n",
-       "6014   ENSG00000276518  128966731       1.0   \n",
-       "6015   ENSG00000276518  128966733       1.0   \n",
-       "12139  ENSG00000230373  100133220       1.0   \n",
-       "12140  ENSG00000230373     642402       1.0   \n",
-       "23329  ENSG00000276387  124900571       1.0   \n",
-       "23330  ENSG00000276387       3802       2.0   \n",
-       "31304  ENSG00000249738     285626       1.0   \n",
-       "31305  ENSG00000249738  105377683       1.0   \n",
-       "\n",
-       "                                                   alias  \\\n",
-       "6011                                                  []   \n",
-       "6012                                                  []   \n",
-       "6013                                                  []   \n",
-       "6014                                                  []   \n",
-       "6015                                                  []   \n",
-       "12139                                         [GOLGA6L3]   \n",
-       "12140                                       [GOLGA6L21P]   \n",
-       "23329                                                 []   \n",
-       "23330  [NKAT1, KIR2DL3, NKAT, KIR221, CD158A, p58.1, ...   \n",
-       "31304                                                 []   \n",
-       "31305                                                 []   \n",
-       "\n",
-       "                                                    name  \\\n",
-       "6011   putative killer cell immunoglobulin-like recep...   \n",
-       "6012   putative killer cell immunoglobulin-like recep...   \n",
-       "6013   putative killer cell immunoglobulin-like recep...   \n",
-       "6014   putative killer cell immunoglobulin-like recep...   \n",
-       "6015   putative killer cell immunoglobulin-like recep...   \n",
-       "12139                golgin A6 family like 3, pseudogene   \n",
-       "12140               golgin A6 family like 17, pseudogene   \n",
-       "23329      killer cell immunoglobulin-like receptor 2DS1   \n",
-       "23330  killer cell immunoglobulin like receptor, two ...   \n",
-       "31304                          uncharacterized LOC285626   \n",
-       "31305                       uncharacterized LOC105377683   \n",
-       "\n",
-       "                                                 summary        symbol  \\\n",
-       "6011                                                 NaN  LOC128966722   \n",
-       "6012                                                 NaN  LOC128966732   \n",
-       "6013                                                 NaN  LOC128966730   \n",
-       "6014                                                 NaN  LOC128966731   \n",
-       "6015                                                 NaN  LOC128966733   \n",
-       "12139                                                NaN     GOLGA6L3P   \n",
-       "12140                                                NaN    GOLGA6L17P   \n",
-       "23329                                                NaN  LOC124900571   \n",
-       "23330  Killer cell immunoglobulin-like receptors (KIR...       KIR2DL1   \n",
-       "31304                                                NaN     LOC285626   \n",
-       "31305                                                NaN  LOC105377683   \n",
-       "\n",
-       "         type_of_gene notfound  \n",
-       "6011   protein-coding      NaN  \n",
-       "6012   protein-coding      NaN  \n",
-       "6013   protein-coding      NaN  \n",
-       "6014   protein-coding      NaN  \n",
-       "6015   protein-coding      NaN  \n",
-       "12139          pseudo      NaN  \n",
-       "12140          pseudo      NaN  \n",
-       "23329  protein-coding      NaN  \n",
-       "23330  protein-coding      NaN  \n",
-       "31304           ncRNA      NaN  \n",
-       "31305           ncRNA      NaN  "
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "# duplicated() will return true if the ID is a duplicate and is not the first one to appear the list.\n",
+    "# For printing only (here and in the next cell); the merge below does not use these\n",
     "dupes = gene_table_merged[\"ensembl_gene_id\"].duplicated()\n",
-    "dupe_vals = gene_table_merged[dupes]\n",
+    "dupe_ids = gene_table_merged.loc[dupes, \"ensembl_gene_id\"]\n",
+    "print(\n",
+    "    gene_table_merged.loc[\n",
+    "        gene_table_merged[\"ensembl_gene_id\"].isin(dupe_ids),\n",
+    "        [\"ensembl_gene_id\", \"symbol\", \"alias\"],\n",
+    "    ]\n",
+    ")\n",
     "\n",
-    "# Rows with duplicated Ensembl IDs\n",
-    "all_duplicated = gene_table_merged.loc[\n",
-    "    gene_table_merged[\"ensembl_gene_id\"].isin(dupe_vals[\"ensembl_gene_id\"])\n",
-    "]\n",
-    "all_duplicated"
+    "# Remove duplicates\n",
+    "gene_table_merged = preprocessing_utils.merge_duplicate_ensembl_ids(gene_table_merged)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "id": "093a2e98",
+   "execution_count": null,
+   "id": "bc76d96e",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "4 duplicated genes have been processed.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>37442</th>\n",
-       "      <td>ENSG00000163811</td>\n",
-       "      <td>23160</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[NET12, UTP5]</td>\n",
-       "      <td>WD repeat domain 43</td>\n",
-       "      <td>Enables RNA binding activity. Involved in posi...</td>\n",
-       "      <td>WDR43</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37443</th>\n",
-       "      <td>ENSG00000226467</td>\n",
-       "      <td>10554</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[G15, LPLAT1, 1-AGPAT1, LPAATA, LPAAT-alpha]</td>\n",
-       "      <td>1-acylglycerol-3-phosphate O-acyltransferase 1</td>\n",
-       "      <td>This gene encodes an enzyme that converts lyso...</td>\n",
-       "      <td>AGPAT1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37444</th>\n",
-       "      <td>ENSG00000120533</td>\n",
-       "      <td>56943</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[Sus1, e(y)2, DC6]</td>\n",
-       "      <td>ENY2 transcription and export complex 2 subunit</td>\n",
-       "      <td>Enables nuclear receptor coactivator activity....</td>\n",
-       "      <td>ENY2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37445</th>\n",
-       "      <td>ENSG00000214759</td>\n",
-       "      <td>ENSG00000214759</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>ribosomal protein L36a pseudogene 2</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>RPL36AP2</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37446</th>\n",
-       "      <td>ENSG00000253981</td>\n",
-       "      <td>ENSG00000253981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>ALG1 like 13, pseudogene</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ALG1L13P</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37447</th>\n",
-       "      <td>ENSG00000267206</td>\n",
-       "      <td>158062</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[hLcn5, LCN5, UNQ643]</td>\n",
-       "      <td>lipocalin 6</td>\n",
-       "      <td>Predicted to enable small molecule binding act...</td>\n",
-       "      <td>LCN6</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37448</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>3802</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...</td>\n",
-       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
-       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
-       "      <td>KIR2DL1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37449</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966722</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[LOC128966730, LOC128966732, LOC128966731, LOC...</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966722</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37450</th>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>100133220</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[GOLGA6L21P, GOLGA6L17P, GOLGA6L3]</td>\n",
-       "      <td>golgin A6 family like 3, pseudogene</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>GOLGA6L3P</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37451</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>285626</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[LOC105377683]</td>\n",
-       "      <td>uncharacterized LOC285626</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC285626</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       ensembl_gene_id              _id _version  \\\n",
-       "37442  ENSG00000163811            23160      1.0   \n",
-       "37443  ENSG00000226467            10554      1.0   \n",
-       "37444  ENSG00000120533            56943      1.0   \n",
-       "37445  ENSG00000214759  ENSG00000214759      1.0   \n",
-       "37446  ENSG00000253981  ENSG00000253981      1.0   \n",
-       "37447  ENSG00000267206           158062      1.0   \n",
-       "37448  ENSG00000276387             3802      2.0   \n",
-       "37449  ENSG00000276518        128966722      1.0   \n",
-       "37450  ENSG00000230373        100133220      1.0   \n",
-       "37451  ENSG00000249738           285626      1.0   \n",
-       "\n",
-       "                                                   alias  \\\n",
-       "37442                                      [NET12, UTP5]   \n",
-       "37443       [G15, LPLAT1, 1-AGPAT1, LPAATA, LPAAT-alpha]   \n",
-       "37444                                 [Sus1, e(y)2, DC6]   \n",
-       "37445                                                 []   \n",
-       "37446                                                 []   \n",
-       "37447                              [hLcn5, LCN5, UNQ643]   \n",
-       "37448  [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...   \n",
-       "37449  [LOC128966730, LOC128966732, LOC128966731, LOC...   \n",
-       "37450                 [GOLGA6L21P, GOLGA6L17P, GOLGA6L3]   \n",
-       "37451                                     [LOC105377683]   \n",
-       "\n",
-       "                                                    name  \\\n",
-       "37442                                WD repeat domain 43   \n",
-       "37443     1-acylglycerol-3-phosphate O-acyltransferase 1   \n",
-       "37444    ENY2 transcription and export complex 2 subunit   \n",
-       "37445                ribosomal protein L36a pseudogene 2   \n",
-       "37446                           ALG1 like 13, pseudogene   \n",
-       "37447                                        lipocalin 6   \n",
-       "37448  killer cell immunoglobulin like receptor, two ...   \n",
-       "37449  putative killer cell immunoglobulin-like recep...   \n",
-       "37450                golgin A6 family like 3, pseudogene   \n",
-       "37451                          uncharacterized LOC285626   \n",
-       "\n",
-       "                                                 summary        symbol  \\\n",
-       "37442  Enables RNA binding activity. Involved in posi...         WDR43   \n",
-       "37443  This gene encodes an enzyme that converts lyso...        AGPAT1   \n",
-       "37444  Enables nuclear receptor coactivator activity....          ENY2   \n",
-       "37445                                                NaN      RPL36AP2   \n",
-       "37446                                                NaN      ALG1L13P   \n",
-       "37447  Predicted to enable small molecule binding act...          LCN6   \n",
-       "37448  Killer cell immunoglobulin-like receptors (KIR...       KIR2DL1   \n",
-       "37449                                                NaN  LOC128966722   \n",
-       "37450                                                NaN     GOLGA6L3P   \n",
-       "37451                                                NaN     LOC285626   \n",
-       "\n",
-       "         type_of_gene notfound  \n",
-       "37442  protein-coding      NaN  \n",
-       "37443  protein-coding      NaN  \n",
-       "37444  protein-coding      NaN  \n",
-       "37445             NaN      NaN  \n",
-       "37446             NaN      NaN  \n",
-       "37447  protein-coding      NaN  \n",
-       "37448  protein-coding      NaN  \n",
-       "37449  protein-coding      NaN  \n",
-       "37450          pseudo      NaN  \n",
-       "37451           ncRNA      NaN  "
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "non_dupes = set(gene_table_merged.index) - set(all_duplicated.index)\n",
-    "keep_df = gene_table_merged.loc[list(non_dupes)].copy(deep=True)\n",
-    "\n",
-    "# For each duplicated Ensembl ID, collapse to 1 row and append that row to keep_df\n",
-    "for ens_id in set(all_duplicated[\"ensembl_gene_id\"]):\n",
-    "    group = all_duplicated.loc[all_duplicated[\"ensembl_gene_id\"] == ens_id].copy(\n",
-    "        deep=True\n",
-    "    )\n",
-    "    # Put any entries with symbols that aren't \"LOC#####\" at the top of the data frame\n",
-    "    matches = group[\"symbol\"].str.startswith(\"LOC\") == False\n",
-    "    group = pd.concat([group.loc[matches], group.loc[matches == False]]).reset_index(\n",
-    "        drop=True\n",
-    "    )\n",
-    "\n",
-    "    # Add all duplicate symbols and their aliases to the alias field of the first entry\n",
-    "    for row in group.index[1:]:\n",
-    "        group.at[group.index[0], \"alias\"].append(group[\"symbol\"][row])\n",
-    "        if len(group.at[row, \"alias\"]) > 0:\n",
-    "            group.at[group.index[0], \"alias\"] = (\n",
-    "                group.at[group.index[0], \"alias\"] + group[\"alias\"][row]\n",
-    "            )\n",
-    "\n",
-    "    # Make sure we didn't add duplicate aliases\n",
-    "    group.at[group.index[0], \"alias\"] = list(set(group.at[group.index[0], \"alias\"]))\n",
-    "\n",
-    "    # Keep the first row only, which now has all the aliases\n",
-    "    keep_df = pd.concat([keep_df, group.iloc[0].to_frame().T], ignore_index=True)\n",
-    "\n",
-    "print(\n",
-    "    str(len(all_duplicated.drop_duplicates(\"ensembl_gene_id\")))\n",
-    "    + \" duplicated genes have been processed.\"\n",
-    ")\n",
-    "gene_table_merged = keep_df.reset_index(drop=True)\n",
-    "gene_table_merged.tail(n=10)"
+    "print(str(len(dupe_ids.drop_duplicates())) + \" duplicated genes have been processed.\")\n",
+    "print(gene_table_merged.shape)\n",
+    "print(gene_table_merged.loc[gene_table_merged[\"ensembl_gene_id\"].isin(dupe_ids), \"alias\"])"
    ]
   },
   {
@@ -1383,66 +341,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "id": "4a1bbdee",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "             name     date                                 url version\n",
-      "1  Ensembl GRCh37 Feb 2014          https://grch37.ensembl.org  GRCh37\n",
-      "2     Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org     111\n",
-      "3     Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org     110\n",
-      "4     Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org     109\n",
-      "5     Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org     108\n",
-      "6     Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org     107\n",
-      "7     Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org     106\n",
-      "8     Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org     105\n",
-      "9     Ensembl 104 May 2021 https://may2021.archive.ensembl.org     104\n",
-      "10    Ensembl 103 Feb 2021 https://feb2021.archive.ensembl.org     103\n",
-      "11    Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org     102\n",
-      "12    Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org     101\n",
-      "13    Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org     100\n",
-      "14     Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org      99\n",
-      "15     Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org      98\n",
-      "16     Ensembl 97 Jul 2019 https://jul2019.archive.ensembl.org      97\n",
-      "17     Ensembl 96 Apr 2019 https://apr2019.archive.ensembl.org      96\n",
-      "18     Ensembl 95 Jan 2019 https://jan2019.archive.ensembl.org      95\n",
-      "19     Ensembl 80 May 2015 https://may2015.archive.ensembl.org      80\n",
-      "20     Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org      77\n",
-      "21     Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org      75\n",
-      "22     Ensembl 54 May 2009 https://may2009.archive.ensembl.org      54\n",
-      "   current_release\n",
-      "1                 \n",
-      "2                *\n",
-      "3                 \n",
-      "4                 \n",
-      "5                 \n",
-      "6                 \n",
-      "7                 \n",
-      "8                 \n",
-      "9                 \n",
-      "10                \n",
-      "11                \n",
-      "12                \n",
-      "13                \n",
-      "14                \n",
-      "15                \n",
-      "16                \n",
-      "17                \n",
-      "18                \n",
-      "19                \n",
-      "20                \n",
-      "21                \n",
-      "22                \n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "archive_df = r.listEnsemblArchives()\n",
     "archive_df.to_csvfile(path=archive_filename, row_names=False, quote=False)\n",
@@ -1462,291 +366,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "9a747309",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "37452\n",
-      "Querying genes 1 - 1000\n",
-      "Querying genes 1001 - 2000\n",
-      "Querying genes 2001 - 3000\n",
-      "Querying genes 3001 - 4000\n",
-      "Querying genes 4001 - 5000\n",
-      "Querying genes 5001 - 6000\n",
-      "Querying genes 6001 - 7000\n",
-      "Querying genes 7001 - 8000\n",
-      "Querying genes 8001 - 9000\n",
-      "Querying genes 9001 - 10000\n",
-      "Querying genes 10001 - 11000\n",
-      "Querying genes 11001 - 12000\n",
-      "Querying genes 12001 - 13000\n",
-      "Querying genes 13001 - 14000\n",
-      "Querying genes 14001 - 15000\n",
-      "Querying genes 15001 - 16000\n",
-      "Querying genes 16001 - 17000\n",
-      "Querying genes 17001 - 18000\n",
-      "Querying genes 18001 - 19000\n",
-      "Querying genes 19001 - 20000\n",
-      "Querying genes 20001 - 21000\n",
-      "Querying genes 21001 - 22000\n",
-      "Querying genes 22001 - 23000\n",
-      "Querying genes 23001 - 24000\n",
-      "Querying genes 24001 - 25000\n",
-      "Querying genes 25001 - 26000\n",
-      "Querying genes 26001 - 27000\n",
-      "Querying genes 27001 - 28000\n",
-      "Querying genes 28001 - 29000\n",
-      "Querying genes 29001 - 30000\n",
-      "Querying genes 30001 - 31000\n",
-      "Querying genes 31001 - 32000\n",
-      "Querying genes 32001 - 33000\n",
-      "Querying genes 33001 - 34000\n",
-      "Querying genes 34001 - 35000\n",
-      "Querying genes 35001 - 36000\n",
-      "Querying genes 36001 - 37000\n",
-      "Querying genes 37001 - 37452\n",
-      "37452\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>is_current</th>\n",
-       "      <th>assembly</th>\n",
-       "      <th>id</th>\n",
-       "      <th>version</th>\n",
-       "      <th>type</th>\n",
-       "      <th>peptide</th>\n",
-       "      <th>latest</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>release</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>37447</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000267206</td>\n",
-       "      <td>6</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000267206.6</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37448</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>4</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000276387.4</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37449</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000276518.1</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37450</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>9</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000230373.9</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37451</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>10</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000249738.10</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      is_current assembly               id  version  type peptide  \\\n",
-       "37447          1   GRCh38  ENSG00000267206        6  Gene    None   \n",
-       "37448          1   GRCh38  ENSG00000276387        4  Gene    None   \n",
-       "37449          1   GRCh38  ENSG00000276518        1  Gene    None   \n",
-       "37450          1   GRCh38  ENSG00000230373        9  Gene    None   \n",
-       "37451          1   GRCh38  ENSG00000249738       10  Gene    None   \n",
-       "\n",
-       "                   latest possible_replacement release  \n",
-       "37447   ENSG00000267206.6                   []     111  \n",
-       "37448   ENSG00000276387.4                   []     111  \n",
-       "37449   ENSG00000276518.1                   []     111  \n",
-       "37450   ENSG00000230373.9                   []     111  \n",
-       "37451  ENSG00000249738.10                   []     111  "
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "url = \"https://rest.ensembl.org/archive/id\"\n",
-    "headers = {\"Content-Type\": \"application/json\", \"Accept\": \"application/json\"}\n",
-    "\n",
-    "ids = gene_table_merged[\"ensembl_gene_id\"].tolist()\n",
-    "print(len(ids))\n",
-    "\n",
-    "# We can only query 1000 genes at a time\n",
-    "batch_ind = range(0, len(ids), 1000)\n",
-    "results = []\n",
-    "\n",
-    "for B in batch_ind:\n",
-    "    end = min(len(ids), B + 1000)\n",
-    "    print(\"Querying genes \" + str(B + 1) + \" - \" + str(end))\n",
-    "\n",
-    "    request_data = '{ \"id\" : ' + str(ids[B:end]) + \" }\"\n",
-    "    request_data = request_data.replace(\"'\", '\"')\n",
-    "\n",
-    "    ok = False\n",
-    "    tries = 0\n",
-    "\n",
-    "    while tries < 5 and not ok:\n",
-    "        try:\n",
-    "            res = requests.post(url, headers=headers, data=request_data)\n",
-    "            ok = res.ok\n",
-    "        except:\n",
-    "            ok = False\n",
-    "\n",
-    "        tries = tries + 1\n",
-    "\n",
-    "        if not ok:\n",
-    "            # res.raise_for_status()\n",
-    "            print(\n",
-    "                \"Error retrieving Ensembl versions for genes \"\n",
-    "                + str(B + 1)\n",
-    "                + \" - \"\n",
-    "                + str(end)\n",
-    "                + \". Trying again...\"\n",
-    "            )\n",
-    "        else:\n",
-    "            results = results + res.json()\n",
-    "            break\n",
-    "\n",
-    "print(len(results))\n",
-    "\n",
-    "versions = pd.json_normalize(results)\n",
+    "versions = preprocessing_utils.query_ensembl_version_api(\n",
+    "    ensembl_ids=gene_table_merged[\"ensembl_gene_id\"].tolist()\n",
+    ")\n",
     "\n",
     "versions.tail()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "id": "5c108238",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "release\n",
-       "100       22\n",
-       "101        8\n",
-       "102       16\n",
-       "103       15\n",
-       "104       19\n",
-       "105        9\n",
-       "106       35\n",
-       "107       10\n",
-       "108        4\n",
-       "109        4\n",
-       "110       11\n",
-       "111    36286\n",
-       "80        21\n",
-       "81         2\n",
-       "82        10\n",
-       "84       673\n",
-       "87        61\n",
-       "89        20\n",
-       "91        75\n",
-       "93        53\n",
-       "95        33\n",
-       "96        31\n",
-       "97        18\n",
-       "98         9\n",
-       "99         7\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions.groupby(\"release\").size()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "id": "bf5aecb1",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "37452\n",
-      "37452\n",
-      "True\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Check that all IDs are the same between the result and the gene table\n",
     "print(len(versions[\"id\"]))\n",
@@ -1759,21 +410,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "7fc8bbcd",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Make sure everything is GRCh38, not GRCh37\n",
     "all(versions[\"assembly\"] == \"GRCh38\")"
@@ -1791,7 +431,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "id": "0d5b5652",
    "metadata": {
     "scrolled": true
@@ -1815,40 +455,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "id": "337b2890",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "closest_release\n",
-       "80       915\n",
-       "95        33\n",
-       "96        31\n",
-       "97        18\n",
-       "98         9\n",
-       "99         7\n",
-       "100       22\n",
-       "101        8\n",
-       "102       16\n",
-       "103       15\n",
-       "104       19\n",
-       "105        9\n",
-       "106       35\n",
-       "107       10\n",
-       "108        4\n",
-       "109        4\n",
-       "110       11\n",
-       "111    36286\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions[\"closest_release\"] = 0\n",
     "\n",
@@ -1865,149 +475,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
    "id": "343e5006",
    "metadata": {
     "scrolled": false
    },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>is_current</th>\n",
-       "      <th>assembly</th>\n",
-       "      <th>id</th>\n",
-       "      <th>version</th>\n",
-       "      <th>type</th>\n",
-       "      <th>peptide</th>\n",
-       "      <th>latest</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>release</th>\n",
-       "      <th>closest_release</th>\n",
-       "      <th>permalink</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000164972</td>\n",
-       "      <td>14</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000164972.14</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000169105</td>\n",
-       "      <td>8</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000169105.8</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>3</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000255136.3</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000105499</td>\n",
-       "      <td>14</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000105499.14</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000104611</td>\n",
-       "      <td>12</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000104611.12</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "  is_current assembly               id  version  type peptide  \\\n",
-       "0          1   GRCh38  ENSG00000164972       14  Gene    None   \n",
-       "1          1   GRCh38  ENSG00000169105        8  Gene    None   \n",
-       "2          1   GRCh38  ENSG00000255136        3  Gene    None   \n",
-       "3          1   GRCh38  ENSG00000105499       14  Gene    None   \n",
-       "4          1   GRCh38  ENSG00000104611       12  Gene    None   \n",
-       "\n",
-       "               latest possible_replacement release  closest_release  \\\n",
-       "0  ENSG00000164972.14                   []     111              111   \n",
-       "1   ENSG00000169105.8                   []     111              111   \n",
-       "2   ENSG00000255136.3                   []     111              111   \n",
-       "3  ENSG00000105499.14                   []     111              111   \n",
-       "4  ENSG00000104611.12                   []     111              111   \n",
-       "\n",
-       "                                           permalink  \n",
-       "0  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "1  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "2  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "3  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "4  https://jan2024.archive.ensembl.org/Homo_sapie...  "
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions[\"permalink\"] = \"\"\n",
     "\n",
@@ -2024,166 +497,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "id": "4b01719d",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>is_current</th>\n",
-       "      <th>assembly</th>\n",
-       "      <th>id</th>\n",
-       "      <th>version</th>\n",
-       "      <th>type</th>\n",
-       "      <th>peptide</th>\n",
-       "      <th>latest</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>release</th>\n",
-       "      <th>closest_release</th>\n",
-       "      <th>permalink</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>51</th>\n",
-       "      <td></td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000266701</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000266701.1</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>84</td>\n",
-       "      <td>80</td>\n",
-       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>99</th>\n",
-       "      <td></td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000268225</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000268225.2</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>98</td>\n",
-       "      <td>98</td>\n",
-       "      <td>https://sep2019.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>119</th>\n",
-       "      <td></td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000281018</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000281018.1</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>84</td>\n",
-       "      <td>80</td>\n",
-       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>120</th>\n",
-       "      <td></td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000216011</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000216011.2</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>84</td>\n",
-       "      <td>80</td>\n",
-       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>135</th>\n",
-       "      <td></td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000264103</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>None</td>\n",
-       "      <td>ENSG00000264103.1</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>84</td>\n",
-       "      <td>80</td>\n",
-       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "    is_current assembly               id  version  type peptide  \\\n",
-       "51               GRCh38  ENSG00000266701        1  Gene    None   \n",
-       "99               GRCh38  ENSG00000268225        2  Gene    None   \n",
-       "119              GRCh38  ENSG00000281018        1  Gene    None   \n",
-       "120              GRCh38  ENSG00000216011        2  Gene    None   \n",
-       "135              GRCh38  ENSG00000264103        1  Gene    None   \n",
-       "\n",
-       "                latest possible_replacement release  closest_release  \\\n",
-       "51   ENSG00000266701.1                   []      84               80   \n",
-       "99   ENSG00000268225.2                   []      98               98   \n",
-       "119  ENSG00000281018.1                   []      84               80   \n",
-       "120  ENSG00000216011.2                   []      84               80   \n",
-       "135  ENSG00000264103.1                   []      84               80   \n",
-       "\n",
-       "                                             permalink  \n",
-       "51   https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "99   https://sep2019.archive.ensembl.org/Homo_sapie...  \n",
-       "119  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "120  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "135  https://may2015.archive.ensembl.org/Homo_sapie...  "
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions[versions[\"closest_release\"] < 100].head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "id": "c4128cc9",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000164972\n",
-      "https://jul2023.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000279049\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(versions[\"permalink\"][0])\n",
     "print(versions[\"permalink\"][25])"
@@ -2191,21 +518,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
    "id": "73791e6c",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Does every gene have an associated URL?\n",
     "url_base_len = len(archive_table[\"url\"][0]) + 1\n",
@@ -2222,181 +538,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": null,
    "id": "f3edfd2f",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(37452, 12)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "      <th>ensembl_release</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>permalink</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>ENSG00000164972</td>\n",
-       "      <td>84688</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]</td>\n",
-       "      <td>sperm microtubule inner protein 6</td>\n",
-       "      <td>This gene encodes a nuclear- or perinuclear-lo...</td>\n",
-       "      <td>SPMIP6</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>ENSG00000169105</td>\n",
-       "      <td>113189</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[ATCS, EDSMC1, HNK1ST, D4ST1]</td>\n",
-       "      <td>carbohydrate sulfotransferase 14</td>\n",
-       "      <td>This gene encodes a member of the HNK-1 family...</td>\n",
-       "      <td>CHST14</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>TPBGL antisense RNA 1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL-AS1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ENSG00000105499</td>\n",
-       "      <td>8605</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[CPLA2-gamma]</td>\n",
-       "      <td>phospholipase A2 group IVC</td>\n",
-       "      <td>This gene encodes a protein which is a member ...</td>\n",
-       "      <td>PLA2G4C</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>ENSG00000104611</td>\n",
-       "      <td>63898</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[PPP1R38, SH2A]</td>\n",
-       "      <td>SH2 domain containing 4A</td>\n",
-       "      <td>Enables phosphatase binding activity. Located ...</td>\n",
-       "      <td>SH2D4A</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   ensembl_gene_id              _id _version  \\\n",
-       "0  ENSG00000164972            84688      2.0   \n",
-       "1  ENSG00000169105           113189      2.0   \n",
-       "2  ENSG00000255136  ENSG00000255136      1.0   \n",
-       "3  ENSG00000105499             8605      1.0   \n",
-       "4  ENSG00000104611            63898      1.0   \n",
-       "\n",
-       "                                          alias  \\\n",
-       "0  [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]   \n",
-       "1                 [ATCS, EDSMC1, HNK1ST, D4ST1]   \n",
-       "2                                            []   \n",
-       "3                                 [CPLA2-gamma]   \n",
-       "4                               [PPP1R38, SH2A]   \n",
-       "\n",
-       "                                name  \\\n",
-       "0  sperm microtubule inner protein 6   \n",
-       "1   carbohydrate sulfotransferase 14   \n",
-       "2              TPBGL antisense RNA 1   \n",
-       "3         phospholipase A2 group IVC   \n",
-       "4           SH2 domain containing 4A   \n",
-       "\n",
-       "                                             summary     symbol  \\\n",
-       "0  This gene encodes a nuclear- or perinuclear-lo...     SPMIP6   \n",
-       "1  This gene encodes a member of the HNK-1 family...     CHST14   \n",
-       "2                                                NaN  TPBGL-AS1   \n",
-       "3  This gene encodes a protein which is a member ...    PLA2G4C   \n",
-       "4  Enables phosphatase binding activity. Located ...     SH2D4A   \n",
-       "\n",
-       "     type_of_gene notfound ensembl_release possible_replacement  \\\n",
-       "0  protein-coding      NaN             111                   []   \n",
-       "1  protein-coding      NaN             111                   []   \n",
-       "2             NaN      NaN             111                   []   \n",
-       "3  protein-coding      NaN             111                   []   \n",
-       "4  protein-coding      NaN             111                   []   \n",
-       "\n",
-       "                                           permalink  \n",
-       "0  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "1  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "2  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "3  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "4  https://jan2024.archive.ensembl.org/Homo_sapie...  "
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions = versions[[\"id\", \"release\", \"possible_replacement\", \"permalink\"]]\n",
     "versions.rename(\n",
@@ -2421,277 +566,25 @@
    "metadata": {},
    "source": [
     "### Final cleanup\n",
-    "Unfilled \"possible_replacement\" entries should be changed from NaN to empty lists. \n",
-    "\n",
-    "\"possible_replacement\" entries that have data in them exist as a list of dicts, and need to have the Ensembl IDs pulled out of them as a list of strings. \n",
+    "\"possible_replacement\" entries will either be an empty list or a list of dictionaries. Entries that have data in them need to have the Ensembl IDs pulled out of them as a list of strings.\n",
     "\n",
     "Remove unneeded columns. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "id": "d0c07b7a",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>name</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>ensembl_release</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>permalink</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>ENSG00000164972</td>\n",
-       "      <td>sperm microtubule inner protein 6</td>\n",
-       "      <td>[SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]</td>\n",
-       "      <td>This gene encodes a nuclear- or perinuclear-lo...</td>\n",
-       "      <td>SPMIP6</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>ENSG00000169105</td>\n",
-       "      <td>carbohydrate sulfotransferase 14</td>\n",
-       "      <td>[ATCS, EDSMC1, HNK1ST, D4ST1]</td>\n",
-       "      <td>This gene encodes a member of the HNK-1 family...</td>\n",
-       "      <td>CHST14</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>TPBGL antisense RNA 1</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL-AS1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ENSG00000105499</td>\n",
-       "      <td>phospholipase A2 group IVC</td>\n",
-       "      <td>[CPLA2-gamma]</td>\n",
-       "      <td>This gene encodes a protein which is a member ...</td>\n",
-       "      <td>PLA2G4C</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>ENSG00000104611</td>\n",
-       "      <td>SH2 domain containing 4A</td>\n",
-       "      <td>[PPP1R38, SH2A]</td>\n",
-       "      <td>Enables phosphatase binding activity. Located ...</td>\n",
-       "      <td>SH2D4A</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37447</th>\n",
-       "      <td>ENSG00000267206</td>\n",
-       "      <td>lipocalin 6</td>\n",
-       "      <td>[hLcn5, LCN5, UNQ643]</td>\n",
-       "      <td>Predicted to enable small molecule binding act...</td>\n",
-       "      <td>LCN6</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37448</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
-       "      <td>[NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...</td>\n",
-       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
-       "      <td>KIR2DL1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37449</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>[LOC128966730, LOC128966732, LOC128966731, LOC...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966722</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37450</th>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>golgin A6 family like 3, pseudogene</td>\n",
-       "      <td>[GOLGA6L21P, GOLGA6L17P, GOLGA6L3]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>GOLGA6L3P</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37451</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>uncharacterized LOC285626</td>\n",
-       "      <td>[LOC105377683]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC285626</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>111</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>37452 rows × 9 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       ensembl_gene_id                                               name  \\\n",
-       "0      ENSG00000164972                  sperm microtubule inner protein 6   \n",
-       "1      ENSG00000169105                   carbohydrate sulfotransferase 14   \n",
-       "2      ENSG00000255136                              TPBGL antisense RNA 1   \n",
-       "3      ENSG00000105499                         phospholipase A2 group IVC   \n",
-       "4      ENSG00000104611                           SH2 domain containing 4A   \n",
-       "...                ...                                                ...   \n",
-       "37447  ENSG00000267206                                        lipocalin 6   \n",
-       "37448  ENSG00000276387  killer cell immunoglobulin like receptor, two ...   \n",
-       "37449  ENSG00000276518  putative killer cell immunoglobulin-like recep...   \n",
-       "37450  ENSG00000230373                golgin A6 family like 3, pseudogene   \n",
-       "37451  ENSG00000249738                          uncharacterized LOC285626   \n",
-       "\n",
-       "                                                   alias  \\\n",
-       "0           [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]   \n",
-       "1                          [ATCS, EDSMC1, HNK1ST, D4ST1]   \n",
-       "2                                                     []   \n",
-       "3                                          [CPLA2-gamma]   \n",
-       "4                                        [PPP1R38, SH2A]   \n",
-       "...                                                  ...   \n",
-       "37447                              [hLcn5, LCN5, UNQ643]   \n",
-       "37448  [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...   \n",
-       "37449  [LOC128966730, LOC128966732, LOC128966731, LOC...   \n",
-       "37450                 [GOLGA6L21P, GOLGA6L17P, GOLGA6L3]   \n",
-       "37451                                     [LOC105377683]   \n",
-       "\n",
-       "                                                 summary        symbol  \\\n",
-       "0      This gene encodes a nuclear- or perinuclear-lo...        SPMIP6   \n",
-       "1      This gene encodes a member of the HNK-1 family...        CHST14   \n",
-       "2                                                    NaN     TPBGL-AS1   \n",
-       "3      This gene encodes a protein which is a member ...       PLA2G4C   \n",
-       "4      Enables phosphatase binding activity. Located ...        SH2D4A   \n",
-       "...                                                  ...           ...   \n",
-       "37447  Predicted to enable small molecule binding act...          LCN6   \n",
-       "37448  Killer cell immunoglobulin-like receptors (KIR...       KIR2DL1   \n",
-       "37449                                                NaN  LOC128966722   \n",
-       "37450                                                NaN     GOLGA6L3P   \n",
-       "37451                                                NaN     LOC285626   \n",
-       "\n",
-       "         type_of_gene ensembl_release possible_replacement  \\\n",
-       "0      protein-coding             111                   []   \n",
-       "1      protein-coding             111                   []   \n",
-       "2                 NaN             111                   []   \n",
-       "3      protein-coding             111                   []   \n",
-       "4      protein-coding             111                   []   \n",
-       "...               ...             ...                  ...   \n",
-       "37447  protein-coding             111                   []   \n",
-       "37448  protein-coding             111                   []   \n",
-       "37449  protein-coding             111                   []   \n",
-       "37450          pseudo             111                   []   \n",
-       "37451           ncRNA             111                   []   \n",
-       "\n",
-       "                                               permalink  \n",
-       "0      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "1      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "2      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "3      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "4      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "...                                                  ...  \n",
-       "37447  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "37448  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "37449  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "37450  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "37451  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "\n",
-       "[37452 rows x 9 columns]"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "for row in gene_table_merged.loc[\n",
-    "    gene_table_merged[\"possible_replacement\"].isnull(), \"possible_replacement\"\n",
-    "].index:\n",
-    "    gene_table_merged.at[row, \"possible_replacement\"] = []\n",
+    "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n",
+    "    \"possible_replacement\"\n",
+    "].apply(lambda pr: pr if pr is np.NaN or len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n",
     "\n",
-    "gene_table_merged[\"possible_replacement\"] = gene_table_merged.apply(\n",
-    "    lambda row: (\n",
-    "        row[\"possible_replacement\"]\n",
-    "        if len(row[\"possible_replacement\"]) == 0\n",
-    "        else [x[\"stable_id\"] for x in row[\"possible_replacement\"]]\n",
-    "    ),\n",
-    "    axis=1,\n",
-    ")\n",
+    "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n",
+    "    \"possible_replacement\"\n",
+    "].apply(preprocessing_utils.standardize_list_item)\n",
     "\n",
     "gene_table_merged = gene_table_merged[\n",
     "    [\n",
@@ -2721,7 +614,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
    "id": "f2287922",
    "metadata": {},
    "outputs": [],
@@ -2736,7 +629,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "agora-data-tools-ywFp1Gf9",
    "language": "python",
    "name": "python3"
   },
diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
index fbc1a2dc..e85f441a 100644
--- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
+++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
@@ -1,12 +1,31 @@
+"""
+This file includes several helper functions that are called from one or more of the pre-processing
+notebooks. This helps avoid code duplication and/or keeps the notebooks cleaner and more straightforward.
+Current public-facing functions:
+    manual_query_biomart - queries Biomart with a GET request
+    query_ensembl_version_api - queries the Ensembl API for Ensembl ID version info
+    r_query_biomart - queries Biomart using rpy2
+    filter_hasgs - removes human alternative sequence genes from a data frame
+    get_all_adt_ensembl_ids - gets the Ensembl IDs in all of the files ingested by ADT
+    standardize_list_item - turns values of varying types into a list. Used for fixing the "alias" and
+                            "possible_replacement" fields of gene_metadata.
+    merge_duplicate_ensembl_ids - collapses rows with the same Ensembl ID but different gene symbols
+                                  or aliases into one row
+"""
+
 import pandas as pd
+import numpy as np
 import requests
 import re
+import synapseclient
 from io import StringIO
-from typing import Union
+from typing import Union, Dict, List, Set
+import agoradatatools.etl.utils as utils
+import agoradatatools.etl.extract as extract
 
 
 def manual_query_biomart(
-    attributes: list[str], filters: dict[Union[list, set]]
+    attributes: List[str], filters: Dict[str, Union[List[str], Set[str]]]
 ) -> pd.DataFrame:
     """Performs a GET request to the Biomart web service and returns the response. There is no
     canonical Python library to query Biomart and no Python library at all to query on
@@ -49,6 +68,63 @@ def manual_query_biomart(
     return result
 
 
+def query_ensembl_version_api(ensembl_ids: List[str]) -> pd.DataFrame:
+    """
+    Queries the Ensembl API via POST to get version information for each Ensembl ID. The API can only
+    process 1000 IDs at a time so the query is broken into batches of 1000. If a request fails, this
+    function will try again up to 5 times on that batch before quitting and raising an error.
+
+    Args:
+        ensembl_ids: a list of Ensembl IDs to query
+
+    Returns:
+        a pandas data frame with Ensembl IDs, version, and release information
+    """
+    url = "https://rest.ensembl.org/archive/id"
+    headers = {"Content-Type": "application/json", "Accept": "application/json"}
+
+    # We can only query 1000 genes at a time
+    batch_ind = range(0, len(ensembl_ids), 1000)
+    results = []
+
+    for B in batch_ind:
+        end = min(len(ensembl_ids), B + 1000)
+        print("Querying genes " + str(B + 1) + " - " + str(end))
+
+        request_data = '{ "id" : ' + str(ensembl_ids[B:end]) + " }"
+        request_data = request_data.replace("'", '"')
+
+        ok = False
+        tries = 0
+
+        while tries < 5 and not ok:
+            try:
+                res = requests.post(url, headers=headers, data=request_data)
+                ok = res.ok
+            except requests.RequestException as ex:
+                print(ex)
+                ok = False
+
+            tries = tries + 1
+
+            if not ok and tries == 5:
+                res.raise_for_status()
+            elif not ok:
+                print(
+                    "Error retrieving Ensembl versions for genes "
+                    + str(B + 1)
+                    + " - "
+                    + str(end)
+                    + ". Trying again..."
+                )
+            else:
+                results = results + res.json()
+                break
+
+    versions = pd.json_normalize(results)
+    return versions
+
+
 def filter_hasgs(df: pd.DataFrame, chromosome_name_column: str) -> pd.DataFrame:
     """Filters human alternative sequence genes (HASGs) from a data frame by using a regex to
     identify them for removal. Valid genes will either have a numerical chromosome name or have
@@ -87,6 +163,7 @@ def r_query_biomart() -> pd.DataFrame:
                                       "chromosome_name", and "hgnc_symbol" retrived from BioMart
     """
     from rpy2.robjects import r
+    from rpy2.rinterface_lib.embedded import RRuntimeError
 
     r(
         'if (!require("BiocManager", character.only = TRUE)) { install.packages("BiocManager") }'
@@ -96,7 +173,7 @@ def r_query_biomart() -> pd.DataFrame:
     r.library("biomaRt")
 
     # Sometimes Biomart doesn't respond and the command needs to be sent again. Try up to 5 times.
-    for T in range(5):
+    for _ in range(5):
         try:
             mart = r.useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
             ensembl_ids = r.getBM(
@@ -105,7 +182,8 @@ def r_query_biomart() -> pd.DataFrame:
                 useCache=False,
             )
 
-        except:
+        except RRuntimeError as ex:
+            print(ex)
             print("Trying again...")
             ensembl_ids = None
         else:
@@ -124,3 +202,213 @@ def r_query_biomart() -> pd.DataFrame:
             }
         )
         return ensembl_ids_df
+
+
+def get_all_adt_ensembl_ids(
+    config_filename: str, exclude_files: List[str] = [], token: str = None
+) -> List[str]:
+    """
+    Loops through an ADT config file, finds all data files that are ingested by ADT, and returns a
+    list containing all Ensembl IDs present in those files. Specific files can be excluded from the
+    list with the exclude_files argument.
+
+    Args:
+        config_filename: full or relative file path to the ADT config.yaml file
+        exclude_files: list of file names to exclude when searching files for IDs. These names must
+                       match what is in the "name" field of the file specification in the config.yaml
+                       file. Typical values are "gene_metadata" and "druggability".
+        token: a Synapse auth token, or None if the user has Synapse credentials saved.
+
+    Returns:
+        a list of unique Ensembl IDs that exist in at least one data set ingested by ADT
+    """
+    syn = utils._login_to_synapse(token=token)
+    config = utils._get_config(config_path=config_filename)
+    datasets = config["datasets"]
+
+    # Get all unique files in the config since some files are listed multiple times by being
+    # included in multiple data sets. Also fetch all column rename values for standardizing Ensembl
+    # ID column names
+    unique_files = {}
+    column_renames = {}
+
+    for dataset in datasets:
+        dataset_name = list(dataset.keys())[0]
+
+        for file in dataset[dataset_name]["files"]:
+            # Make the Synapse ID the key so that "update" will only add a new item if the ID doesn't
+            # already exist
+            unique_files.update({file["id"]: file})
+
+        # Only some data sets have column rename values
+        if "column_rename" in dataset[dataset_name].keys():
+            column_renames.update(dataset[dataset_name]["column_rename"])
+
+    # Print all the files we found
+    print("Found " + str(len(unique_files)) + " files:")
+    [print(x["name"] + ":\t" + x["id"]) for x in unique_files.values()]
+    print("")
+
+    # Create a list of all Ensembl IDs in all files
+    file_ensembl_list = []
+
+    for entity in unique_files.values():
+        # Ignore json files, which are post-processed and not what we're interested in.
+        # Also ignore any other files specified by 'exclude_files', which likely includes
+        # "gene_metadata" and "druggability".
+        if entity["format"] == "json" or entity["name"] in exclude_files:
+            continue
+
+        file_ensembl_ids = _extract_ensembl_ids(syn, entity, column_renames)
+        file_ensembl_list = file_ensembl_list + file_ensembl_ids
+
+    # Remove duplicate values
+    return list(set(file_ensembl_list))
+
+
+def _extract_ensembl_ids(
+    syn: synapseclient.Synapse, entity: Dict[str, str], column_renames: Dict[str, str]
+) -> List[str]:
+    """
+    Internal function used by get_all_adt_ensembl_ids to extract a list of Ensembl IDs from a file.
+    The file is downloaded from Synapse and read in as a pandas data frame, column names are renamed
+    if necessary to ensure that most Ensembl ID columns are renamed to "ensembl_gene_id", and all
+    Ensembl IDs from relevant columns are put in a list.
+
+    Note that the "networks" data set contains two columns with Ensembl IDs (genea_ensembl_gene_id
+    and geneb_ensembl_gene_id) which are not renamed, so this function searches for columns named
+    with either of those two names or with "ensembl_gene_id" when finding Ensembl ID columns.
+
+    Note that this function depends on the column_rename specifications in the config to accurately
+    convert all Ensembl ID-containing columns in all files except networks to "ensembl_gene_id", so
+    that we don't have to hard-code a list of all possible column names. This assumption is valid
+    for the current set of data files and will likely remain valid for future data, but a warning
+    is printed out if no matching column is found, just in case.
+
+    Args:
+        syn: a synapseclient object which has already been initialized and successfully logged in
+        entity: a dictionary containing keys "id", "name", and "format"
+        column_renames: a dictionary containing all column rename pairs from the config file, where
+                        key = old column name, and value = new column name
+
+    Returns:
+        a list of unique Ensembl IDs in the file, or an empty list if no Ensembl ID column is found
+    """
+    df = extract.get_entity_as_df(syn_id=entity["id"], source=entity["format"], syn=syn)
+
+    # Use column_renames from the config to convert most Ensembl ID column names to "ensembl_gene_id".
+    df = utils.standardize_column_names(df=df)
+    df = utils.rename_columns(df=df, column_map=column_renames)
+
+    # Exception to the above comment: the 'networks' file has two ID columns (genea_ and geneb_ ensembl_gene_id)
+    # which do not get renamed
+    possible_col_names = [
+        "ensembl_gene_id",
+        "genea_ensembl_gene_id",
+        "geneb_ensembl_gene_id",
+    ]
+
+    file_ensembl_ids = []
+
+    # The data may have zero, one, or more than one (in the case of 'networks') column of Ensembl IDs
+    for C in possible_col_names:
+        if C in df.columns:
+            file_ensembl_ids = file_ensembl_ids + df[C].tolist()
+
+    # Print any warnings and remove any NA values from the list before returning
+    if len(file_ensembl_ids) == 0:
+        print("WARNING: no Ensembl ID column found for " + entity["name"] + "!")
+
+    if "n/A" in file_ensembl_ids:
+        print(entity["name"] + " has an n/A Ensembl ID")
+        file_ensembl_ids.remove("n/A")
+
+    if np.NaN in file_ensembl_ids:
+        print(
+            entity["name"]
+            + " has "
+            + str(file_ensembl_ids.count(np.NaN))
+            + " NaN Ensembl IDs"
+        )
+        file_ensembl_ids = [x for x in file_ensembl_ids if x is not np.NaN]
+
+    # Remove duplicate values
+    return list(set(file_ensembl_ids))
+
+
+def standardize_list_item(item: Union[str, List[str]]) -> List[str]:
+    """
+    For the gene_metadata data frame, some queries return columns that are a mixture of None/NaN,
+    a single string, and a list of strings. This function standardizes the column values so that
+    everything is a list, either empty (if NaN) or a list of strings. The final list is sorted
+    alphabetically to make comparison between different versions of the file easier.
+
+    This function is intended to be called as part of an apply() statement on a pandas data frame
+    column.
+
+    Args:
+        item: either a string, a list of strings, or np.NaN
+
+    Returns:
+        A sorted list of unique strings, or [] for np.NaN. Any other type (e.g. None) is returned as-is.
+    """
+    # Convert NaN to an empty list
+    if item is np.NaN:
+        return []
+
+    # Convert plain strings to a list of one string
+    if isinstance(item, str):
+        return [item]
+
+    if isinstance(item, list):
+        # Get unique values only and sort them
+        item = list(set(item))
+        item.sort()
+
+    # No extra handling necessary for other data types
+
+    return item
+
+
+def merge_duplicate_ensembl_ids(gene_table: pd.DataFrame) -> pd.DataFrame:
+    """
+    MyGene queries sometimes return multiple rows with the same Ensembl ID but different symbols
+    or other information. This usually happens when a single Ensembl ID maps to multiple Entrez IDs
+    in the NCBI database. There's not a good way to reconcile this, so for every set of rows with the
+    same Ensembl ID, we designate the first entry in the set as the main row. The gene symbols of the
+    remaining rows in the set are then added as aliases to the "main" row, and all of their aliases
+    are added to the main row alias field as well. All rows in the set except the main row are then
+    deleted from the data frame, leaving a single row for that Ensembl ID with all symbols and aliases
+    from the duplicate rows merged into the alias field.
+
+    Args:
+        gene_table: a pandas DataFrame containing gene metadata results from MyGene
+
+    Returns:
+        a data frame with duplicate rows removed
+    """
+    dupes = gene_table["ensembl_gene_id"].duplicated()
+    dupe_ids = gene_table.loc[dupes, "ensembl_gene_id"].drop_duplicates().tolist()
+
+    for ens_id in dupe_ids:
+        rows = gene_table.loc[gene_table["ensembl_gene_id"] == ens_id]
+
+        # Add duplicate rows' symbols to the alias field of the first row, then add duplicate rows'
+        # aliases to the first row's alias field. All other information in the duplicate rows is
+        # discarded.
+        new_alias = rows.iloc[0]["alias"]
+
+        for row in rows.index[1:]:
+            new_alias.append(rows.loc[row, "symbol"])
+            new_alias = new_alias + rows.loc[row, "alias"]
+
+        # Remove any duplicate aliases and sort them
+        new_alias = list(set(new_alias))
+        new_alias.sort()
+
+        # Set the new aliases to the first row in this group and remove all duplicate rows from the
+        # data frame
+        gene_table.at[rows.index[0], "alias"] = new_alias
+        gene_table = gene_table.drop(rows.index[1:])
+
+    return gene_table
diff --git a/tests/test_assets/.DS_Store b/tests/test_assets/.DS_Store
deleted file mode 100644
index 46b71f5c..00000000
Binary files a/tests/test_assets/.DS_Store and /dev/null differ