Skip to content
This repository has been archived by the owner on Mar 13, 2024. It is now read-only.

Commit

Permalink
Preclinical and supplementary figures
Browse files Browse the repository at this point in the history
  • Loading branch information
brendanreardon committed Sep 22, 2020
1 parent 1ec621b commit 683dad1
Show file tree
Hide file tree
Showing 116 changed files with 8,761,086 additions and 0 deletions.
449 changes: 449 additions & 0 deletions analyses/preclinical/00.map-almanac-to-gdsc.ipynb

Large diffs are not rendered by default.

314 changes: 314 additions & 0 deletions analyses/preclinical/01.organize-samples.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Organize samples\n",
"We leverage the sample tables from CCLE, Sanger, and DepMap to make sure we have a consistent set of samples to work with. We export a file that contains names of cell lines based on the Broad, CCLE, and Sanger (`formatted/cell-lines-names.raw.txt`) and then manually checked it (`formatted/cell-lines-names.formatted.txt`). "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"gdsc_model_info = pd.read_csv('source/gdsc/model_list_20200204.csv')\n",
"ccle_model_info = pd.read_csv('source/ccle-2019/data_clinical_sample.txt', sep='\\t', comment='#')\n",
"\n",
"fibroblast_maps = (pd.\n",
" read_excel('source/ccle-2019/41586_2019_1186_MOESM4_ESM.xlsx', \n",
" sheet_name='Cell line name changes')\n",
" .iloc[:45, :]\n",
" .set_index('old_CCLE_ID')\n",
" .loc[:, 'new_CCLE_ID'])\n",
"\n",
"gdsc_to_ccle = gdsc_model_info.loc[:, ['model_id', 'CCLE_ID']].dropna()\n",
"gdsc_to_ccle['CCLE_ID'].replace(fibroblast_maps, inplace=True)\n",
"gdsc_to_ccle = gdsc_to_ccle.set_index('model_id')['CCLE_ID']\n",
"\n",
"depmap = pd.read_csv('source/depmap/sample_info.csv')\n",
"depmap['CCLE_Name'].replace(fibroblast_maps, inplace=True)\n",
"depmap_maps = depmap.loc[:, ['CCLE_Name', 'Sanger_Model_ID']].dropna()\n",
"depmap_maps = depmap_maps[~depmap_maps['Sanger_Model_ID'].isin(gdsc_to_ccle.to_frame().reset_index()['model_id'])].set_index('Sanger_Model_ID')['CCLE_Name']\n",
"gdsc_to_ccle = pd.concat([gdsc_to_ccle, depmap_maps])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate unique samples"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"ccle_unique = ccle_model_info['SAMPLE_ID'].replace(fibroblast_maps).drop_duplicates().sort_values()\n",
"gdsc_unique = gdsc_model_info['model_id'].drop_duplicates().sort_values()\n",
"broad_unique = depmap['DepMap_ID'].drop_duplicates().sort_values()\n",
"other_broad = ccle_model_info['DEPMAPID'].dropna()[~ccle_model_info['DEPMAPID'].dropna().isin(broad_unique)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Concat by sample name type"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"all_broad = pd.concat([\n",
" depmap['DepMap_ID'],\n",
" ccle_model_info['DEPMAPID'],\n",
" gdsc_model_info['BROAD_ID']\n",
"]).dropna().drop_duplicates().sort_values().reset_index(drop=True)\n",
"\n",
"all_broad = all_broad[all_broad.str.len().eq(10)].reset_index(drop=True)\n",
"all_broad = pd.DataFrame('', index=all_broad, columns=['ccle', 'sanger'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"all_ccle = pd.concat([\n",
" ccle_model_info['SAMPLE_ID'].replace(fibroblast_maps),\n",
" gdsc_model_info['CCLE_ID'].replace(fibroblast_maps),\n",
" depmap['CCLE_Name'].replace(fibroblast_maps)\n",
"]).dropna().drop_duplicates().sort_values().reset_index(drop=True)\n",
"\n",
"all_ccle = pd.DataFrame('', index=all_ccle, columns=['sanger', 'broad'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"all_sanger = pd.concat([\n",
" gdsc_model_info['model_id'],\n",
" depmap['Sanger_Model_ID'],\n",
"]).dropna().drop_duplicates().sort_values().reset_index(drop=True)\n",
"\n",
"all_sanger = pd.DataFrame('', index=all_sanger, columns=['ccle', 'broad'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"depmap_formatted = (depmap\n",
" .loc[:, ['DepMap_ID', 'CCLE_Name', 'Sanger_Model_ID']]\n",
" .rename(columns={'CCLE_Name': 'ccle_name', 'Sanger_Model_ID': 'sanger', 'DepMap_ID': 'broad'})\n",
")\n",
"depmap_formatted['ccle_name'] = depmap_formatted['ccle_name'].replace(fibroblast_maps)\n",
"\n",
"ccle_formatted = (ccle_model_info\n",
" .loc[:, ['SAMPLE_ID', 'DEPMAPID']]\n",
" .rename(columns={'SAMPLE_ID': 'ccle_name', 'DEPMAPID': 'broad'})\n",
" )\n",
"ccle_formatted['ccle_name'] = ccle_formatted['ccle_name'].replace(fibroblast_maps)\n",
"\n",
"sanger_formatted = (\n",
" gdsc_model_info\n",
" .loc[:, ['model_id', 'CCLE_ID', 'BROAD_ID']]\n",
" .rename(columns={'model_id': 'sanger', 'CCLE_ID': 'ccle_name', 'BROAD_ID': 'broad'})\n",
")\n",
"sanger_formatted['ccle_name'] = sanger_formatted['ccle_name'].replace(fibroblast_maps)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"depmap_sanger = pd.concat([\n",
" depmap_formatted,\n",
" sanger_formatted,\n",
"])\n",
"\n",
"missing_ccle = ccle_formatted[~ccle_formatted['ccle_name'].isin(depmap_sanger['ccle_name'])]\n",
"depmap_sanger = pd.concat([\n",
" depmap_sanger,\n",
" missing_ccle\n",
"])\n",
"\n",
"depmap_sanger = (depmap_sanger\n",
" .sort_values(['broad', 'ccle_name', 'sanger'])\n",
" .drop_duplicates(['broad', 'ccle_name'], keep='first')\n",
")\n",
"\n",
"depmap_sanger.to_csv('formatted/cell-line-names.raw.txt', sep='\\t', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"checked = pd.read_csv('formatted/cell-line-names.formatted.txt', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 1927\n",
"False 1\n",
"Name: index, dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"idx_ccle = (all_ccle.reset_index()['index'].isin(checked['ccle_name']) | all_ccle.reset_index()['index'].isin(checked['alt_ccle']))\n",
"idx_ccle.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1673 SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE;SR786...\n",
"Name: index, dtype: object"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_ccle.reset_index()[~idx_ccle]['index']"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 1823\n",
"Name: index, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"idx_broad = all_broad.reset_index()['index'].isin(checked['broad']) | all_broad.reset_index()['index'].isin(checked['alt_broad'])\n",
"idx_broad.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Series([], Name: broad, dtype: int64)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"checked['broad'].value_counts()[checked['broad'].value_counts().gt(1)]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 1580\n",
"Name: sanger, dtype: int64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"checked['sanger'].value_counts().value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "moalmanac",
"language": "python",
"name": "moalmanac"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit 683dad1

Please sign in to comment.