diff --git a/datamol/data/__init__.py b/datamol/data/__init__.py index a13718e3..b2e40418 100644 --- a/datamol/data/__init__.py +++ b/datamol/data/__init__.py @@ -194,13 +194,14 @@ def chembl_drugs(as_df: Literal[False] = False) -> List[Mol]: def chembl_drugs(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: - """A list of ~2k molecules from ChEMBL (all drugs). + """A list of ~2.5k molecules from ChEMBL (all approved drugs) in SMILES format. + Includes metadata indicating year of first approval, molecule chembl id, molecule type and pref_name. - Originally, proposed by Patrick Walters at . + List was generated with ['Get_ChEMBL_Approved_Drugs.ipynb'](https://github.com/datamol-io/datamol/blob/main/notebooks/Get_ChEMBL_Approved_Drugs.ipynb) on 2023-10-18. + The notebook works with the chembl_webresource_client api to collect chembl IDs and metadata, then focuses on small molecules with valid SMILES and first approval date. """ - - with open_datamol_data_file("chembl_drugs.csv") as f: - data = pd.read_csv(f) + with open_datamol_data_file("chembl_approved_drugs.parquet", open_binary=True) as f: + data = pd.read_parquet(f) if not as_df: data = from_df(data) diff --git a/datamol/data/chembl_approved_drugs.parquet b/datamol/data/chembl_approved_drugs.parquet new file mode 100644 index 00000000..a21c4f36 Binary files /dev/null and b/datamol/data/chembl_approved_drugs.parquet differ diff --git a/notebooks/Get_ChEMBL_Approved_Drugs.ipynb b/notebooks/Get_ChEMBL_Approved_Drugs.ipynb new file mode 100644 index 00000000..103ba151 --- /dev/null +++ b/notebooks/Get_ChEMBL_Approved_Drugs.ipynb @@ -0,0 +1,424 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "28c39f67", + "metadata": {}, + "source": [ + "Retrieve all the approved drugs from ChEMBL as well as the date of approval and the SMILES.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2dd3e4ab-5921-4422-b6c6-adf2b6801254", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import 
pandas as pd\n", + "import datamol as dm\n", + "\n", + "from chembl_webresource_client.new_client import new_client as client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cc2c16ee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4192" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First, we retrieve the ChEMBL IDs for all the approved drugs (max_phase=4)\n", + "mol_ids = client.molecule.filter(max_phase=4).only([\"molecule_chembl_id\"])\n", + "mol_ids = pd.DataFrame(mol_ids)\n", + "\n", + "len(mol_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2744b624", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/shawnwhitfield/miniconda3/envs/datamol/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "100%|██████████| 4192/4192 [00:08<00:00, 498.63it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_approvalmolecule_chembl_idmolecule_typepref_namesmiles
01976.0CHEMBL2Small moleculePRAZOSINCOc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC
11984.0CHEMBL3Small moleculeNICOTINECN1CCC[C@H]1c1cccnc1
21990.0CHEMBL4Small moleculeOFLOXACINCC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23
31964.0CHEMBL5Small moleculeNALIDIXIC ACIDCCn1cc(C(=O)O)c(=O)c2ccc(C)nc21
41965.0CHEMBL6Small moleculeINDOMETHACINCOc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1
\n", + "
" + ], + "text/plain": [ + " first_approval molecule_chembl_id molecule_type pref_name \\\n", + "0 1976.0 CHEMBL2 Small molecule PRAZOSIN \n", + "1 1984.0 CHEMBL3 Small molecule NICOTINE \n", + "2 1990.0 CHEMBL4 Small molecule OFLOXACIN \n", + "3 1964.0 CHEMBL5 Small molecule NALIDIXIC ACID \n", + "4 1965.0 CHEMBL6 Small molecule INDOMETHACIN \n", + "\n", + " smiles \n", + "0 COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC \n", + "1 CN1CCC[C@H]1c1cccnc1 \n", + "2 CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 \n", + "3 CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21 \n", + "4 COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now retrieve other columns of interest\n", + "\n", + "columns = [\n", + " \"molecule_chembl_id\",\n", + " \"pref_name\",\n", + " \"first_approval\",\n", + " \"molecule_structures\",\n", + " \"molecule_type\",\n", + "]\n", + "\n", + "\n", + "def _get_mol(molecule_chembl_id):\n", + " mols = client.molecule.filter(molecule_chembl_id=molecule_chembl_id).only(columns)\n", + " assert len(mols) == 1\n", + " mol = mols[0]\n", + "\n", + " if mol.get(\"molecule_structures\") is not None and \"canonical_smiles\" in mol.get(\n", + " \"molecule_structures\", []\n", + " ):\n", + " mol[\"smiles\"] = mol[\"molecule_structures\"][\"canonical_smiles\"]\n", + "\n", + " if \"molecule_structures\" in mol:\n", + " del mol[\"molecule_structures\"]\n", + "\n", + " return pd.Series(mol)\n", + "\n", + "\n", + "mols = dm.parallelized(\n", + " _get_mol,\n", + " mol_ids[\"molecule_chembl_id\"],\n", + " n_jobs=256,\n", + " scheduler=\"threads\",\n", + " progress=True,\n", + ")\n", + "mols = pd.DataFrame(mols)\n", + "\n", + "mols.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "240561e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_approvalmolecule_chembl_idmolecule_typepref_namesmiles
01976.0CHEMBL2Small moleculePRAZOSINCOc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC
11984.0CHEMBL3Small moleculeNICOTINECN1CCC[C@H]1c1cccnc1
21990.0CHEMBL4Small moleculeOFLOXACINCC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23
31964.0CHEMBL5Small moleculeNALIDIXIC ACIDCCn1cc(C(=O)O)c(=O)c2ccc(C)nc21
41965.0CHEMBL6Small moleculeINDOMETHACINCOc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1
..................
26232015.0CHEMBL5095048Small moleculeAMPHETAMINE ASPARTATE/DEXTROAMPHETAMINE SULFATECC(N)Cc1ccccc1.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc1...
26242022.0CHEMBL5095049Small moleculePACRITINIB CITRATEC1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2cccc...
26252021.0CHEMBL5095050Small moleculeFINGOLIMOD LAURYL SULFATECCCCCCCCCCCCOS(=O)(=O)O.CCCCCCCCc1ccc(CCC(N)(C...
26262022.0CHEMBL5095051Small moleculeVENLAFAXINE BESYLATECOc1ccc(C(CN(C)C)C2(O)CCCCC2)cc1.O=S(=O)(O)c1c...
26272015.0CHEMBL5095505Small moleculeAMPHETAMINE/DEXTROAMPHETAMINECC(N)Cc1ccccc1.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc1...
\n", + "

2628 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " first_approval molecule_chembl_id molecule_type \\\n", + "0 1976.0 CHEMBL2 Small molecule \n", + "1 1984.0 CHEMBL3 Small molecule \n", + "2 1990.0 CHEMBL4 Small molecule \n", + "3 1964.0 CHEMBL5 Small molecule \n", + "4 1965.0 CHEMBL6 Small molecule \n", + "... ... ... ... \n", + "2623 2015.0 CHEMBL5095048 Small molecule \n", + "2624 2022.0 CHEMBL5095049 Small molecule \n", + "2625 2021.0 CHEMBL5095050 Small molecule \n", + "2626 2022.0 CHEMBL5095051 Small molecule \n", + "2627 2015.0 CHEMBL5095505 Small molecule \n", + "\n", + " pref_name \\\n", + "0 PRAZOSIN \n", + "1 NICOTINE \n", + "2 OFLOXACIN \n", + "3 NALIDIXIC ACID \n", + "4 INDOMETHACIN \n", + "... ... \n", + "2623 AMPHETAMINE ASPARTATE/DEXTROAMPHETAMINE SULFATE \n", + "2624 PACRITINIB CITRATE \n", + "2625 FINGOLIMOD LAURYL SULFATE \n", + "2626 VENLAFAXINE BESYLATE \n", + "2627 AMPHETAMINE/DEXTROAMPHETAMINE \n", + "\n", + " smiles \n", + "0 COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC \n", + "1 CN1CCC[C@H]1c1cccnc1 \n", + "2 CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 \n", + "3 CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21 \n", + "4 COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1 \n", + "... ... \n", + "2623 CC(N)Cc1ccccc1.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc1... \n", + "2624 C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2cccc... \n", + "2625 CCCCCCCCCCCCOS(=O)(=O)O.CCCCCCCCc1ccc(CCC(N)(C... \n", + "2626 COc1ccc(C(CN(C)C)C2(O)CCCCC2)cc1.O=S(=O)(O)c1c... \n", + "2627 CC(N)Cc1ccccc1.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc1... 
\n", + "\n", + "[2628 rows x 5 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Let's focus on small molecules with a valid SMILES and a first approval date\n", + "mols = mols.query(\"molecule_type == 'Small molecule' & smiles.notna() & first_approval.notna()\")\n", + "mols = mols.reset_index(drop=True)\n", + "\n", + "mols" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9e2a65f0", + "metadata": {}, + "outputs": [], + "source": [ + "# Save as Parquet\n", + "mols.to_parquet(\"../datamol/data/chembl_approved_drugs.parquet\", index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "medchem", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/test_data.py b/tests/test_data.py index 55fdeaf7..b4be7cca 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -42,8 +42,14 @@ def test_solubility(): def test_chembl_drugs(): data = dm.data.chembl_drugs() - assert data.shape == (1935, 1) - assert list(data.columns) == ["smiles"] + assert data.shape == (2628, 5) + assert list(data.columns) == [ + "first_approval", + "molecule_chembl_id", + "molecule_type", + "pref_name", + "smiles", + ] def test_chembl_samples():