Skip to content

Commit

Permalink
Merge pull request #201 from datamol-io/drop_solvent_salt
Browse files Browse the repository at this point in the history
Add two new functions to remove salts and solvents from a molecule
  • Loading branch information
zhu0619 authored Jun 22, 2023
2 parents 639e684 + eb51b5b commit 6ec1615
Show file tree
Hide file tree
Showing 4 changed files with 348 additions and 1 deletion.
2 changes: 2 additions & 0 deletions datamol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
"clear_atom_map_number": "datamol.mol",
"set_atom_positions": "datamol.mol",
"get_atom_positions": "datamol.mol",
"remove_salts_solvents": "datamol.mol",
# cluster
"cluster_mols": "datamol.cluster",
"pick_diverse": "datamol.cluster",
Expand Down Expand Up @@ -276,6 +277,7 @@ def __dir__():
from .mol import clear_atom_map_number
from .mol import set_atom_positions
from .mol import get_atom_positions
from .mol import remove_salts_solvents

from .cluster import cluster_mols
from .cluster import pick_diverse
Expand Down
249 changes: 249 additions & 0 deletions datamol/data/salts_solvents.smi
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
//////////////////////////////////// PART 1 /////////////////////////////////////////////
// Salts data from Rdkit
// https://github.com/rdkit/rdkit/blob/master/Data/Salts.txt

// $Id: Salts.txt 198 2006-12-15 18:06:48Z landrgr1 $
// Created by Greg Landrum, December 2006
// Definitions from Thomas Zoller
//
// Version history:
// 15 Dec, 2006: created (GL)

// Notes:
// 1) don't include charges
// 2) The search for salts is a substructure search where the substructure
// must match the entire fragment, so we don't need to be choosy about bond
// types
// 3) The matching is done in order, so if you put the more complex stuff at the
// bottom the "don't remove the last fragment" algorithm has a chance of
// returning something sensible

// start with simple inorganics:
[Cl,Br,I]
[Li,Na,K,Ca,Mg]
[O,N]

// "complex" inorganics
[N](=O)(O)O
[P](=O)(O)(O)O
[P](F)(F)(F)(F)(F)F
[S](=O)(=O)(O)O
[CH3][S](=O)(=O)(O)
c1cc([CH3])ccc1[S](=O)(=O)(O) p-Toluene sulfonate

// organics
[CH3]C(=O)O Acetic acid
FC(F)(F)C(=O)O TFA
OC(=O)C=CC(=O)O Fumarate/Maleate
OC(=O)C(=O)O Oxalate
OC(=O)C(O)C(O)C(=O)O Tartrate
C1CCCCC1[NH]C1CCCCC1 Dicyclohexylammonium

// Copyright (c) 2010, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////// PART 2 /////////////////////////////////////////////
// Salt data from Chembl structure pipeline
// Version: 2022.09
// https://github.com/chembl/ChEMBL_Structure_Pipeline/tree/master/chembl_structure_pipeline/data
F[B-](F)(F)F Tetrafluoroboranuide
NC(CCCNC(=N)N)C(=O)O Arginine
CN(C)CCO Deanol
CCN(CC)CCO 2-(Diethylamino)ethanol
NCCO Ethanolamine
CNCC(O)C(O)C(O)C(O)CO DiMeglumine
CC(=O)O Acetate
CC(=O)NCC(=O)O Aceturate
CCCCCCCCCCCCCCCCCC(=O)O Stearate
OC(=O)CCCCC(=O)O Adipate
[Al] Aluminium
N Ammonium
OCC(O)C1OC(=O)C(=C1O)O Ascorbate
NC(CC(=O)O)C(=O)O Aspartate
[Ba] Barium
C(Cc1ccccc1)NCc2ccccc2 Benethamine
C(CNCc1ccccc1)NCc2ccccc2 Benzathine
OC(=O)c1ccccc1 Benzoate
OS(=O)(=O)c1ccccc1 Besylate
[Bi] Bismuth
Br Bromide
CCCC=O Butyraldehyde
CCCC(=O)OCC Ethyl Butanoate
[Ca] Calcium
CC1(C)C2CCC1(CS(=O)(=O)O)C(=O)C2 Camsylate
OC(=O)O Carbonate
Cl Chloride
C[N+](C)(C)CCO Choline
OC(=O)CC(O)(CC(=O)O)C(=O)O Citrate
OS(=O)(=O)c1ccc(Cl)cc1 Closylate
OS(=O)(=O)NC1CCCCC1 Cyclamate
OC(=O)C(Cl)Cl Dichloroacetate
CCNCC Diethylamine
CC(C)(N)CO Dimethylethanolamine
OCCNCCO Diolamine
NCCN Edamine
OS(=O)(=O)CCS(=O)(=O)O Edisylate
OCCN1CCCC1 Epolamine
CC(C)(C)N Erbumine
CCCCCCCCCCCCOS(=O)(=O)O Estolate
CCS(=O)(=O)O Esylate
CCOS(=O)(=O)O Ethylsulfate
F Fluoride
OC=O Formate
OCC(O)C(O)C(O)C(O)C(O)C(=O)O Gluceptate
OCC(O)C(O)C(O)C(O)C(=O)O Gluconate
OC1OC(C(O)C(O)C1O)C(=O)O Glucuronate
NC(CCC(=O)O)C(=O)O Glutamate
OCC(O)CO Glycerate
OCC(O)COP(=O)(O)O Glycerophosphate
F[P](F)(F)(F)(F)F Hexafluorophosphate
OP=O Hypophosphite
I Iodide
OCCS(=O)(=O)O Isethionate
[K] Potassium
CC(O)C(=O)O Lactate
OCC(O)C(OC1OC(CO)C(O)C(O)C1O)C(O)C(O)C(=O)O Lactobionate
[Li] Lithium
NCCCCC(N)C(=O)O Lysine
OC(CC(=O)O)C(=O)O Malate
OC(=O)C=CC(=O)O Maleate and Fumarate
CS(=O)(=O)O Mesylate
OP(=O)=O Metaphosphate
COS(=O)(=O)O Methosulfate
[Mg] Magnesium
OP(=O)(O)F Monofluorophosphate
[Na] Sodium
OS(=O)(=O)c1cccc2c(cccc12)S(=O)(=O)O Napadisilate
OS(=O)(=O)c1ccc2ccccc2c1 Napsylate
O[N](=O)O Nitrate
OC(=O)C(=O)O Oxalate
CCCCCCCCCCCCCCCC(=O)O Palmitate
OC(=O)c1cc2ccccc2c(Cc3c(O)c(cc4ccccc34)C(=O)O)c1O Pamoate
OCl(=O)(=O)=O Perchlorate
Nc1ccc(cc1)P(=O)(O)O Phosphanilate
OP(=O)(O)O Phosphate
Oc1c(cc(cc1[N+](=O)[O-])[N+](=O)[O-])[N+](=O)[O-] Picrate
C1CNCCN1 Piperazine
CC(O)CO Propylene Glycol
O=C1NS(=O)(=O)c2ccccc12 Saccharin
OC(=O)c1ccccc1O Salicylate
[Ag] Silver
[Sr] Strontium
OC(=O)CCC(=O)O Succinate
OS(=O)(=O)O Sulfate
OC(=O)c1cccc(c1O)S(=O)(=O)O Sulfosalicylate
[S-2] Sulphide
OC(=O)c1ccc(cc1)C(=O)O Terephthalate
Cc1ccc(cc1)S(=O)(=O)O Tosylate
Oc1cc(Cl)c(Cl)cc1Cl Triclofenate
CCN(CC)CC Triethylamine
OC(=O)C(c1ccccc1)(c2ccccc2)c3ccccc3 Trifenatate
OC(=O)C(F)(F)F Triflutate
NC(CO)(CO)CO Tromethamine
CCCCC1CCC(CC1)C(=O)O Buciclate
CCCC(=O)O Butyrate
CCCCCC(=O)O Caproate
CC12CCC(CC1)(C=C2)C(=O)O Cyclotate
OC(=O)CCC1CCCC1 Cypionate
CN(C)CCC(=O)O Daproate
OC(=O)CN(CCN(CC(=O)O)CC(=O)O)CC(=O)O EDTA
CCCCCCCCC=CCCCCCCCC(=O)O Elaidate and oleate
CCCCCCC(=O)O Enanthate
CCOC(=O)O Etabonate
COCCO Ethanediol
OC(=O)CNC(=O)c1ccccc1 Etiprate
CCC(CC)C(=O)OCO Etzadroxil
CCCCCCCCCCCCCCOP(=O)(O)O Fostedate
OC(=O)c1occc1 Furoate
OC(=O)c1ccccc1C(=O)c2ccc(O)cc2 Hybenzate
CCCCCCCCCCCC(=O)O Laurate
CC=C(C)C(=O)O Mebutate
COC(=O)CC(O)(CCCC(C)(C)O)C(=O)O Mepesuccinate
OC(=O)c1cccc(c1)S(=O)(=O)O Metazoate
CSCCC(N)C(=O)C Methionil
OC(=O)c1cccnc1 Nicotinate
OO Peroxide
OC(=O)CCc1ccccc1 Phenpropionate
OC(=O)Cc1ccccc1 Phenylacetate
CC(C)(C)C(=O)O Pivalate
CCC(=O)O Propionate
CC(C)(C)CC(=O)O Tebutate
OCCN(CCO)CCO Trolamine
CCCCCCCCCCC(=O)O Undecylate
OC(=O)CCCCCCCCC=C Undecylenate
CCCCC(=O)O Valerate
O Water
OC(=O)c1ccc2ccccc2c1O Xinafoate
[Zn] Zinc
c1c[nH]cn1 Imidazole
OCCN1CCOCC1 4-(2-Hydroxyethyl)morpholine
CC(=O)Nc1ccc(cc1)C(=O)O 4-Acetamidobenzoic acid
CC1(C)C(CCC1(C)C(=O)O)C(=O)O Camphoric acid
CCCCCCCCCC(=O)O Capric acid
CCCCCCCC(=O)O Caprylic acid
OC(=O)C=Cc1ccccc1 Cinnamic acid
OC(C(O)C(O)C(=O)O)C(O)C(=O)O Mucic acid
OC(=O)c1cc(O)ccc1O Gentisic acid
OC(=O)CCCC(=O)O Glutaric acid
OC(=O)CCC(=O)C(=O)O 2-Oxoglutaric acid
OCC(=O)O Glycolic acid
CC(C)C(=O)O Isobutyric acid
OC(C(=O)O)c1ccccc1 Mandelic acid
OC(=O)c1cc(=O)nc(=O)n1 Orotic acid
OC(=O)C1CCC(=O)N1 Pyroglutamic acid
OC(C(O)C(=O)O)C(=O)O Tartrate
SC#N Thiocyanic acid
CI Methyl Iodide
OS(=O)O Sulfurous Acid
C1CCC(CC1)NC2CCCCC2 Dicyclohexylamine
OS(=O)(=O)C(F)(F)F Triflate
Cc1cc(C)c(c(C)c1)S(=O)(=O)O Mesitylene sulfonate
OC(=O)CC(=O)O Malonic acid
OS(=O)(=O)F Fluorosulfuric acid
CC(=O)OS(=O)(=O)O Acetylsulfate
[H] Proton
[Rb] Rubidium
[Cs] Cesium
[Fr] Francium
[Be] Beryllium
[Ra] Radium
C(=O)C(O)C(O)C(O)C(O)C(=O)O Glucuronate open form
CC(O)CN(C)C Dimepranol

// Solvent data from Chembl structure pipeline
// Version: 2019
// https://github.com/chembl/ChEMBL_Structure_Pipeline/tree/master/chembl_structure_pipeline/data
[OH2] WATER
ClCCl DICHLOROMETHANE
ClC(Cl)Cl TRICHLOROMETHANE
CCOC(=O)C ETHYL ACETATE
CO METHANOL
CC(C)O PROPAN-2-OL
CC(=O)C ACETONE
CS(=O)C DMSO
CCO ETHANOL
43 changes: 42 additions & 1 deletion datamol/mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@

from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.MolStandardize import canonicalize_tautomer_smiles
from rdkit.Chem.SaltRemover import SaltRemover

import datamol
from . import _sanifix4
from .types import Mol
from .convert import to_inchikey_non_standard
Expand All @@ -36,14 +38,15 @@
from .log import without_rdkit_log
from ._version import is_lower_than_current_rdkit_version


# RDKit periodic table and bond-type shortcuts shared across this module.
PERIODIC_TABLE = Chem.rdchem.GetPeriodicTable()
TRIPLE_BOND = Chem.rdchem.BondType.TRIPLE
DOUBLE_BOND = Chem.rdchem.BondType.DOUBLE
SINGLE_BOND = Chem.rdchem.BondType.SINGLE
AROMATIC_BOND = Chem.rdchem.BondType.AROMATIC
DATIVE_BOND = Chem.rdchem.BondType.DATIVE
UNSPECIFIED_BOND = Chem.rdchem.BondType.UNSPECIFIED

# Location of the bundled salt/solvent definitions. Only the path (`.name`) of the
# opened data file is needed, so the handle is closed right away instead of being
# left open for the lifetime of the process.
# NOTE(review): assumes `open_datamol_data_file` returns a regular file object
# (it is used here only through `.name`) -- confirm it supports `with`.
with datamol.data.open_datamol_data_file("salts_solvents.smi") as _salts_solvents_file:
    SALT_SOLVENT_PATH = _salts_solvents_file.name

# Default remover, built once at import time from the bundled definitions and
# reused by `remove_salts_solvents` when no custom definitions are given.
SALT_SOLVENT_REMOVER = SaltRemover(defnFilename=SALT_SOLVENT_PATH)


def copy_mol(mol: Mol) -> Mol:
Expand Down Expand Up @@ -1376,3 +1379,41 @@ def get_atom_positions(
positions = positions[mapped_indices, :]

return positions


def remove_salts_solvents(
    mol: Mol,
    defn_data: str = None,
    defn_format: str = "smarts",
    dont_remove_everything: bool = False,
    sanitize: bool = True,
) -> Mol:
    """Strip salt and solvent units from a molecule.

    For most small drug-like molecules the salt/solvent units are smaller than the
    parent compound, and `dm.mol.keep_largest_fragment` is enough. When the compound
    of interest is smaller than its salt/solvent units, define those units explicitly
    and use this function instead. The predefined salts and solvents are listed in
    "datamol/data/salts_solvents.smi"; custom units can be supplied as a string through
    `defn_data` together with `defn_format`.

    The stripping is applied to a copy of `mol`, not to `mol` itself.

    Args:
        mol: A molecule.
        defn_data: A string defining the salt/solvent units, with a newline character
            between units when several are given. When `None`, the predefined
            definitions are used.
        defn_format: "smarts" or "smiles", the notation used in `defn_data`. Only
            used when `defn_data` is provided.
        dont_remove_everything: When `True`, the last salt/solvent unit is kept if the
            molecule is made up exclusively of salt/solvent units.
        sanitize: Whether to sanitize the molecule after the salt/solvent units have
            been removed.

    Returns:
        The result of `SaltRemover.StripMol` applied to the copied molecule.

    See Also:
        <rdkit.Chem.SaltRemover.SaltRemover>
        <datamol.mol.keep_largest_fragment>
    """
    working_copy = copy_mol(mol)

    # Fall back on the module-level remover unless the caller brought definitions.
    use_default_definitions = defn_data is None
    remover = (
        SALT_SOLVENT_REMOVER
        if use_default_definitions
        else SaltRemover(defnData=defn_data, defnFormat=defn_format)
    )

    stripped = remover.StripMol(
        working_copy,
        dontRemoveEverything=dont_remove_everything,
        sanitize=sanitize,
    )
    return stripped
55 changes: 55 additions & 0 deletions tests/test_mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -903,3 +903,58 @@ def test_set_atom_positions_fails():
conf_id=0,
use_atom_map_numbers=True,
)


def test_remove_salt():
    """Check salt stripping with the predefined salt/solvent definitions."""
    # Free halide fragments next to the parent amine are all dropped.
    amine_with_halides = dm.to_mol("CN(C)C.Cl.Cl.Br")
    stripped = dm.remove_salts_solvents(amine_with_halides)
    assert stripped.GetNumAtoms() == amine_with_halides.GetNumAtoms() - 3

    # A molecule made only of salts: one unit survives when asked to keep
    # something, and nothing is left with the default settings.
    salts_only = dm.to_mol("[Cl].[Ca]")
    one_unit_kept = dm.remove_salts_solvents(salts_only, dont_remove_everything=True)
    assert one_unit_kept.GetNumAtoms() == 1
    nothing_left = dm.remove_salts_solvents(salts_only)
    assert nothing_left.GetNumAtoms() == 0

    # Halogens that are bonded into the molecule are not treated as salts.
    halogenated_amine = dm.to_mol("CN(Br)Cl")
    unchanged = dm.remove_salts_solvents(halogenated_amine)
    assert unchanged.GetNumAtoms() == halogenated_amine.GetNumAtoms()


def test_remove_solvent():
    """Check solvent stripping with predefined and user-supplied definitions."""
    # A DMSO fragment (4 heavy atoms) next to the parent amine is dropped.
    amine_in_dmso = dm.to_mol("CN(C)C.CS(=O)C")
    desolvated = dm.remove_salts_solvents(amine_in_dmso)
    assert desolvated.GetNumAtoms() == amine_in_dmso.GetNumAtoms() - 4

    # Solvent-like substructures that are part of the molecule are left alone.
    single_fragment = dm.to_mol("CCOc1ccccc1C(=O)O")
    untouched = dm.remove_salts_solvents(single_fragment)
    assert untouched.GetNumAtoms() == single_fragment.GetNumAtoms()

    # The unit to remove is larger than the compound of interest.
    expected_compound = "CC(CCc1ccc(O)cc1)NCCc1ccc(O)c(O)c1"
    unit_to_remove = "C(C1C(C(C(C(O1)OC(C(CO)O)C(C(C(=O)O)O)O)O)O)O)O"
    mixture = dm.to_mol(
        "CC(CCC1=CC=C(C=C1)O)NCCC2=CC(=C(C=C2)O)O.C(C1C(C(C(C(O1)OC(C(CO)O)C(C(C(=O)O)O)O)O)O)O)O"
    )

    # Keeping the largest fragment picks the wrong unit here.
    biggest = dm.keep_largest_fragment(mixture)
    assert dm.to_smiles(biggest, canonical=True) != expected_compound

    # Naming the unit explicitly recovers the compound of interest.
    recovered = dm.remove_salts_solvents(
        mixture,
        defn_data=unit_to_remove,
        defn_format="smiles",
    )
    assert dm.to_smiles(recovered, canonical=True) == expected_compound

0 comments on commit 6ec1615

Please sign in to comment.