Skip to content

Commit

Permalink
Merge pull request #39 from datamol-io/fix/stereo-rdkit
Browse files Browse the repository at this point in the history
Fix Stereo issues on some bond slicing
  • Loading branch information
maclandrol authored May 6, 2024
2 parents d6f4271 + 182ffc7 commit e70e5ca
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 14 deletions.
50 changes: 36 additions & 14 deletions safe/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
from rdkit import Chem
from rdkit.Chem import BRICS
from loguru import logger

from ._exception import SAFEDecodeError, SAFEEncodeError, SAFEFragmentationError
from .utils import standardize_attach
Expand All @@ -34,30 +35,33 @@ class SAFEConverter:
"""

SUPPORTED_SLICERS = ["hr", "recap", "mmpa", "attach", "brics"]
SUPPORTED_SLICERS = ["hr", "rotatable", "recap", "mmpa", "attach", "brics"]
__SLICE_SMARTS = {
"hr": ["[*]!@-[*]"], # any non ring single bond
"recap": [
"[C;$(C=O)]!@-N", # amides and urea
"[C;$(C=O)]!@-O", # esters
"C!@-[N;!$(NC=O)]", # amines
"C!@-[O;!$(NC=O)]", # ether
"[CX3]!@=[CX3]", # olefin
"[N+X4]!@-C", # quaternary nitrogen
"n!@-C", # aromatic N - aliphatic C
"[$([NR][CR]=O)]!@-C", # lactam nitrogen - aliphatic carbon
"c!@-c", # aromatic C - aromatic C
"N!@-[$(S(=O)=O)]", # sulphonamides
"[$([C;!$(C([#7])[#7])](=!@[O]))]!@[$([#7;+0;!D1])]",
"[$(C=!@O)]!@[$([O;+0])]",
"[$([N;!D1;+0;!$(N-C=[#7,#8,#15,#16])](-!@[*]))]-!@[$([*])]",
"[$(C(=!@O)([#7;+0;D2,D3])!@[#7;+0;D2,D3])]!@[$([#7;+0;D2,D3])]",
"[$([O;+0](-!@[#6!$(C=O)])-!@[#6!$(C=O)])]-!@[$([#6!$(C=O)])]",
"C=!@C",
"[N;+1;D4]!@[#6]",
"[$([n;+0])]-!@C",
"[$([O]=[C]-@[N;+0])]-!@[$([C])]",
"c-!@c",
"[$([#7;+0;D2,D3])]-!@[$([S](=[O])=[O])]",
],
"mmpa": ["[#6+0;!$(*=,#[!#6])]!@!=!#[*]"], # classical mmpa slicing smarts
"attach": ["[*]!@[*]"], # any potential attachment point, including hydrogens when explicit
"rotatable": ["[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]"],
}

def __init__(
self,
slicer: Optional[Union[str, List[str], Callable]] = "brics",
require_hs: Optional[bool] = None,
use_original_opener_for_attach: bool = True,
ignore_stereo: bool = False,
):
"""Constructor for the SAFE converter
Expand All @@ -69,6 +73,7 @@ def __init__(
`attach` slicer requires adding hydrogens.
use_original_opener_for_attach: whether to use the original branch opener digit when adding back
mapping number to attachment points, or use simple enumeration.
ignore_stereo: whether to remove stereochemistry from the molecule before encoding. RDKit does not support some particular SAFE subsets when stereochemistry is defined.
"""
self.slicer = slicer
Expand All @@ -78,8 +83,11 @@ def __init__(
self.slicer = [self.slicer]
if isinstance(self.slicer, (list, tuple)):
self.slicer = [dm.from_smarts(x) for x in self.slicer]
if any(x is None for x in self.slicer):
raise ValueError(f"Slicer: {slicer} cannot be valid")
self.require_hs = require_hs or (slicer == "attach")
self.use_original_opener_for_attach = use_original_opener_for_attach
self.ignore_stereo = ignore_stereo

@staticmethod
def randomize(mol: dm.Mol, rng: Optional[int] = None):
Expand Down Expand Up @@ -258,6 +266,10 @@ def encoder(
branch_numbers = self._find_branch_number(inp)

mol = dm.to_mol(inp, remove_hs=False)
potential_stereos = Chem.FindPotentialStereo(mol)
has_stereo_bonds = any(x.type == Chem.StereoType.Bond_Double for x in potential_stereos)
if self.ignore_stereo:
mol = dm.remove_stereochemistry(mol)

bond_map_id = 1
for atom in mol.GetAtoms():
Expand Down Expand Up @@ -344,9 +356,13 @@ def encoder(
scaffold_str = wrong_attach.sub(r"\g<1>", scaffold_str)
# furthermore, we autoapply rdkit-compatible digit standardization.
if rdkit_safe:
pattern = r"\(([=-@#]?)(%?\d{1,2})\)"
pattern = r"\(([=-@#\/\\]{0,2})(%?\d{1,2})\)"
replacement = r"\g<1>\g<2>"
scaffold_str = re.sub(pattern, replacement, scaffold_str)
if not self.ignore_stereo and has_stereo_bonds and not dm.same_mol(scaffold_str, inp):
logger.warning(
"Ignoring stereo is disabled, but molecule has stereochemistry interferring with SAFE representation"
)
return scaffold_str


Expand All @@ -358,6 +374,7 @@ def encode(
slicer: Optional[Union[List[str], str, Callable]] = None,
require_hs: Optional[bool] = None,
constraints: Optional[List[dm.Mol]] = None,
ignore_stereo: Optional[bool] = False,
):
"""
Convert input smiles to SAFE representation
Expand All @@ -370,14 +387,19 @@ def encode(
slicer: slicer algorithm to use for encoding. Defaults to "brics".
require_hs: whether the slicing algorithm requires the molecule to have hydrogens explicitly added.
constraints: List of molecules or pattern to preserve during the SAFE construction.
ignore_stereo: whether to remove stereochemistry from the molecule before encoding. RDKit does not support some particular SAFE subsets when stereochemistry is defined.
"""
if slicer is None:
slicer = "brics"
with dm.without_rdkit_log():
safe_obj = SAFEConverter(slicer=slicer, require_hs=require_hs)
safe_obj = SAFEConverter(slicer=slicer, require_hs=require_hs, ignore_stereo=ignore_stereo)
try:
encoded = safe_obj.encoder(
inp, canonical=canonical, randomize=randomize, constraints=constraints, seed=seed
inp,
canonical=canonical,
randomize=randomize,
constraints=constraints,
seed=seed,
)
except SAFEFragmentationError as e:
raise e
Expand Down
25 changes: 25 additions & 0 deletions tests/test_safe.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,28 @@ def test_fused_ring_issue():
for fused_ring in FUSED_RING_LIST:
output_string = safe.decode(safe.encode(fused_ring))
assert dm.same_mol(fused_ring, output_string)


def test_stereochemistry_issue():
    """Check SAFE encoding of SMILES that carry double-bond stereo markers.

    Three scenarios are asserted, in order:
      1. With the "rotatable" slicer and ``ignore_stereo=False``, every input
         is the same molecule as its encoding.
      2. With the "brics" slicer and ``ignore_stereo=False``, the first input
         no longer matches its encoding, but both match once stereochemistry
         is removed from each.
      3. With the "brics" slicer and ``ignore_stereo=True``, the encoding
         matches the stereo-stripped first input.
    """
    stereo_smiles = [
        "CC(=C\\c1ccccc1)/N=C/C(=O)O",
        "CC(=C/c1ccccc1)/N=C/C(=O)O",
        "CC(=C\\c1ccccc1)/N=C\\C(=O)O",
        "CC(=C/c1ccccc1)/N=C\\C(=O)O",
        "CC(=Cc1ccccc1)N=CC(=O)O",
        "Cc1ccc(-n2c(C)cc(/C=N/Nc3ccc([N+](=O)[O-])cn3)c2C)c(C)c1",
        "Cc1ccc(-n2c(C)cc(/C=N\\Nc3ccc([N+](=O)[O-])cn3)c2C)c(C)c1",
    ]

    # Scenario 1: the "rotatable" slicer should preserve each molecule as-is.
    for smiles in stereo_smiles:
        encoded = safe.encode(smiles, ignore_stereo=False, slicer="rotatable")
        assert dm.same_mol(smiles, encoded)

    reference = stereo_smiles[0]

    # Scenario 2: failure case, where the split happens on a double bond, so
    # the encoded molecule differs from the input until stereo is dropped.
    brics_encoded = safe.encode(reference, ignore_stereo=False, slicer="brics")
    assert dm.same_mol(reference, brics_encoded) is False
    flat_encoded = dm.remove_stereochemistry(dm.to_mol(brics_encoded))
    flat_reference = dm.remove_stereochemistry(dm.to_mol(reference))
    assert dm.same_mol(flat_encoded, flat_reference)

    # Scenario 3: explicitly ignoring stereo yields the stereo-free molecule.
    stereo_free_encoded = safe.encode(reference, ignore_stereo=True, slicer="brics")
    assert dm.same_mol(dm.remove_stereochemistry(dm.to_mol(reference)), stereo_free_encoded)

0 comments on commit e70e5ca

Please sign in to comment.