Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Route comparison and scoring updates #22

Merged
merged 4 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
680 changes: 361 additions & 319 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "reaction_utils"
version = "1.6.0"
version = "1.7.0"
description = "Utilities for working with reactions, reaction templates and template extraction"
authors = ["Genheden, Samuel <[email protected]>", "Kannas, Christos <[email protected]>"]
license = "Apache-2.0"
Expand Down Expand Up @@ -28,6 +28,8 @@ numpy = "^1.0.0"
rdkit = "^2023.9.1"
cgrtools = "^4.1.35"
scipy = "^1.11.4"
pydantic = "^2.8.2"
apted = "^1.0.3"

[tool.poetry.dev-dependencies]
pytest = "^6.2.2"
Expand Down
22 changes: 22 additions & 0 deletions rxnutils/chem/augmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
""" Routines for augmenting chemical reactions
"""

_SINGLE_REACTANT_REAGENTS = {"10.1.1": "Br", "10.1.2": "Cl"}


def single_reactant_augmentation(smiles: str, classification: str) -> str:
    """
    Augment a single-reactant reaction with an additional reagent,
    selected from the classification of the reaction.

    Reactions that already contain more than one reactant, or whose
    classification has no associated reagent, are returned unchanged.

    :param smiles: the reaction SMILES to augment
    :param classification: the classification of the reaction or an empty string
    :return: the processed SMILES
    """
    reactants_part = smiles.partition(">")[0]
    if "." in reactants_part:
        # More than one reactant is already present - nothing to add
        return smiles

    # Only the leading code (before any textual label) selects the reagent
    class_code = classification.split(" ")[0]
    reagent = _SINGLE_REACTANT_REAGENTS.get(class_code)
    if not reagent:
        return smiles
    return f"{reagent}.{smiles}"
24 changes: 24 additions & 0 deletions rxnutils/chem/disconnection_sites/atom_map_tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,30 @@ def get_atom_list(reactants_smiles: str, product_smiles: str) -> List[int]:
return atom_list


def atom_map_tag_reactants(mapped_rxn: str) -> str:
    """
    Given atom-mapped reaction, returns disconnection site-tagged reactants where atoms
    with changed atom environment are represented by [<atom>:1].

    :param mapped_rxn: Atom-mapped reaction SMILES
    :return: SMILES of the reactants containing tags corresponding to atoms changed in the
        reaction.
    """
    reactants_smiles, _, product_smiles = mapped_rxn.split(">")

    mol = Chem.MolFromSmiles(reactants_smiles)
    changed_atoms = get_atom_list(reactants_smiles, product_smiles)

    # Tag atoms whose environment changed with map number 1; clear all others
    for atom in mol.GetAtoms():
        is_changed = atom.GetAtomMapNum() in changed_atoms
        atom.SetAtomMapNum(1 if is_changed else 0)

    return Chem.MolToSmiles(mol)


def atom_map_tag_products(mapped_rxn: str) -> str:
"""
Given atom-mapped reaction, returns disconnection site-tagged product where atoms
Expand Down
12 changes: 8 additions & 4 deletions rxnutils/chem/disconnection_sites/tag_converting.py
Original file line number Diff line number Diff line change
def smiles_tokens(smiles: str) -> List[str]:
    """
    Tokenize a SMILES string into its constituent tokens
    (bracket atoms, organic-subset atoms, bonds, branches, ring closures).

    :param smiles: SMILES to tokenize
    :return: List of tokens identified in SMILES.
    :raises AssertionError: if the concatenated tokens do not reproduce the
        input SMILES, i.e. the SMILES contains characters not covered by the
        token pattern.
    """
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = regex.findall(smiles)

    tokenized_smiles = "".join(tokens)
    if smiles != tokenized_smiles:
        # Bug fix: the continuation strings were missing the f-prefix, so
        # "{smiles}" and "{tokens}" appeared literally in the error message
        raise AssertionError(
            f"tokenized SMILES not the same as input SMILES: {tokenized_smiles}, "
            f"{smiles}, tokens: {tokens}"
        )
    return tokens


Expand Down Expand Up @@ -68,8 +74,6 @@ def tagged_smiles_from_tokens(
reaction using "<atom>!", and SMILES of the (reconstructed) untagged product
"""

print(product_tagged_tokens)

product_converted = ""
product_untagged = ""

Expand Down
3 changes: 1 addition & 2 deletions rxnutils/chem/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Module containing various chemical utility routines"""

import logging
import functools
from typing import List, Tuple
Expand Down Expand Up @@ -292,7 +291,7 @@ def get_special_groups(mol) -> List[Tuple[Tuple[int, ...], Tuple[int, ...]]]:

# Build list
groups = []
for add_if_match, template in group_templates:
for (add_if_match, template) in group_templates:
matches = mol.GetSubstructMatches(
Chem.MolFromSmarts(template), useChirality=True
)
Expand Down
18 changes: 17 additions & 1 deletion rxnutils/data/uspto/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
* preserve the ReactionSmiles and Year columns
* create an ID from PatentNumber and ParagraphNum and row index in the original file
"""

import argparse
from pathlib import Path
from typing import Optional, Sequence

import pandas as pd

from rxnutils.data.uspto.uspto_yield import UsptoYieldCuration

DEFAULT_FILENAMES = [
"1976_Sep2016_USPTOgrants_smiles.rsmi",
"2001_Sep2016_USPTOapplications_smiles.rsmi",
Expand All @@ -29,6 +32,12 @@ def main(args: Optional[Sequence[str]] = None) -> None:
"--output", default="uspto_data.csv", help="the output filename"
)
parser.add_argument("--folder", default=".", help="folder with downloaded files")
parser.add_argument(
"--with_yields",
action="store_true",
default=False,
help="if to add yield columns",
)
args = parser.parse_args(args)

filenames = [Path(args.folder) / filename for filename in args.filenames]
Expand All @@ -42,11 +51,18 @@ def main(args: Optional[Sequence[str]] = None) -> None:
para_num = data["ParagraphNum"].fillna("")
row_num = data.index.astype(str)
data["ID"] = data["PatentNumber"] + ";" + para_num + ";" + row_num
data2 = data[["ID", "Year", "ReactionSmiles"]]
columns = ["ID", "Year", "ReactionSmiles"]
if args.with_yields:
columns += ["TextMinedYield", "CalculatedYield"]
data2 = data[columns]

print(f"Total number of unique IDs: {len(set(data2['ID']))}")
print(f"Total number of records: {len(data2)}")

if args.with_yields:
print("Curating yields...")
data2 = UsptoYieldCuration()(data2)

data2.to_csv(Path(args.folder) / args.output, sep="\t", index=False)


Expand Down
1 change: 1 addition & 0 deletions rxnutils/data/uspto/preparation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Module containing pipeline for downloading, transforming and cleaning USPTO data
This needs to be run in an environment with rxnutils installed
"""

from pathlib import Path

from metaflow import step
Expand Down
51 changes: 51 additions & 0 deletions rxnutils/data/uspto/uspto_yield.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Code for curating USPTO yields.

Inspiration from this code: https://github.com/DocMinus/Yield_curation_USPTO

This could potentially be an action, but since it only makes sense to use it
with USPTO data, it resides here for now.
"""

from dataclasses import dataclass

import pandas as pd
import numpy as np


@dataclass
class UsptoYieldCuration:
    """
    Action for curating USPTO yield columns.

    Combines the text-mined and calculated yield columns into a single
    curated column: values that are non-numeric or outside [0, 100] are
    discarded, and when both sources provide a value the larger one is kept.
    """

    text_yield_column: str = "TextMinedYield"
    calc_yield_column: str = "CalculatedYield"
    out_column: str = "CuratedYield"

    # Plain class attribute (not a dataclass field) used by __str__.
    # Bug fix: __str__ referenced self.pretty_name, which was never defined
    # and raised AttributeError.
    pretty_name = "uspto_yield"

    def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Add the curated-yield column to the dataframe.

        :param data: dataframe with the text-mined and calculated yield columns
        :return: a copy of the dataframe with the curated yield column added
        """
        # Calculated yields: strip the percent sign, coerce to numbers and
        # discard out-of-range values
        calc_yield = data[self.calc_yield_column].str.rstrip("%")
        calc_yield = pd.to_numeric(calc_yield, errors="coerce")
        calc_yield[(calc_yield < 0) | (calc_yield > 100)] = np.nan

        # Text-mined yields: remove qualifiers such as "~", ">=", ">", "<"
        # and range prefixes like "50 to ", then validate the range
        text_yield = data[self.text_yield_column].str.lstrip("~")
        text_yield = text_yield.str.rstrip("%")
        text_yield = text_yield.str.replace(">=", "", regex=False)
        text_yield = text_yield.str.replace(">", "", regex=False)
        text_yield = text_yield.str.replace("<", "", regex=False)
        text_yield = text_yield.str.replace(r"\d{1,2}\sto\s", "", regex=True)
        text_yield = pd.to_numeric(text_yield, errors="coerce")
        text_yield[(text_yield < 0) | (text_yield > 100)] = np.nan

        curated_yield = text_yield.copy()

        # When both sources give a value, keep the larger one
        sel = (~calc_yield.isna()) & (~text_yield.isna())
        curated_yield[sel] = np.maximum(calc_yield[sel], text_yield[sel])

        # Fall back on the calculated yield when no text-mined value exists
        sel = (~calc_yield.isna()) & (text_yield.isna())
        curated_yield[sel] = calc_yield[sel]

        return data.assign(**{self.out_column: curated_yield})

    def __str__(self) -> str:
        return f"{self.pretty_name} (create one column with curated yield values)"
Loading
Loading