Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Route comparison and scoring updates #22

Merged
merged 4 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
680 changes: 361 additions & 319 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "reaction_utils"
version = "1.6.0"
version = "1.7.0"
description = "Utilities for working with reactions, reaction templates and template extraction"
authors = ["Genheden, Samuel <[email protected]>", "Kannas, Christos <[email protected]>"]
license = "Apache-2.0"
Expand Down Expand Up @@ -28,6 +28,8 @@ numpy = "^1.0.0"
rdkit = "^2023.9.1"
cgrtools = "^4.1.35"
scipy = "^1.11.4"
pydantic = "^2.8.2"
apted = "^1.0.3"

[tool.poetry.dev-dependencies]
pytest = "^6.2.2"
Expand Down
22 changes: 22 additions & 0 deletions rxnutils/chem/augmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
""" Routines for augmenting chemical reactions
"""

_SINGLE_REACTANT_REAGENTS = {"10.1.1": "Br", "10.1.2": "Cl"}


def single_reactant_augmentation(smiles: str, classification: str) -> str:
    """
    Augment a single-reactant reaction with an additional reagent,
    selected from the classification of the reaction.

    Reactions that already contain more than one reactant, or whose
    classification has no associated reagent, are returned unchanged.

    :param smiles: the reaction SMILES to augment
    :param classification: the classification of the reaction or an empty string
    :return: the processed SMILES
    """
    reactants_part = smiles.partition(">")[0]
    if "." in reactants_part:
        # More than one reactant is already present - nothing to add
        return smiles

    # Only the leading code (before any textual label) selects the reagent
    class_code = classification.split(" ")[0]
    reagent = _SINGLE_REACTANT_REAGENTS.get(class_code)
    if not reagent:
        return smiles
    return f"{reagent}.{smiles}"
24 changes: 24 additions & 0 deletions rxnutils/chem/disconnection_sites/atom_map_tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,30 @@ def get_atom_list(reactants_smiles: str, product_smiles: str) -> List[int]:
return atom_list


def atom_map_tag_reactants(mapped_rxn: str) -> str:
    """
    Given atom-mapped reaction, returns disconnection site-tagged reactants where atoms
    with changed atom environment are represented by [<atom>:1].

    :param mapped_rxn: Atom-mapped reaction SMILES
    :return: SMILES of the reactants containing tags corresponding to atoms changed in the
        reaction.
    """
    reactants_smiles, _, product_smiles = mapped_rxn.split(">")

    mol = Chem.MolFromSmiles(reactants_smiles)
    changed_atoms = get_atom_list(reactants_smiles, product_smiles)

    # Tag atoms whose environment changed with map number 1; clear all others
    for atom in mol.GetAtoms():
        is_changed = atom.GetAtomMapNum() in changed_atoms
        atom.SetAtomMapNum(1 if is_changed else 0)

    return Chem.MolToSmiles(mol)


def atom_map_tag_products(mapped_rxn: str) -> str:
"""
Given atom-mapped reaction, returns disconnection site-tagged product where atoms
Expand Down
12 changes: 8 additions & 4 deletions rxnutils/chem/disconnection_sites/tag_converting.py
Original file line number Diff line number Diff line change
def smiles_tokens(smiles: str) -> List[str]:
    """
    Tokenize a SMILES string into its constituent tokens
    (bracket atoms, organic-subset atoms, bonds, branches, ring closures).

    :param smiles: SMILES to tokenize
    :return: List of tokens identified in SMILES.
    :raises AssertionError: if the concatenated tokens do not reproduce the
        input SMILES, i.e. the SMILES contains characters not covered by the
        token pattern.
    """
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = regex.findall(smiles)

    tokenized_smiles = "".join(tokens)
    if smiles != tokenized_smiles:
        # Bug fix: the continuation strings were missing the f-prefix, so
        # "{smiles}" and "{tokens}" appeared literally in the error message
        raise AssertionError(
            f"tokenized SMILES not the same as input SMILES: {tokenized_smiles}, "
            f"{smiles}, tokens: {tokens}"
        )
    return tokens


Expand Down Expand Up @@ -68,8 +74,6 @@ def tagged_smiles_from_tokens(
reaction using "<atom>!", and SMILES of the (reconstructed) untagged product
"""

print(product_tagged_tokens)

product_converted = ""
product_untagged = ""

Expand Down
3 changes: 1 addition & 2 deletions rxnutils/chem/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Module containing various chemical utility routines"""

import logging
import functools
from typing import List, Tuple
Expand Down Expand Up @@ -292,7 +291,7 @@ def get_special_groups(mol) -> List[Tuple[Tuple[int, ...], Tuple[int, ...]]]:

# Build list
groups = []
for add_if_match, template in group_templates:
for (add_if_match, template) in group_templates:
matches = mol.GetSubstructMatches(
Chem.MolFromSmarts(template), useChirality=True
)
Expand Down
18 changes: 17 additions & 1 deletion rxnutils/data/uspto/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
* preserve the ReactionSmiles and Year columns
* create an ID from PatentNumber and ParagraphNum and row index in the original file
"""

import argparse
from pathlib import Path
from typing import Optional, Sequence

import pandas as pd

from rxnutils.data.uspto.uspto_yield import UsptoYieldCuration

DEFAULT_FILENAMES = [
"1976_Sep2016_USPTOgrants_smiles.rsmi",
"2001_Sep2016_USPTOapplications_smiles.rsmi",
Expand All @@ -29,6 +32,12 @@ def main(args: Optional[Sequence[str]] = None) -> None:
"--output", default="uspto_data.csv", help="the output filename"
)
parser.add_argument("--folder", default=".", help="folder with downloaded files")
parser.add_argument(
"--with_yields",
action="store_true",
default=False,
help="if to add yield columns",
)
args = parser.parse_args(args)

filenames = [Path(args.folder) / filename for filename in args.filenames]
Expand All @@ -42,11 +51,18 @@ def main(args: Optional[Sequence[str]] = None) -> None:
para_num = data["ParagraphNum"].fillna("")
row_num = data.index.astype(str)
data["ID"] = data["PatentNumber"] + ";" + para_num + ";" + row_num
data2 = data[["ID", "Year", "ReactionSmiles"]]
columns = ["ID", "Year", "ReactionSmiles"]
if args.with_yields:
columns += ["TextMinedYield", "CalculatedYield"]
data2 = data[columns]

print(f"Total number of unique IDs: {len(set(data2['ID']))}")
print(f"Total number of records: {len(data2)}")

if args.with_yields:
print("Curating yields...")
data2 = UsptoYieldCuration()(data2)

data2.to_csv(Path(args.folder) / args.output, sep="\t", index=False)


Expand Down
1 change: 1 addition & 0 deletions rxnutils/data/uspto/preparation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Module containing pipeline for downloading, transforming and cleaning USPTO data
This needs to be run in an environment with rxnutils installed
"""

from pathlib import Path

from metaflow import step
Expand Down
51 changes: 51 additions & 0 deletions rxnutils/data/uspto/uspto_yield.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Code for curating USPTO yields.

Inspiration from this code: https://github.com/DocMinus/Yield_curation_USPTO

This could potentially be an action, but since it only makes sense to use it
with USPTO data, it resides here for now.
"""

from dataclasses import dataclass

import pandas as pd
import numpy as np


@dataclass
class UsptoYieldCuration:
    """
    Action for curating USPTO yield columns.

    Combines the text-mined and calculated yield columns into a single
    curated column: values that are non-numeric or outside [0, 100] are
    discarded, and when both sources provide a value the larger one is kept.
    """

    text_yield_column: str = "TextMinedYield"
    calc_yield_column: str = "CalculatedYield"
    out_column: str = "CuratedYield"

    # Plain class attribute (not a dataclass field) used by __str__.
    # Bug fix: __str__ referenced self.pretty_name, which was never defined
    # and raised AttributeError.
    pretty_name = "uspto_yield"

    def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Add the curated-yield column to the dataframe.

        :param data: dataframe with the text-mined and calculated yield columns
        :return: a copy of the dataframe with the curated yield column added
        """
        # Calculated yields: strip the percent sign, coerce to numbers and
        # discard out-of-range values
        calc_yield = data[self.calc_yield_column].str.rstrip("%")
        calc_yield = pd.to_numeric(calc_yield, errors="coerce")
        calc_yield[(calc_yield < 0) | (calc_yield > 100)] = np.nan

        # Text-mined yields: remove qualifiers such as "~", ">=", ">", "<"
        # and range prefixes like "50 to ", then validate the range
        text_yield = data[self.text_yield_column].str.lstrip("~")
        text_yield = text_yield.str.rstrip("%")
        text_yield = text_yield.str.replace(">=", "", regex=False)
        text_yield = text_yield.str.replace(">", "", regex=False)
        text_yield = text_yield.str.replace("<", "", regex=False)
        text_yield = text_yield.str.replace(r"\d{1,2}\sto\s", "", regex=True)
        text_yield = pd.to_numeric(text_yield, errors="coerce")
        text_yield[(text_yield < 0) | (text_yield > 100)] = np.nan

        curated_yield = text_yield.copy()

        # When both sources give a value, keep the larger one
        sel = (~calc_yield.isna()) & (~text_yield.isna())
        curated_yield[sel] = np.maximum(calc_yield[sel], text_yield[sel])

        # Fall back on the calculated yield when no text-mined value exists
        sel = (~calc_yield.isna()) & (text_yield.isna())
        curated_yield[sel] = calc_yield[sel]

        return data.assign(**{self.out_column: curated_yield})

    def __str__(self) -> str:
        return f"{self.pretty_name} (create one column with curated yield values)"
Loading
Loading