OpenBioML · kjappelbaum · Nov 7, 2023 · Nov 6, 2023 · Nov 6, 2023 · Nov 6, 2023
diff --git a/.github/workflows/install.yaml b/.github/workflows/install.yaml
@@ -14,11 +14,16 @@ jobs:
               with:
                   environment-file: conda.yaml
                   activate-environment: chemnlp
-                  python-version: 3.8
+                  python-version: 3.9
                   auto-update-conda: true
                   auto-activate-base: false
             - name: Validate yaml
               shell: bash -l {0}
               run: |
                   conda activate chemnlp
                   python -m src.chemnlp.data_val.validate data
+            - name: Tests
+              shell: bash -l {0}
+              run: |
+                  pip install pytest
+                  pytest tests
diff --git a/conda.yaml b/conda.yaml
@@ -1,9 +1,9 @@
 ---
 name: dummy
 dependencies:
-    - python==3.8.*
+    - python==3.9.*
     - pip
     - pip:
           - .
           - .[dev]
-        #   - ".[dataset_creation]"
+          - .[dataset_creation]
diff --git a/data/text_sampling/extend_tabular.py b/data/text_sampling/extend_tabular.py
@@ -5,109 +5,16 @@
 import time
 from functools import partial
 
-import deepsmiles
 import pandas as pd
-import pubchempy as pcp
-import requests
-import selfies
-from rdkit import Chem
-
-# tucan needs very likely python 3.10
-# from tucan.canonicalization import canonicalize_molecule
-# from tucan.io import graph_from_molfile_text
-# from tucan.serialization import serialize_molecule
 from utils import load_yaml
 
-# not used yet
-# def augment_smiles(smiles: str, int_aug: int = 50, deduplicate: bool = True) -> str:
-#    """
-#    Takes a SMILES (not necessarily canonical) and returns `int_aug` random variations of this SMILES.
-#    """
-#
-#    mol = Chem.MolFromSmiles(smiles)
-#
-#    if mol is None:
-#        return None
-#    else:
-#        if int_aug > 0:
-#            augmented = [
-#                Chem.MolToSmiles(mol, canonical=False, doRandom=True)
-#                for _ in range(int_aug)
-#            ]
-#            if deduplicate:
-#                augmented = list(set(augmented))
-#            return augmented
-#        else:
-#            raise ValueError("int_aug must be greater than zero.")
-
-
-def smiles_to_selfies(smiles: str) -> str:
-    """
-    Takes a SMILES and return the selfies encoding.
-    """
-
-    return selfies.encoder(smiles)
-
-
-def smiles_to_deepsmiles(smiles: str) -> str:
-    """
-    Takes a SMILES and return the DeepSMILES encoding.
-    """
-    converter = deepsmiles.Converter(rings=True, branches=True)
-    return converter.encode(smiles)
-
-
-def smiles_to_canoncial(smiles: str) -> str:
-    """
-    Takes a SMILES and return the canoncial SMILES.
-    """
-    mol = Chem.MolFromSmiles(smiles)
-    return Chem.MolToSmiles(mol)
-
-
-def smiles_to_inchi(smiles: str) -> str:
-    """
-    Takes a SMILES and return the InChI.
-    """
-    mol = Chem.MolFromSmiles(smiles)
-    return Chem.MolToInchi(mol)
-
-
-# def smiles_to_tucan(smiles: str) -> str:
-#    """
-#    Takes a SMILES and return the Tucan encoding.
-#    For this, create a molfile as StringIO, read it with graph_from_file,
-#    canonicalize it and serialize it.
-#    """
-#    molfile = Chem.MolToMolBlock(Chem.MolFromSmiles(smiles), forceV3000=True)
-#    mol = graph_from_molfile_text(molfile)
-#    mol = canonicalize_molecule(mol)
-#    return serialize_molecule(mol)
-
-
-CACTUS = "https://cactus.nci.nih.gov/chemical/structure/{0}/{1}"
-
-
-def smiles_to_iupac_name(smiles: str) -> str:
-    """Use the chemical name resolver https://cactus.nci.nih.gov/chemical/structure.
-    If this does not work, use pubchem.
-    """
-    try:
-        time.sleep(0.001)
-        rep = "iupac_name"
-        url = CACTUS.format(smiles, rep)
-        response = requests.get(url, allow_redirects=True, timeout=10)
-        response.raise_for_status()
-        name = response.text
-        if "html" in name:
-            return None
-        return name
-    except Exception:
-        try:
-            compound = pcp.get_compounds(smiles, "smiles")
-            return compound[0].iupac_name
-        except Exception:
-            return None
+from chemnlp.data.reprs import (  # smiles_to_safe,
+    smiles_to_canoncial,
+    smiles_to_deepsmiles,
+    smiles_to_inchi,
+    smiles_to_iupac_name,
+    smiles_to_selfies,
+)
 
 
 def _try_except_none(func, *args, **kwargs):
@@ -141,8 +48,8 @@ def line_reps_from_smiles(
             "deepsmiles": _try_except_none(smiles_to_deepsmiles, smiles),
             "canonical": _try_except_none(smiles_to_canoncial, smiles),
             "inchi": _try_except_none(smiles_to_inchi, smiles),
-            # "tucan": _try_except_none(smiles_to_tucan, smiles),
             "iupac_name": _try_except_none(smiles_to_iupac_name, smiles),
+            # "safe": _try_except_none(smiles_to_safe, smiles),
         }
 
         # Note: This needs proper filelocking to work.
@@ -248,6 +155,7 @@ def line_reps_from_smiles(
             "inchi": [],
             # "tucan": [],
             "iupac_name": [],
+            # "safe": [],
         }
 
         for entry in parsed:

diff --git a/data/text_sampling/extend_tabular_processed.py b/data/text_sampling/extend_tabular_processed.py
@@ -16,6 +16,7 @@
         "canonical",
         "inchi",
         "iupac_name",
+        # "safe",
     ]
 
     if not os.path.isfile(path_processed_smiles):

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,17 +6,10 @@ build-backend = "setuptools.build_meta"
 name = "chemnlp"
 description = "Open source chemistry dataset & LLM"
 readme = "README.md"
-requires-python = "==3.8.*"  # required for gpt-neox
+requires-python = "==3.9.*"
 dependencies = [
-        "datasets>=2.8.0",
-        "numpy>=1.21.2",
-        "openpyxl>=3.0.9",
-        "pandas>=1.3.3",
-        "peft",
+        "pandas",
         "pydantic",
-        "pytdc>=0.3.9",
-        "transformers",
-        "wandb==0.10.28"
 ]
 
 dynamic = ["version"]
@@ -30,7 +23,7 @@ dev = [
         "pre-commit",
         "pydantic_yaml<=0.11.2",
         "pytest",
-        "pubchempy"
+        "pubchempy",
 ]
 
 dataset_creation = [
@@ -44,9 +37,10 @@ dataset_creation = [
         "bioc",
         "pylatexenc",
         "canonicalize_psmiles@git+https://github.com/Ramprasad-Group/canonicalize_psmiles.git",
-        #"tucan@git+https://github.com/TUCAN-nest/TUCAN.git"  # the current version has bugs due to the type checking, maybe this is due to our python version?
         "rxn-chem-utils",
-        "givemeconformer"
+      #  "safe-mol",
+        "backoff",
+        "givemeconformer",
 ]
 
 training = [
@@ -61,7 +55,7 @@ tokenisation = [
         "zstandard",
         "apache_beam",
         "mwparserfromhell",
-        "jsonlines"
+        "jsonlines",
 ]
 
 [tool.setuptools_scm]

diff --git a/src/chemnlp/data/ner.py b/src/chemnlp/data/ner.py
@@ -1,7 +1,8 @@
 import re
 
 
-def group_tokens_by_labels(tokens, labels, join=True):
+def group_tokens_by_labels(tokens, labels):
+    join = True
     grouped_tokens = []
     current_group = []
 

diff --git a/src/chemnlp/data/reprs.py b/src/chemnlp/data/reprs.py
@@ -0,0 +1,77 @@
+import backoff
+import deepsmiles
+import pubchempy as pcp
+import requests
+import safe
+import selfies
+from rdkit import Chem
+
+
+def smiles_to_selfies(smiles: str) -> str:
+    """
+    Takes a SMILES string and returns its SELFIES encoding.
+    """
+
+    return selfies.encoder(smiles)
+
+
+def smiles_to_deepsmiles(smiles: str) -> str:
+    """
+    Takes a SMILES string and returns its DeepSMILES encoding.
+    """
+    converter = deepsmiles.Converter(rings=True, branches=True)
+    return converter.encode(smiles)
+
+
+def smiles_to_canoncial(smiles: str) -> str:
+    """
+    Takes a SMILES string and returns its canonical SMILES.
+    """
+    mol = Chem.MolFromSmiles(smiles)
+    return Chem.MolToSmiles(mol)
+
+
+def smiles_to_inchi(smiles: str) -> str:
+    """
+    Takes a SMILES string and returns its InChI.
+    """
+    mol = Chem.MolFromSmiles(smiles)
+    return Chem.MolToInchi(mol)
+
+
+def smiles_to_safe(smiles: str) -> str:
+    """
+    Takes a SMILES string and returns its SAFE encoding.
+    """
+    return safe.encode(smiles, seed=42, canonical=True, randomize=False)
+
+
+CACTUS = "https://cactus.nci.nih.gov/chemical/structure/{0}/{1}"
+
+
+@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
+def cactus_request_w_backoff(smiles, rep="iupac_name"):
+    url = CACTUS.format(smiles, rep)
+    response = requests.get(url, allow_redirects=True, timeout=10)
+    response.raise_for_status()
+    name = response.text
+    if "html" in name:
+        return None
+    return name
+
+
+def smiles_to_iupac_name(smiles: str) -> str:
+    """Use the chemical name resolver https://cactus.nci.nih.gov/chemical/structure.
+    If this does not work, fall back to PubChem; returns None if both lookups fail.
+    """
+    try:
+        name = cactus_request_w_backoff(smiles, rep="iupac_name")
+        if name is None:
+            raise Exception
+        return name
+    except Exception:
+        try:
+            compound = pcp.get_compounds(smiles, "smiles")
+            return compound[0].iupac_name
+        except Exception:
+            return None
diff --git a/tests/test_ner.py b/tests/test_ner.py
@@ -3,21 +3,14 @@
 
 def test_tokens_by_label():
     tokens = ["a", "b", "c", "d", "e", "f"]
-    labels = [0, 1, 1, 0, 1, 0]
-    grouped_tokens = group_tokens_by_labels(tokens, labels, join=False)
-    assert grouped_tokens == [["b"], ["c"], ["e"]]
-
-    labels = [0, 1, 2, 0, 1, 0]
-    grouped_tokens = group_tokens_by_labels(tokens, labels, join=False)
-    assert grouped_tokens == [["b", "c"], ["e"]]
 
     labels = [0, 1, 1, 0, 1, 0]
-    grouped_tokens = group_tokens_by_labels(tokens, labels, join=True)
-    assert grouped_tokens == ["b", "c", "e"]
+    grouped_tokens = group_tokens_by_labels(tokens, labels)
+    assert set(grouped_tokens) == set(["b", "c", "e"])
 
     labels = [0, 1, 2, 0, 1, 0]
-    grouped_tokens = group_tokens_by_labels(tokens, labels, join=True)
-    assert grouped_tokens == ["b c", "e"]
+    grouped_tokens = group_tokens_by_labels(tokens, labels)
+    assert set(grouped_tokens) == set(["b c", "e"])
 
 
 def test_join_punctuation():

diff --git a/tests/test_reprs.py b/tests/test_reprs.py
@@ -0,0 +1,15 @@
+from chemnlp.data.reprs import smiles_to_iupac_name, smiles_to_safe
+
+
+def test_smiles_to_safe():
+    safe = smiles_to_safe("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
+    # both are equivalent (digits 2 and 3 swapped); output is not fully deterministic
+    assert (
+        safe == "c12ccc3cc1.C3(C)C(=O)O.CC(C)C2"
+        or safe == "c13ccc2cc1.C2(C)C(=O)O.CC(C)C3"
+    )
+
+
+def test_smiles_to_iupac_name():
+    iupac_name = smiles_to_iupac_name("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
+    assert iupac_name == "2-[4-(2-methylpropyl)phenyl]propanoic acid"