From 8bdeda6448a3152e2bd6c8fd452d5c63b2f44e02 Mon Sep 17 00:00:00 2001
From: Lior Zimmerman <lior@enzymit.com>
Date: Thu, 21 Nov 2024 23:11:50 +0200
Subject: [PATCH] minor change to fasta parsing

---
 src/boltz/data/parse/fasta.py  | 14 +++---
 src/boltz/data/parse/schema.py | 82 +++++++++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/src/boltz/data/parse/fasta.py b/src/boltz/data/parse/fasta.py
index 7858159..e810597 100644
--- a/src/boltz/data/parse/fasta.py
+++ b/src/boltz/data/parse/fasta.py
@@ -66,13 +66,10 @@ def parse_fasta(path: Path, ccd: Mapping[str, Mol]) -> Target:  # noqa: C901
         # Get chain id, entity type and sequence
         header = seq_record.id.split("|")
         chain_id, entity_type = header[:2]
-        if len(header) == 3 and header[2] != "":
-            assert (
-                entity_type.lower() == "protein"
-            ), "MSA_ID is only allowed for proteins"
-            msa_id = header[2]
+        if len(header) == 3:
+            supp_file = header[2]
         else:
-            msa_id = None
+            supp_file = None
 
         entity_type = entity_type.upper()
         seq = str(seq_record.seq)
@@ -83,7 +80,7 @@ def parse_fasta(path: Path, ccd: Mapping[str, Mol]) -> Target:  # noqa: C901
                     "id": chain_id,
                     "sequence": seq,
                     "modifications": [],
-                    "msa": msa_id,
+                    "msa": supp_file if len(header) == 3 else None,
                 },
             }
         elif entity_type == "RNA":
@@ -114,9 +111,10 @@ def parse_fasta(path: Path, ccd: Mapping[str, Mol]) -> Target:  # noqa: C901
                 "ligand": {
                     "id": chain_id,
                     "smiles": seq,
+                    "conformer": supp_file if len(header) == 3 else None,
                 }
             }
-
+        
         sequences.append(molecule)
 
     data = {
diff --git a/src/boltz/data/parse/schema.py b/src/boltz/data/parse/schema.py
index 8d3325b..192b9c7 100644
--- a/src/boltz/data/parse/schema.py
+++ b/src/boltz/data/parse/schema.py
@@ -7,6 +7,7 @@
 from rdkit import rdBase
 from rdkit.Chem import AllChem
 from rdkit.Chem.rdchem import Conformer, Mol
+from rdkit.Chem import SDMolSupplier, MolFromMol2File, MolFromPDBFile
 
 from boltz.data import const
 from boltz.data.types import (
@@ -179,7 +180,7 @@ def get_conformer(mol: Mol) -> Conformer:
     # Fallback to the ideal coordinates
     for c in mol.GetConformers():
         try:
-            if c.GetProp("name") == "Ideal":
+            if c.GetProp("name") in ["Ideal", "File"]:
                 return c
         except KeyError:  # noqa: PERF203
             pass
@@ -446,6 +447,63 @@ def parse_polymer(
     )
 
 
+def parse_conformer_file(path: str) -> None:
+    """Parse a conformer file and add it to the molecule.
+
+    Parameters
+    ----------
+    path : str
+        The path to the conformer file.
+    mol : Mol
+        The molecule to add the conformer to.
+
+    """
+    if path.endswith('.sdf'):
+        supplier = SDMolSupplier(path)
+        mol = next(supplier)
+
+    elif path.endswith('.mol2'):
+        mol = MolFromMol2File(path)
+        if mol is None:
+            msg = f"Failed to read conformer from file: {path}"
+            raise ValueError(msg)
+
+    elif path.endswith('.pdb'):
+        mol = MolFromPDBFile(path)
+        if mol is None:
+            msg = f"Failed to read conformer from file: {path}"
+            raise ValueError(msg)
+    else:
+        msg = f"Unsupported conformer file format: {path}"
+        raise ValueError(msg)
+    for conf in mol.GetConformers():
+        conf.SetProp("name", "File")
+    return mol
+
+def align_conf_to_mol(mol: Mol, conf_mol: Mol) -> None:
+    """Align a conformer to a molecule.
+
+    Parameters
+    ----------
+    mol : Mol
+        The molecule to align to.
+    conf_mol : Mol
+        The conformer to align.
+
+    """
+    if not mol.HasSubstructMatch(conf_mol):
+        msg = "Conformer does not match molecule!"
+        raise ValueError(msg)
+    match = mol.GetSubstructMatch(conf_mol)
+    for idx, atom in enumerate(conf_mol.GetAtoms()):
+        smiles_atom = mol.GetAtomWithIdx(match[idx])
+        assert atom.GetProp("name") == smiles_atom.GetProp("name")
+        atom.SetFormalCharge(smiles_atom.GetFormalCharge())  # Transfer formal charges
+        atom.SetProp("name", smiles_atom.GetProp("name"))  # Transfer atom names
+        atom.SetAtomicNum(smiles_atom.GetAtomicNum())  # Transfer atomic numbers
+
+
+
 def parse_boltz_schema(  # noqa: C901, PLR0915, PLR0912
     name: str,
     schema: dict,
@@ -471,6 +529,7 @@ def parse_boltz_schema(  # noqa: C901, PLR0915, PLR0912
         - ligand:
             id: E
             smiles: "CC1=CC=CC=C1"
+            conformer: path/to/conformer.sdf # Optional
         - ligand:
             id: [F, G]
             ccd: []
@@ -519,12 +578,15 @@ def parse_boltz_schema(  # noqa: C901, PLR0915, PLR0912
         if entity_type in {"protein", "dna", "rna"}:
             seq = str(item[entity_type]["sequence"])
         elif entity_type == "ligand":
-            assert "smiles" in item[entity_type] or "ccd" in item[entity_type]
-            assert "smiles" not in item[entity_type] or "ccd" not in item[entity_type]
+            assert "smiles" in item[entity_type] or "ccd" in item[entity_type] 
             if "smiles" in item[entity_type]:
                 seq = str(item[entity_type]["smiles"])
-            else:
+            elif "ccd" in item[entity_type]:
                 seq = str(item[entity_type]["ccd"])
+            else:
+                msg = f"Invalid ligand type: {item[entity_type]}"
+                raise ValueError(msg)
+
         items_to_group.setdefault((entity_type, seq), []).append(item)
 
     # Go through entities and parse them
@@ -643,11 +705,21 @@ def parse_boltz_schema(  # noqa: C901, PLR0915, PLR0912
             canonical_order = AllChem.CanonicalRankAtoms(mol)
             for atom, can_idx in zip(mol.GetAtoms(), canonical_order):
                 atom.SetProp("name", atom.GetSymbol().upper() + str(can_idx + 1))
-
             success = compute_3d_conformer(mol)
             if not success:
                 msg = f"Failed to compute 3D conformer for {seq}"
                 raise ValueError(msg)
+            if "conformer" in items[0][entity_type] and items[0][entity_type]["conformer"] is not None:
+                conf_mol = parse_conformer_file(items[0][entity_type]["conformer"])
+                conf_mol = AllChem.AddHs(conf_mol)
+                can_order = AllChem.CanonicalRankAtoms(conf_mol)
+                for atom, can_idx in zip(conf_mol.GetAtoms(), can_order):
+                    atom.SetProp("name", atom.GetSymbol().upper() + str(can_idx + 1))
+                #align conformer to mol, set the metadata:
+                align_conf_to_mol(mol, conf_mol)
+                #replace the computed conformer with the one from the file
+                
+                mol = conf_mol
 
             mol_no_h = AllChem.RemoveHs(mol)
             residue = parse_ccd_residue(