From 8bdeda6448a3152e2bd6c8fd452d5c63b2f44e02 Mon Sep 17 00:00:00 2001 From: Lior Zimmerman Date: Thu, 21 Nov 2024 23:11:50 +0200 Subject: [PATCH] minor change to fasta parsing --- src/boltz/data/parse/fasta.py | 14 +++--- src/boltz/data/parse/schema.py | 82 +++++++++++++++++++++++++++++++--- 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/src/boltz/data/parse/fasta.py b/src/boltz/data/parse/fasta.py index 7858159..e810597 100644 --- a/src/boltz/data/parse/fasta.py +++ b/src/boltz/data/parse/fasta.py @@ -66,13 +66,10 @@ def parse_fasta(path: Path, ccd: Mapping[str, Mol]) -> Target: # noqa: C901 # Get chain id, entity type and sequence header = seq_record.id.split("|") chain_id, entity_type = header[:2] - if len(header) == 3 and header[2] != "": - assert ( - entity_type.lower() == "protein" - ), "MSA_ID is only allowed for proteins" - msa_id = header[2] + if len(header) == 3: + supp_file = header[2] else: - msa_id = None + supp_file = None entity_type = entity_type.upper() seq = str(seq_record.seq) @@ -83,7 +80,7 @@ def parse_fasta(path: Path, ccd: Mapping[str, Mol]) -> Target: # noqa: C901 "id": chain_id, "sequence": seq, "modifications": [], - "msa": msa_id, + "msa": supp_file if len(header) == 3 else None, }, } elif entity_type == "RNA": @@ -114,9 +111,10 @@ def parse_fasta(path: Path, ccd: Mapping[str, Mol]) -> Target: # noqa: C901 "ligand": { "id": chain_id, "smiles": seq, + "conformer": supp_file if len(header) == 3 else None, } } - + sequences.append(molecule) data = { diff --git a/src/boltz/data/parse/schema.py b/src/boltz/data/parse/schema.py index 8d3325b..192b9c7 100644 --- a/src/boltz/data/parse/schema.py +++ b/src/boltz/data/parse/schema.py @@ -7,6 +7,7 @@ from rdkit import rdBase from rdkit.Chem import AllChem from rdkit.Chem.rdchem import Conformer, Mol +from rdkit.Chem import SDMolSupplier, MolFromMol2File, MolFromPDBFile from boltz.data import const from boltz.data.types import ( @@ -179,7 +180,7 @@ def get_conformer(mol: Mol) -> Conformer: # Fallback to the ideal coordinates for c in mol.GetConformers(): try: - if c.GetProp("name") == "Ideal": + if c.GetProp("name") in ["Ideal", "File"]: return c except KeyError: # noqa: PERF203 pass @@ -446,6 +447,63 @@ def parse_polymer( ) +def parse_conformer_file(path: str) -> None: + """Parse a conformer file and add it to the molecule. + + Parameters + ---------- + path : str + The path to the conformer file. + mol : Mol + The molecule to add the conformer to. + + """ + if path.endswith('.sdf'): + supplier = SDMolSupplier(path) + mol = next(supplier) + + elif path.endswith('.mol2'): + mol = MolFromMol2File(path) + if mol is None: + msg = f"Failed to read conformer from file: {path}" + raise ValueError(msg) + + elif path.endswith('.pdb'): + mol = MolFromPDBFile(path) + if mol is None: + msg = f"Failed to read conformer from file: {path}" + raise ValueError(msg) + else: + msg = f"Unsupported conformer file format: {path}" + raise ValueError(msg) + for conf in mol.GetConformers(): + conf.SetProp("name", "File") + return mol + +def align_conf_to_mol(mol: Mol, conf_mol: Mol) -> None: + """Align a conformer to a molecule. + + Parameters + ---------- + mol : Mol + The molecule to align to. + conf_mol : Mol + The conformer to align. + + """ + if not mol.HasSubstructMatch(conf_mol): + msg = "Conformer does not match molecule!" + raise ValueError(msg) + match = mol.GetSubstructMatch(conf_mol) + for idx, atom in enumerate(conf_mol.GetAtoms()): + smiles_atom = mol.GetAtomWithIdx(match[idx]) + assert atom.GetProp("name") == smiles_atom.GetProp("name") + atom.SetFormalCharge(smiles_atom.GetFormalCharge()) # Transfer formal charges + atom.SetProp("name", smiles_atom.GetProp("name")) # Transfer atom names + atom.SetAtomicNum(smiles_atom.GetAtomicNum()) # Transfer atomic numbers + + + def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912 name: str, schema: dict, @@ -471,6 +529,7 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912 - ligand: id: E smiles: "CC1=CC=CC=C1" + conformer: path/to/conformer.sdf # Optional - ligand: id: [F, G] ccd: [] @@ -519,12 +578,15 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912 if entity_type in {"protein", "dna", "rna"}: seq = str(item[entity_type]["sequence"]) elif entity_type == "ligand": - assert "smiles" in item[entity_type] or "ccd" in item[entity_type] - assert "smiles" not in item[entity_type] or "ccd" not in item[entity_type] + assert "smiles" in item[entity_type] or "ccd" in item[entity_type] if "smiles" in item[entity_type]: seq = str(item[entity_type]["smiles"]) - else: + elif "ccd" in item[entity_type]: seq = str(item[entity_type]["ccd"]) + else: + msg = f"Invalid ligand type: {item[entity_type]}" + raise ValueError(msg) + items_to_group.setdefault((entity_type, seq), []).append(item) # Go through entities and parse them @@ -643,11 +705,21 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912 canonical_order = AllChem.CanonicalRankAtoms(mol) for atom, can_idx in zip(mol.GetAtoms(), canonical_order): atom.SetProp("name", atom.GetSymbol().upper() + str(can_idx + 1)) - success = compute_3d_conformer(mol) if not success: msg = f"Failed to compute 3D conformer for {seq}" raise ValueError(msg) + if "conformer" in items[0][entity_type] and items[0][entity_type]["conformer"] is not None: + conf_mol = parse_conformer_file(items[0][entity_type]["conformer"]) + conf_mol = AllChem.AddHs(conf_mol) + can_order = AllChem.CanonicalRankAtoms(conf_mol) + for atom, can_idx in zip(conf_mol.GetAtoms(), can_order): + atom.SetProp("name", atom.GetSymbol().upper() + str(can_idx + 1)) + #align conformer to mol, set the metadata: + align_conf_to_mol(mol, conf_mol) + #replace the computed conformer with the one from the file + + mol = conf_mol mol_no_h = AllChem.RemoveHs(mol) residue = parse_ccd_residue(