Add files via upload

HergenrotherLab · Mar 25, 2021 · edd293a · edd293a
1 parent 7cb9c6b
commit edd293a
Show file tree

Hide file tree

Showing 2 changed files with 398 additions and 0 deletions.
diff --git a/calc_props.py b/calc_props.py
@@ -0,0 +1,243 @@
+#
+# Calculation of molecular descriptors and complexity index
+#
+# Implements RDKit
+#
+# Bryon Drown, May 2015
+# Updated Oct. 9, 2015
+# University of Illinois, Urbana-Champaign
+#
+__doc__ = """
+Performs calculations of physiochemical properties of set of compounds
+Properties to be calculated:
+Fsp3, chiral centers
+"""
+
+import sys
+from rdkit import Chem
+from rdkit.Chem import Descriptors
+from rdkit.ML.Descriptors import MoleculeDescriptors
+from collections import defaultdict
+from collections import OrderedDict
+import optparse
+from os.path import basename
+import csv
+
+
+def main():
+
+    args = parse_args()
+
+    ms = [x for x in Chem.SDMolSupplier(args.input_file) if x is not None]
+
+    mols = []
+    for mol in ms:
+        remove_ligprep_props(mol)
+        temp = calc_builtin_props(mol)
+        calcRingDescriptors(temp)
+        calc_chiral_centers(temp)
+        mols.append(temp)
+
+    filename_base = args.output_path + '/' + \
+        basename(args.input_file).strip((".sdf"))
+    if(args.csv):
+        with open(filename_base + '.csv', 'w') as out:
+            write_mol_csv(mols, out)
+    if(args.sdf):
+        write_mol_sdf(mols, filename_base)
+
+
+def write_mol_csv(mols, outfile, includeChirality=True):
+    """Writes list of molecules and properties to CSV file
+    """
+    w = csv.writer(outfile)
+
+    # Get prop names from first mol and get header
+    first = mols[0]
+    propNames = list(first.GetPropNames())
+    outL = []
+    outL.append('SMILES')
+    outL.extend(propNames)
+    w.writerow(outL)
+
+    # Write out properties for each molecule
+    for mol in mols:
+        smi = Chem.MolToSmiles(mol, isomericSmiles=includeChirality)
+        outL = []
+        outL.append(smi)
+        for prop in propNames:
+            if mol.HasProp(prop):
+                outL.append(str(mol.GetProp(prop)))
+            else:
+                outL.append('')
+        w.writerow(outL)
+    return
+
+
+def csv_header(mol):
+    """Creates string for header of csv file
+    """
+    properties = ['name', 'smiles']
+    props = mol.GetPropNames()
+    for prop in props:
+        properties.append(prop)
+    return ','.join(map(str, properties))
+
+
+def mol_props_to_csv(mol):
+    """Creates string for properties of individual molecule when writing to csv
+    TODO: keep width the same when some compounds don't have a given property
+    """
+    values = []
+    values.append(mol.GetProp('_Name'))
+    values.append(Chem.MolToSmiles(mol, isomericSmiles=True))
+
+    properties = mol.GetPropNames()
+    for prop in properties:
+        values.append(mol.GetProp(prop))
+
+    return ','.join(map(str, values))
+
+
+def write_mol_sdf(mols, filename):
+    """Writes list of molecules and properties to SDF file
+    """
+    ms_wr = Chem.SDWriter(filename + ".sdf")
+    for mol in mols:
+        ms_wr.write(mol)
+    ms_wr.close()
+
+
+def parse_args():
+    """Parse the command line options.
+
+    @return:  All script options
+    """
+
+    parser = optparse.OptionParser(__doc__)
+    parser.set_defaults(verbose=False)
+    parser.add_option("-i", "--input", dest="input_file", default=None,
+                      help="Input sdf file that contains structures for which properties will be calculated [default: %default]")
+    parser.add_option("-o", "--output", dest="output_path", default='',
+                      help="Folder to which output files should be saved [default: %default]")
+    parser.add_option("-c", "--csv", action="store_true", dest="csv")
+    parser.add_option("-s", "--sdf", action="store_true", dest="sdf")
+
+    (options, args) = parser.parse_args()
+    if(options.input_file == None):
+        print("Input file %s is needed" % options.input_file)
+        parser.print_help()
+        sys.exit(1)
+    return options
+
+
+def remove_ligprep_props(m):
+    """The properties that are attached to molecules by ligprep were are removed
+    """
+    props = ['chiral flag', 'version', 's_m_source_file', 'i_m_source_file_index', 'i_lp_mmshare_version', 'r_lp_tautomer_probability',
+             'r_epik_Ionization_Penalty', 'r_epik_Ionization_Penalty_Charging', 'r_epik_Ionization_Penalty_Neutral',
+             'r_epik_State_Penalty', 'r_epik_Charging_Adjusted_Penalty', 'i_epik_Tot_Q', 'i_epik_Tot_abs_Q', 'i_f3d_flags',
+             's_lp_Force_Field', 'r_lp_Energy', 'b_lp_Chiralities_Consistent', 's_lp_Variant', 's_epik_Chemistry_Notes', 's_epik_input',
+             's_epik_cmdline']
+    for prop in props:
+        m.ClearProp(prop)
+
+
+def calc_builtin_props(m):
+    """Calculates properties that are part of rdkit base
+
+    @param m: molecule for which to perform calculations
+    @return: molecule with properties attached
+    """
+
+    nms = ('FractionCSP3', 'MolWt', 'RingCount')
+    calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)
+
+    descrs = calc.CalcDescriptors(m)
+    for x in range(len(descrs)):
+        m.SetProp(str(nms[x]), str(descrs[x]))
+
+    return m
+
+def calc_chiral_centers(m):
+    """Calculates the number of chiral centers in a molecule
+
+    @param m: molecule for which to perform calculations
+    """
+
+    centers = Chem.FindMolChiralCenters(m, force=True, includeUnassigned=True)
+    m.SetProp('NumChiralCenters', str(len(centers)))
+    return
+
+
+
+def calcRingDescriptors(m):
+    """Calculates a set of properties that measure ring complexity
+
+    @param m: molecule for which to perform calculations
+    """
+
+    nBonds = m.GetNumBonds()
+    nAtoms = m.GetNumAtoms()
+    cyclomatic = nBonds - nAtoms + 1
+    if(cyclomatic < 1):
+        return
+
+    ri = m.GetRingInfo()
+    if(ri.NumRings() < 1):
+        return
+    # get total ring path and nBondRings
+    totalRing = 0
+    Bonds = []
+    Bridges = []
+    for ring in ri.BondRings():
+
+        for id in ring:
+
+            if (ri.NumBondRings(id) > 1):
+
+                Bridges.append(id)
+            totalRing += 1
+            Bonds.append(id)
+
+    # remove duplicates, then get length
+    nBondRings = len(OrderedDict.fromkeys(Bonds).keys())
+    nBridgeEdges = len(OrderedDict.fromkeys(Bridges).keys())
+
+    # get nAtomRings
+    Atoms = []
+    for ring in ri.AtomRings():
+
+        for id in ring:
+
+            Atoms.append(id)
+    nAtomRings = len(OrderedDict.fromkeys(Atoms).keys())
+
+    # descriptors
+    ringFusionDensity = 2 * float(nBridgeEdges) / float(nAtomRings)
+    ringComplexityIndex = float(totalRing) / float(nAtomRings)
+    molecularCyclizedDegree = float(nAtomRings) / float(nAtoms)
+    nRingSystems = (nBonds - nBondRings) - (nAtoms - nAtomRings) + 1
+    if(nRingSystems < 1):
+
+        ringFusionDegree = 0
+    else:
+
+        ringFusionDegree = float(cyclomatic) / float(nRingSystems)
+
+    # set props
+    m.SetProp('TotalRing', str(totalRing))
+    m.SetProp('NumBridges', str(nBridgeEdges))
+    m.SetProp('nBondRings', str(nBondRings))
+    m.SetProp('nAtomRings', str(nAtomRings))
+    m.SetProp('ringFusionDensity', str(ringFusionDensity))
+    m.SetProp('ringFusionDegree', str(ringFusionDegree))
+    m.SetProp('ringComplexityIndex', str(ringComplexityIndex))
+    m.SetProp('molecularCyclizedDegree', str(molecularCyclizedDegree))
+    m.SetProp('NumRingSystems', str(nRingSystems))
+
+    return
+
+
+if __name__ == '__main__':
+    main()
diff --git a/my_scaffold_metrisc.py b/my_scaffold_metrisc.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Feb 21 14:05:04 2021
+
+@author: lucia
+"""
+
+# Import scaffoldgraph
+import scaffoldgraph as sg
+
+# Import networkx
+import networkx as nx
+
+# Import plotting tools
+import matplotlib.pyplot as plt
+
+import numpy as np
+
+# Import rdkit
+from rdkit.Chem import Draw
+from rdkit import Chem
+import rdkit
+import random
+import os
+from collections import Counter
+os.chdir('C:/Users/lucia/OneDrive/Desktop/ScaffoldGraph-master')
+
+#%%
+sdf_file = ('limonin.sdf') # Example SDF file (200 PubChem compounds)
+#smi_file = ('smiles_plateIA.smi')
+supplier = Chem.SDMolSupplier(sdf_file)
+network = sg.ScaffoldNetwork.from_sdf(sdf_file, progress=True)
+
+# We can access the number of molecule nodes and scaffold nodes in the graph
+n_scaffolds_net = network.num_scaffold_nodes
+n_molecules_net = network.num_molecule_nodes
+
+print('\nGenerated scaffold network from {} molecules with {} scaffolds\n'.format(n_molecules_net, n_scaffolds_net))
+#%%
+# Molecules are stored in the network with their _Name property as a key
+# When using SDF format rdkit assigns the _Name property from the TITLE section
+# In this case this refers to a PubChem ID
+
+molecules = list(network.get_molecule_nodes())
+
+# Calculate number of scaffolds and singletons
+frag_to_save = []
+for i in range(len(molecules)):
+    mol_id = i
+    pubchem_id = molecules[mol_id]
+    smiles_iter = network.nodes[pubchem_id]['smiles']
+    frags_iter = sg.get_all_murcko_fragments(Chem.MolFromSmiles(smiles_iter))
+    num_rings = []
+    for mol in frags_iter:
+        num_rings.append(rdkit.Chem.rdMolDescriptors.CalcNumRings(mol))
+    m = max(num_rings)    
+    position_max = [i for i, j in enumerate(num_rings) if j == m]
+    frag_to_save.append(frags_iter[position_max[0]]) 
+list_smiles = []
+for frag_saved in frag_to_save:
+    list_smiles.append(Chem.MolToSmiles(frag_saved))
+
+smiles_no_duplicate = set(list_smiles)
+
+count_unique_smiles = Counter(list_smiles)
+number_of_unique_smiles= []
+
+for key_count in count_unique_smiles.keys():
+    if count_unique_smiles[key_count]==1:
+        number_of_unique_smiles.append(count_unique_smiles[key_count])
+# Number of scaffold
+N = len(smiles_no_duplicate)
+Nsing = sum(number_of_unique_smiles)
+
+print("Number of scaffolds is",N)
+print("Number of singletons is",Nsing)
+#%%
+# We can generate a scaffold tree from the SDF file just like before
+
+#tree = sg.ScaffoldTree.from_smiles_file(smi_file, progress=True)
+tree = sg.ScaffoldTree.from_sdf(sdf_file, progress=True)
+
+# We can access the number of molecule nodes and scaffold nodes in the graph
+n_scaffolds_tree = tree.num_scaffold_nodes
+n_molecules_tree = tree.num_molecule_nodes
+
+#print('\nGenerated scaffold tree from {} molecules with {} scaffolds\n'.format(n_molecules_tree, n_scaffolds_tree))
+
+# The output is a forest structure (multiple trees)
+
+#print('Graph is a Forest:', nx.is_forest(tree))
+#%%
+# We can get the number of scaffolds in each hierarchy easily (The numbers are different to the network)
+
+counts_tree = tree.get_hierarchy_sizes()
+lists = sorted(counts_tree.items())
+x, y = zip(*lists)
+
+# Plot sizes as bar chart
+
+plt.figure(figsize=(8, 6))
+plt.bar(x, y)
+plt.xlabel('Hierarchy')
+plt.ylabel('Scaffold Count')
+plt.title('Number of Scaffolds per Hierarchy (Tree)')
+plt.show()
+#%%
+# frag_all_to_save = []
+# num_rings_all = []
+# num_frag_per_mol = []
+# for i in range(len(molecules)):
+#     mol_id = i
+#     pubchem_id = molecules[mol_id]
+#     smiles_iter = network.nodes[pubchem_id]['smiles']
+#     frags_iter = sg.get_all_murcko_fragments(Chem.MolFromSmiles(smiles_iter))
+#     num_frag_per_mol.append(len(frags_iter))
+#     frag_all_to_save.append(frags_iter)
+# all_frags = [item for sublist in frag_all_to_save for item in sublist]
+# for mol in all_frags:
+#     num_rings_all.append(rdkit.Chem.rdMolDescriptors.CalcNumRings(mol))
+# #all_frags = [item for sublist in frag_all_to_save for item in sublist]
+
+# list_all_smiles = []
+# for frag_iteration in all_frags:
+#     list_all_smiles.append(Chem.MolToSmiles(frag_iteration))
+
+# count_all_smiles = Counter(list_all_smiles)
+
+# perc_mol = []
+# num_mol  = []
+# for key_smile in count_all_smiles.keys():
+#     perc_mol.append(count_all_smiles[key_smile]/len(molecules))
+#     num_mol.append(count_all_smiles[key_smile])
+
+# perc_mol_sorted = 1-np.sort(np.array(perc_mol))[::-1]
+
+# list_smiles = []
+# for frag_saved in frag_to_save:
+#     list_smiles.append(Chem.MolToSmiles(frag_saved))
+
+# smiles_no_duplicate = set(list_smiles)
+
+# count_unique_smiles = Counter(list_smiles)
+# number_of_unique_smiles= []
+
+# for key_count in count_unique_smiles.keys():
+#     if count_unique_smiles[key_count]==1:
+#         number_of_unique_smiles.append(count_unique_smiles[key_count])
+
+# def ecdf(data):
+#     """ Compute ECDF """
+#     x = np.sort(data)
+#     n = x.size
+#     y = np.arange(1, n+1) / n
+#     return(x,y)