From edd293aa2cf7b874e9a6b6275169db238078b1ee Mon Sep 17 00:00:00 2001 From: luciafuriassi <81392097+luciafuriassi@users.noreply.github.com> Date: Thu, 25 Mar 2021 17:43:45 -0500 Subject: [PATCH] Add files via upload --- calc_props.py | 243 +++++++++++++++++++++++++++++++++++++++++ my_scaffold_metrisc.py | 155 ++++++++++++++++++++++++++ 2 files changed, 398 insertions(+) create mode 100644 calc_props.py create mode 100644 my_scaffold_metrisc.py diff --git a/calc_props.py b/calc_props.py new file mode 100644 index 0000000..d8c4391 --- /dev/null +++ b/calc_props.py @@ -0,0 +1,243 @@ +# +# Calculation of molecular descriptors and complexity index +# +# Implements RDKit +# +# Bryon Drown, May 2015 +# Updated Oct. 9, 2015 +# University of Illinois, Urbana-Champaign +# +__doc__ = """ +Performs calculations of physiochemical properties of set of compounds +Properties to be calculated: +Fsp3, chiral centers +""" + +import sys +from rdkit import Chem +from rdkit.Chem import Descriptors +from rdkit.ML.Descriptors import MoleculeDescriptors +from collections import defaultdict +from collections import OrderedDict +import optparse +from os.path import basename +import csv + + +def main(): + + args = parse_args() + + ms = [x for x in Chem.SDMolSupplier(args.input_file) if x is not None] + + mols = [] + for mol in ms: + remove_ligprep_props(mol) + temp = calc_builtin_props(mol) + calcRingDescriptors(temp) + calc_chiral_centers(temp) + mols.append(temp) + + filename_base = args.output_path + '/' + \ + basename(args.input_file).strip((".sdf")) + if(args.csv): + with open(filename_base + '.csv', 'w') as out: + write_mol_csv(mols, out) + if(args.sdf): + write_mol_sdf(mols, filename_base) + + +def write_mol_csv(mols, outfile, includeChirality=True): + """Writes list of molecules and properties to CSV file + """ + w = csv.writer(outfile) + + # Get prop names from first mol and get header + first = mols[0] + propNames = list(first.GetPropNames()) + outL = [] + outL.append('SMILES') + outL.extend(propNames) + w.writerow(outL) + + # Write out properties for each molecule + for mol in mols: + smi = Chem.MolToSmiles(mol, isomericSmiles=includeChirality) + outL = [] + outL.append(smi) + for prop in propNames: + if mol.HasProp(prop): + outL.append(str(mol.GetProp(prop))) + else: + outL.append('') + w.writerow(outL) + return + + +def csv_header(mol): + """Creates string for header of csv file + """ + properties = ['name', 'smiles'] + props = mol.GetPropNames() + for prop in props: + properties.append(prop) + return ','.join(map(str, properties)) + + +def mol_props_to_csv(mol): + """Creates string for properties of individual molecule when writing to csv + TODO: keep width the same when some compounds don't have a given property + """ + values = [] + values.append(mol.GetProp('_Name')) + values.append(Chem.MolToSmiles(mol, isomericSmiles=True)) + + properties = mol.GetPropNames() + for prop in properties: + values.append(mol.GetProp(prop)) + + return ','.join(map(str, values)) + + +def write_mol_sdf(mols, filename): + """Writes list of molecules and properties to SDF file + """ + ms_wr = Chem.SDWriter(filename + ".sdf") + for mol in mols: + ms_wr.write(mol) + ms_wr.close() + + +def parse_args(): + """Parse the command line options. + + @return: All script options + """ + + parser = optparse.OptionParser(__doc__) + parser.set_defaults(verbose=False) + parser.add_option("-i", "--input", dest="input_file", default=None, + help="Input sdf file that contains structures for which properties will be calculated [default: %default]") + parser.add_option("-o", "--output", dest="output_path", default='', + help="Folder to which output files should be saved [default: %default]") + parser.add_option("-c", "--csv", action="store_true", dest="csv") + parser.add_option("-s", "--sdf", action="store_true", dest="sdf") + + (options, args) = parser.parse_args() + if(options.input_file == None): + print("Input file %s is needed" % options.input_file) + parser.print_help() + sys.exit(1) + return options + + +def remove_ligprep_props(m): + """The properties that are attached to molecules by ligprep were are removed + """ + props = ['chiral flag', 'version', 's_m_source_file', 'i_m_source_file_index', 'i_lp_mmshare_version', 'r_lp_tautomer_probability', + 'r_epik_Ionization_Penalty', 'r_epik_Ionization_Penalty_Charging', 'r_epik_Ionization_Penalty_Neutral', + 'r_epik_State_Penalty', 'r_epik_Charging_Adjusted_Penalty', 'i_epik_Tot_Q', 'i_epik_Tot_abs_Q', 'i_f3d_flags', + 's_lp_Force_Field', 'r_lp_Energy', 'b_lp_Chiralities_Consistent', 's_lp_Variant', 's_epik_Chemistry_Notes', 's_epik_input', + 's_epik_cmdline'] + for prop in props: + m.ClearProp(prop) + + +def calc_builtin_props(m): + """Calculates properties that are part of rdkit base + + @param m: molecule for which to perform calculations + @return: molecule with properties attached + """ + + nms = ('FractionCSP3', 'MolWt', 'RingCount') + calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms) + + descrs = calc.CalcDescriptors(m) + for x in range(len(descrs)): + m.SetProp(str(nms[x]), str(descrs[x])) + + return m + +def calc_chiral_centers(m): + """Calculates the number of chiral centers in a molecule + + @param m: molecule for which to perform calculations + """ + + centers = Chem.FindMolChiralCenters(m, force=True, includeUnassigned=True) + m.SetProp('NumChiralCenters', str(len(centers))) + return + + + +def calcRingDescriptors(m): + """Calculates a set of properties that measure ring complexity + + @param m: molecule for which to perform calculations + """ + + nBonds = m.GetNumBonds() + nAtoms = m.GetNumAtoms() + cyclomatic = nBonds - nAtoms + 1 + if(cyclomatic < 1): + return + + ri = m.GetRingInfo() + if(ri.NumRings() < 1): + return + # get total ring path and nBondRings + totalRing = 0 + Bonds = [] + Bridges = [] + for ring in ri.BondRings(): + + for id in ring: + + if (ri.NumBondRings(id) > 1): + + Bridges.append(id) + totalRing += 1 + Bonds.append(id) + + # remove duplicates, then get length + nBondRings = len(OrderedDict.fromkeys(Bonds).keys()) + nBridgeEdges = len(OrderedDict.fromkeys(Bridges).keys()) + + # get nAtomRings + Atoms = [] + for ring in ri.AtomRings(): + + for id in ring: + + Atoms.append(id) + nAtomRings = len(OrderedDict.fromkeys(Atoms).keys()) + + # descriptors + ringFusionDensity = 2 * float(nBridgeEdges) / float(nAtomRings) + ringComplexityIndex = float(totalRing) / float(nAtomRings) + molecularCyclizedDegree = float(nAtomRings) / float(nAtoms) + nRingSystems = (nBonds - nBondRings) - (nAtoms - nAtomRings) + 1 + if(nRingSystems < 1): + + ringFusionDegree = 0 + else: + + ringFusionDegree = float(cyclomatic) / float(nRingSystems) + + # set props + m.SetProp('TotalRing', str(totalRing)) + m.SetProp('NumBridges', str(nBridgeEdges)) + m.SetProp('nBondRings', str(nBondRings)) + m.SetProp('nAtomRings', str(nAtomRings)) + m.SetProp('ringFusionDensity', str(ringFusionDensity)) + m.SetProp('ringFusionDegree', str(ringFusionDegree)) + m.SetProp('ringComplexityIndex', str(ringComplexityIndex)) + m.SetProp('molecularCyclizedDegree', str(molecularCyclizedDegree)) + m.SetProp('NumRingSystems', str(nRingSystems)) + + return + + +if __name__ == '__main__': + main() diff --git a/my_scaffold_metrisc.py b/my_scaffold_metrisc.py new file mode 100644 index 0000000..f246ba2 --- /dev/null +++ b/my_scaffold_metrisc.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Feb 21 14:05:04 2021 + +@author: lucia +""" + +# Import scaffoldgraph +import scaffoldgraph as sg + +# Import networkx +import networkx as nx + +# Import plotting tools +import matplotlib.pyplot as plt + +import numpy as np + +# Import rdkit +from rdkit.Chem import Draw +from rdkit import Chem +import rdkit +import random +import os +from collections import Counter +os.chdir('C:/Users/lucia/OneDrive/Desktop/ScaffoldGraph-master') + +#%% +sdf_file = ('limonin.sdf') # Example SDF file (200 PubChem compounds) +#smi_file = ('smiles_plateIA.smi') +supplier = Chem.SDMolSupplier(sdf_file) +network = sg.ScaffoldNetwork.from_sdf(sdf_file, progress=True) + +# We can access the number of molecule nodes and scaffold nodes in the graph +n_scaffolds_net = network.num_scaffold_nodes +n_molecules_net = network.num_molecule_nodes + +print('\nGenerated scaffold network from {} molecules with {} scaffolds\n'.format(n_molecules_net, n_scaffolds_net)) +#%% +# Molecules are stored in the network with their _Name property as a key +# When using SDF format rdkit assigns the _Name property from the TITLE section +# In this case this refers to a PubChem ID + +molecules = list(network.get_molecule_nodes()) + +# Calculate number of scaffolds and singletons +frag_to_save = [] +for i in range(len(molecules)): + mol_id = i + pubchem_id = molecules[mol_id] + smiles_iter = network.nodes[pubchem_id]['smiles'] + frags_iter = sg.get_all_murcko_fragments(Chem.MolFromSmiles(smiles_iter)) + num_rings = [] + for mol in frags_iter: + num_rings.append(rdkit.Chem.rdMolDescriptors.CalcNumRings(mol)) + m = max(num_rings) + position_max = [i for i, j in enumerate(num_rings) if j == m] + frag_to_save.append(frags_iter[position_max[0]]) +list_smiles = [] +for frag_saved in frag_to_save: + list_smiles.append(Chem.MolToSmiles(frag_saved)) + +smiles_no_duplicate = set(list_smiles) + +count_unique_smiles = Counter(list_smiles) +number_of_unique_smiles= [] + +for key_count in count_unique_smiles.keys(): + if count_unique_smiles[key_count]==1: + number_of_unique_smiles.append(count_unique_smiles[key_count]) +# Number of scaffold +N = len(smiles_no_duplicate) +Nsing = sum(number_of_unique_smiles) + +print("Number of scaffolds is",N) +print("Number of singletons is",Nsing) +#%% +# We can generate a scaffold tree from the SDF file just like before + +#tree = sg.ScaffoldTree.from_smiles_file(smi_file, progress=True) +tree = sg.ScaffoldTree.from_sdf(sdf_file, progress=True) + +# We can access the number of molecule nodes and scaffold nodes in the graph +n_scaffolds_tree = tree.num_scaffold_nodes +n_molecules_tree = tree.num_molecule_nodes + +#print('\nGenerated scaffold tree from {} molecules with {} scaffolds\n'.format(n_molecules_tree, n_scaffolds_tree)) + +# The output is a forest structure (multiple trees) + +#print('Graph is a Forest:', nx.is_forest(tree)) +#%% +# We can get the number of scaffolds in each hierarchy easily (The numbers are different to the network) + +counts_tree = tree.get_hierarchy_sizes() +lists = sorted(counts_tree.items()) +x, y = zip(*lists) + +# Plot sizes as bar chart + +plt.figure(figsize=(8, 6)) +plt.bar(x, y) +plt.xlabel('Hierarchy') +plt.ylabel('Scaffold Count') +plt.title('Number of Scaffolds per Hierarchy (Tree)') +plt.show() +#%% +# frag_all_to_save = [] +# num_rings_all = [] +# num_frag_per_mol = [] +# for i in range(len(molecules)): +# mol_id = i +# pubchem_id = molecules[mol_id] +# smiles_iter = network.nodes[pubchem_id]['smiles'] +# frags_iter = sg.get_all_murcko_fragments(Chem.MolFromSmiles(smiles_iter)) +# num_frag_per_mol.append(len(frags_iter)) +# frag_all_to_save.append(frags_iter) +# all_frags = [item for sublist in frag_all_to_save for item in sublist] +# for mol in all_frags: +# num_rings_all.append(rdkit.Chem.rdMolDescriptors.CalcNumRings(mol)) +# #all_frags = [item for sublist in frag_all_to_save for item in sublist] + +# list_all_smiles = [] +# for frag_iteration in all_frags: +# list_all_smiles.append(Chem.MolToSmiles(frag_iteration)) + +# count_all_smiles = Counter(list_all_smiles) + +# perc_mol = [] +# num_mol = [] +# for key_smile in count_all_smiles.keys(): +# perc_mol.append(count_all_smiles[key_smile]/len(molecules)) +# num_mol.append(count_all_smiles[key_smile]) + +# perc_mol_sorted = 1-np.sort(np.array(perc_mol))[::-1] + +# list_smiles = [] +# for frag_saved in frag_to_save: +# list_smiles.append(Chem.MolToSmiles(frag_saved)) + +# smiles_no_duplicate = set(list_smiles) + +# count_unique_smiles = Counter(list_smiles) +# number_of_unique_smiles= [] + +# for key_count in count_unique_smiles.keys(): +# if count_unique_smiles[key_count]==1: +# number_of_unique_smiles.append(count_unique_smiles[key_count]) + +# def ecdf(data): +# """ Compute ECDF """ +# x = np.sort(data) +# n = x.size +# y = np.arange(1, n+1) / n +# return(x,y)