-
Notifications
You must be signed in to change notification settings - Fork 1
/
cxcalc.py
executable file
·249 lines (181 loc) · 6.52 KB
/
cxcalc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import subprocess
from io import StringIO
import pandas as pd
from AnalysisModule.routines.util import removefile
"""
a wrapper for jchemsuite
"""
# Descriptor table in the text layout consumed by get_Feautre2Dstring. Each
# entry is a `_feat_<Name>` line, an optional `(<command-line arguments>)`
# line, and a free-text description line. `_rxn_pH` is the placeholder for the
# reaction pH. The parser splits entries on blank lines ("\n\n").
# NOTE(review): confirm the blank-line separators between entries are present
#   in this literal; without them the parser raises ValueError.
# NOTE(review): `_feat_MinimalProjectionRadius` is listed twice; the parser
#   keys entries by name, so the second replaces the first -- confirm whether
#   one of them was meant to be a different descriptor.
# NOTE(review): `_feat_ASA_P` passes `-t ASA+P`, unlike its own name and the
#   sibling `ASA_H` entry -- looks like a typo for `ASA_P`; confirm.
# NOTE(review): `_feat_donsitecount` has no explicit command line, unlike
#   `_feat_Accsitecount` -- confirm `donsitecount` is accepted by cxcalc.
# NOTE(review): the final description ends mid-sentence.
Jia2019Si = """_feat_Mass
Molecular mass of _raw_SMILES (possibly a salt)
_feat_AtomCount_C
(atomcount -z 6)
Number of carbon atoms. (integer)
_feat_AtomCount_N
(atomcount -z 7)
Number of nitrogen atoms. (integer)
_feat_AvgPol
Average molecular polarizability (at _rxn_pH)
_feat_MolPol
Molecular polarizability (at _rxn_pH)
_feat_Refractivity
Computed refractivity
_feat_isoelectric
(isoelectricpoint)
Isoelectric point of the molecule
_feat_AliphaticRingCount
Number of aliphatic rings (integer)
_feat_AromaticRingCount
Number of aromatic rings (integer)
_feat_AliphaticAtomCount
AtomCount Number of aliphatic atoms in the molecule (integer)
_feat_AromaticAtomCount
Number of aromatic atoms in the molecule (integer)
_feat_BondCount
Number of bonds in the molecule (integer)
_feat_CarboaliphaticRingCount
Number of aliphatic rings comprised solely of carbon atoms (integer)
_feat_CarboaromaticRingCount
Number of aromatic rings comprised solely of carbon atoms (integer)
_feat_CarboRingCount
Number of rings comprised solely of carbon atoms (integer)
_feat_ChainAtomCount
Number atoms that are part of chain (not part of a ring) (integer)
_feat_ChiralCenterCount
Number of tetrahedral stereogenic centers (integer)
_feat_RingAtomCount
Number of atoms that are part of a ring (not part of a chain) (integer)
_feat_SmallestRingSize
Number of members in the smallest ring (integer)
_feat_LargestRingSize
Number of members in the largest ring (integer)
_feat_fsp3
Fraction of sp3 carbons (Fsp3 value)
_feat_HeteroaliphaticRingCount
Number of heteroaliphatic rings (integer)
_feat_HeteroaromaticRingCount
Number of heteroaromatic rings (integer)
_feat_RotatableBondCount
Number of rotatable bonds (integer)
_feat_BalabanIndex
Balaban molecular graph index
_feat_CyclomaticNumber
Cyclomatic number of molecular graph
_feat_HyperWienerIndex
Hyper Wiener Index of molecular graph
_feat_WienerIndex
Wiener Index of molecular graph
_feat_WienerPolarity
Wiener Polarity of molecular graph
_feat_MinimalProjectionArea
Minimal projection area
_feat_MinimalProjectionRadius
Minimal projection radius
_feat_MinimalProjectionRadius
Minimal projection radius
_feat_MaximalProjectionRadius
Maximal projection radius
_feat_LengthPerpendicularToTheMinArea
(minimalprojectionsize)
Length perpendicular to the minimal projection area
_feat_LengthPerpendicularToTheMaxArea
(maximalprojectionsize)
Length perpendicular to the maximum projection area
_feat_VanderWaalsVolume
(volume)
van der Waals volume of the molecule
_feat_VanderWaalsSurfaceArea
(vdwsa)
van der Waals surface area of the molecule
_feat_ASA
(asa -H _rxn_pH)
Water accessible surface area of the molecule, computed at _rxn_pH
_feat_ASA+
(molecularsurfacearea -t ASA+ -H _rxn_pH)
Water accessible surface area of all atoms with positive partial charge,computed at _rxn_pH
_feat_ASA-
(molecularsurfacearea -t ASA- -H _rxn_pH)
Water accessible surface area of all atoms with negative partial charge,computed at _rxn_pH
_feat_ASA_H
(molecularsurfacearea -t ASA_H -H _rxn_pH)
Water accessible surface area of all hydrophobic atoms with positive partial charge,computed at _rxn_pH
_feat_ASA_P
(molecularsurfacearea -t ASA+P -H _rxn_pH)
Water accessible surface area of all polar atoms with positive partial charge,computed at _rxn_pH
_feat_PolarSurfaceArea
(polarsurfacearea -H _rxn_pH)
2D Topological polar surface area, computed at _rxn_pH
_feat_acceptorcount
(acceptorcount -H _rxn_pH)
Hydrogen bond acceptor atom count in molecule, computed at _rxn_pH
_feat_Accsitecount
(acceptorsitecount -H _rxn_pH)
Hydrogen bond acceptor multiplicity in molecule, computed at _rxn_pH
_feat_donorcount
(donorcount -H _rxn_pH)
Hydrogen bond donor atom count in molecule, computed at _rxn_pH
_feat_donsitecount
Hydrogen bond donor multiplicity in molecule, computed at _rxn_pH
_feat_sol
(solubility -H _rxn_pH)
Aqueous solubility (logS) computed at _rxn_pH
_feat_apKa
(pka -a 2)
First and second acidic pKa value. Subsequent columns are the subsequent entries in the returned list.
_feat_bpKa1
(pka -b 4)
First-fourth basic pKa value. Subsequent columns are the"""
class CxFeature:
    """A single descriptor entry: feature name, descriptor string and comment.

    Attributes:
        feature: Name under which the descriptor is stored.
        dstring: Descriptor string; ``JChemCalculator.cal_feature`` joins these
            to build the command line.
        comment: Free-text description of the descriptor.
    """

    def __init__(self, feature: str, dstring: str, comment: str):
        self.feature, self.dstring, self.comment = feature, dstring, comment

    def __repr__(self):
        # An instance prints as its descriptor string.
        return self.dstring
from collections import OrderedDict
def get_Feautre2Dstring(si_string: str):
    """Parse a descriptor table into an ordered ``feature -> CxFeature`` map.

    ``si_string`` holds entries separated by blank lines. Each entry is either

    * two lines: ``_feat_<Name>`` and a free-text comment. The descriptor
      string is the lower-cased name.
    * three lines: ``_feat_<Name>``, ``(<descriptor string>)`` and a comment.

    If the comment mentions ``_rxn_pH`` (case-insensitive), `` -H _rxn_pH`` is
    appended to the descriptor string. For three-line entries this is skipped
    when the descriptor string already contains the placeholder.

    Args:
        si_string: The descriptor table text.

    Returns:
        An ``OrderedDict`` keyed by lower-cased feature name (prefix removed),
        in input order. A repeated name replaces the earlier entry.

    Raises:
        ValueError: If an entry does not have exactly two or three lines.
    """
    prefix = "_feat_"

    def feature_name(name_line):
        # Remove the literal prefix. ``str.strip("_feat_")`` would instead
        # strip any of the characters "_", "f", "e", "a", "t" from BOTH ends,
        # turning e.g. "_feat_apKa" into "pK".
        name = name_line.strip()
        if name.startswith(prefix):
            name = name[len(prefix):]
        return name.strip().lower()

    fd = OrderedDict()
    for des_string in si_string.split("\n\n"):
        lines = des_string.split("\n")
        if len(lines) == 2:
            feature = feature_name(lines[0])
            comment = lines[1]
            if "rxn_ph" in comment.lower():
                dstring = "{} -H _rxn_pH".format(feature)
            else:
                dstring = feature
        elif len(lines) == 3:
            feature = feature_name(lines[0])
            dstring = lines[1].strip().strip("(").strip(")")
            comment = lines[2]
            # Only add the pH option if the explicit command lacks it.
            if "rxn_ph" in comment.lower() and "rxn_ph" not in dstring.lower():
                dstring = "{} -H _rxn_pH".format(dstring)
        else:
            raise ValueError("{}\n cannot be parsed!".format(des_string))
        fd[feature] = CxFeature(feature, dstring, comment)
    return fd
# Parsed once at import time: ordered mapping from lower-cased feature name to
# its CxFeature, built from the Jia2019Si text.
Jia2019FeatureDict = get_Feautre2Dstring(Jia2019Si)
class JChemCalculator:
    """Runs the ``cxcalc`` executable on a list of SMILES strings."""

    def __init__(self, cxexe="/home/ai/localpkg/jchemsuite/bin/cxcalc"):
        """Store the path (or command) used to invoke ``cxcalc``."""
        self.cxexe = cxexe

    def cal_feature(self, features: "list[CxFeature]", smiles: "list[str]", rmtmp=True, rxnph: float = None):
        """Compute the requested descriptors and return them as a DataFrame.

        The SMILES are written one per line to a temporary ``jchem<hash>.smiles``
        file in the current directory, ``cxcalc`` is invoked on that file, and
        its tab-separated stdout is parsed with pandas.

        Args:
            features: Objects exposing a ``dstring`` attribute; the strings are
                joined with spaces to form the descriptor arguments.
            smiles: SMILES strings to evaluate.
            rmtmp: If true, remove the temporary input file afterwards, also
                when the run or the parsing fails.
            rxnph: pH substituted for the ``_rxn_pH`` placeholder. With
                ``None``, every feature whose ``dstring`` mentions the
                placeholder is skipped.

        Returns:
            The DataFrame parsed from the command's stdout.

        Raises:
            ValueError: If ``rxnph`` is neither a number nor ``None``.
        """
        # Validate and build the arguments first, so that a bad ``rxnph`` does
        # not leave a stray temporary file behind.
        if rxnph is None:
            dstring = " ".join(
                cf.dstring for cf in features if "rxn_ph" not in cf.dstring.lower()
            )
        elif isinstance(rxnph, (int, float)) and not isinstance(rxnph, bool):
            dstring = " ".join(cf.dstring for cf in features)
            # str.replace returns a new string; the result must be kept or the
            # placeholder would reach the command line unchanged.
            dstring = dstring.replace("_rxn_pH", str(rxnph))
        else:
            raise ValueError("rxnph must be a number or None!: {}".format(rxnph))

        instring = "\n".join(smiles)
        tmpfn = "jchem" + str(hash(instring)) + ".smiles"
        with open(tmpfn, "w") as f:
            f.write(instring)

        try:
            # NOTE(review): the command is a shell string; ``cxexe`` and the
            # descriptor strings must come from trusted sources.
            cmd = "{} {} {}".format(self.cxexe, dstring, tmpfn)
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
            out, _ = proc.communicate()
            return pd.read_csv(StringIO(out.decode("utf-8")), sep="\t")
        finally:
            if rmtmp:
                removefile(tmpfn)