-
Notifications
You must be signed in to change notification settings - Fork 31
/
io.py
249 lines (216 loc) · 8.93 KB
/
io.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import logging
def parse_list_to_xyz_str(mol: list, comment: str = "", number_coordinates: int = None):
    """Convert list of atom and coordinates list into xyz-string.

    The result consists of the atom count line, the comment line and one line per atom with the
    atom label followed by its coordinates formatted with ten decimal places.

    Args:
        mol (list): Tuple or list of `[['C', 'H', ...], [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], ... ]]`.
        comment (str): Comment for comment line in xyz string. Must not contain a line break.
            Default is "".
        number_coordinates (int): If given, only the first `number_coordinates` values of each
            coordinate entry are written. Default is None, which writes all values.

    Returns:
        str: Information in xyz-string format.

    Raises:
        ValueError: If the number of atoms differs from the number of coordinate entries or if
            `comment` contains a line break.
    """
    atoms = mol[0]
    coordinates = mol[1]
    if len(atoms) != len(coordinates):
        raise ValueError("Number of atoms does not match number of coordinates for xyz string.")
    if "\n" in comment:
        raise ValueError("Line break must not be in the comment line for xyz string.")
    # Collect all lines first and join once; avoids re-copying the growing string per atom.
    xyz_lines = [str(int(len(atoms))), comment]
    for atom, position in zip(atoms, coordinates):
        if number_coordinates is not None:
            position = position[:number_coordinates]
        xyz_lines.append(str(atom) + "".join(" {:.10f}".format(value) for value in position))
    # Every line, including the last one, is terminated by a line break.
    return "\n".join(xyz_lines) + "\n"
def write_list_to_xyz_file(filepath: str, mol_list: list):
    """Write several molecules into a single xyz-file, one xyz-string directly after the other.

    Every entry of `mol_list` is converted with :obj:`parse_list_to_xyz_str` using its default
    comment and coordinate settings.

    Args:
        filepath (str): Full path to file including name.
        mol_list (list): List of molecules, which is a list of pairs of atoms and coordinates of
            `[[['C', 'H', ... ], [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], ... ]], ... ]`.
    """
    with open(filepath, "w+") as xyz_file:
        # The generator is consumed lazily, so each molecule is converted right before it is written.
        xyz_file.writelines(parse_list_to_xyz_str(entry) for entry in mol_list)
def parse_mol_str(mol_str: str):
    """Parse MDL mol table string into nested list. Only supports V2000 format and CTab. Better rely on
    OpenBabel to do this. This function was a temporary solution.

    All values are returned as stripped strings taken from the fixed-width columns; no numeric
    conversion is done on the returned entries.

    Args:
        mol_str (str): String of mol block.

    Returns:
        list: [title, program, comment, counts, atoms, bonds, properties]. If the string is empty
            or has fewer than four lines, an error is logged and `["", "", "", [], [], [], []]`
            is returned.

    Raises:
        NotImplementedError: If the version token at the end of the counts line is not "V2000".
        ValueError: If the atom or bond count of the counts line is not an integer.
    """
    empty_return = ["", "", "", [], [], [], []]
    if not mol_str:
        logging.error("Received empty MLD mol-block string. Nothing to parse. Return empty list.")
        return empty_return
    lines = mol_str.split("\n")
    if len(lines) < 4:
        logging.error("Could not find counts line. Invalid format. Can not parse string. Return empty list.")
        return empty_return
    title = lines[0]
    program = lines[1]  # IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR
    comment = lines[2]
    counts_line = lines[3]
    version = counts_line[-6:].strip()
    if version != "V2000":
        raise NotImplementedError("Can not parse mol V3000 or higher.")

    # counts has aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv
    # or shorter but should have version of len=5 at the end
    count_fields = [counts_line[i:i + 3].strip() for i in range(0, len(counts_line[:-6]), 3)]
    counts = count_fields + [version]

    def _optional_count(index):
        # A shortened counts line may omit trailing fields; treat missing or blank ones as zero
        # instead of reading past the fields or into the version token.
        if index < len(count_fields) and count_fields[index] != "":
            return int(count_fields[index])
        return 0

    na = int(counts[0])
    nb = int(counts[1])
    nl = _optional_count(2)
    ns = _optional_count(5)
    if ns != 0 or nl != 0:
        logging.warning("Not supporting atom lists (deprecated) or stext entries for this function.")

    # Atom block.
    # xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
    atom_columns = (0, 10, 20, 30, 34, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69)
    atoms = [
        [a[start:stop].strip() for start, stop in zip(atom_columns, atom_columns[1:])]
        for a in lines[4:(na + 4)]
    ]

    # Bond block.
    # 111222tttsssxxxrrrccc
    bonds = [
        [b[i:i + 3].strip() for i in range(0, len(b), 3)]
        for b in lines[4 + na:4 + na + nb]
    ]

    # Properties block.
    properties = []
    for p in lines[4 + na + nb:]:
        # The terminator is "M  END"; compare tokens so the amount of whitespace does not matter.
        if p.split() == ["M", "END"]:
            break
        # Any remaining line that contains an "M" is kept verbatim.
        if "M" in p:
            properties.append(p)
    return [title, program, comment, counts, atoms, bonds, properties]
def read_xyz_file(file_path, delimiter: str = None, line_by_line=False):
    """Simple python script to read xyz-file and parse into a nested python list. Always returns a list with
    the geometries in xyz file.

    The file is expected to hold one or more geometries, each made of a line with the atom count,
    one comment line and then one line per atom with the atom label followed by its coordinates.
    Atom labels are normalised to a capitalised form (e.g. "cl" becomes "Cl"). Comment lines are
    skipped and not returned.

    Args:
        file_path (str): Full path to xyz-file.
        delimiter (str): Delimiter for xyz separation. Default is None, which splits on any
            run of whitespace.
        line_by_line (bool): Whether to read XYZ file line by line instead of loading all lines
            into memory first.

    Returns:
        list: Nested coordinates from xyz-file as `[[atoms, coordinates], ...]`, one entry per geometry.

    Raises:
        ValueError: If an atom count or a coordinate value can not be converted to a number.
        IndexError: If a line in an atom block contains no entries at all.
    """
    mol_list = []
    num = 0  # Number of atom lines still expected for the current geometry.
    comment = 0  # Set to 1 while the next line is the comment line.
    atoms = []
    coordinates = []
    # The context manager closes the file even if parsing raises part way through.
    with open(file_path, "r") as infile:
        lines = infile if line_by_line else infile.readlines()
        for line in lines:
            line_list = line.strip().split(delimiter)
            line_list = [x.strip() for x in line_list if x != ""]  # Remove multiple delimiter
            if len(line_list) == 1 and num == 0 and comment == 0:
                # Start new conformer and set line counts to read.
                num = int(line_list[0])
                comment = 1
            elif comment > 0:
                # Comment comes before atom block and must always be consumed, whatever it contains.
                comment = 0
            elif num > 0:
                if len(line_list) <= 1:
                    logging.error("Expected to read atom-coordinate block but got comment or line count instead.")
                atoms.append(str(line_list[0]).lower().capitalize())
                coordinates.append([float(x) for x in line_list[1:]])
                if num == 1:
                    # This was last line for this conformer. Append result and reset current list.
                    mol_list.append([atoms, coordinates])
                    num = 0
                    atoms = []
                    coordinates = []
                else:
                    # Finished reading an atom line.
                    num = num - 1
            else:
                logging.warning("Empty line in xyz file for mismatch in atom count found.")
    return mol_list
def write_mol_block_list_to_sdf(mol_block_list, filepath):
    """Write a list of mol blocks as string into a SDF file.

    Blocks are separated by a `$$$$` line; no separator is written after the last block. An
    entry that is None is replaced by an empty placeholder mol block titled "FAIL", so that the
    number and order of records in the file matches `mol_block_list`.

    Args:
        mol_block_list (list): List of mol blocks as string. Entries may be None.
        filepath (str): File path for SDF file.

    Returns:
        None.
    """
    # Placeholder record with zero atoms and bonds. The counts line uses the fixed-width
    # three-character fields and the "M  END" terminator of the V2000 format, so that the
    # record can be parsed like any other mol block.
    fail_block = "".join(["\n",
                          " FAIL\n",
                          "\n",
                          "  0  0  0  0  0  0  0  0  0  0 V2000\n",
                          "M  END\n"])
    last_index = len(mol_block_list) - 1
    with open(filepath, "w+") as file:
        for i, mol_block in enumerate(mol_block_list):
            file.write(fail_block if mol_block is None else mol_block)
            if i < last_index:
                file.write("$$$$\n")
def read_mol_list_from_sdf_file(filepath, line_by_line=False):
    """Simple loader to load an SDF file by only splitting.

    Records are split at lines that consist of `$$$$` followed by a line break. The mol blocks
    are returned unparsed and without the separator line.

    Args:
        filepath (str): File path for SDF file.
        line_by_line (bool): Whether to read SDF file line by line instead of reading the whole
            file into one string first.

    Returns:
        list: List of mol blocks as string. An empty file gives an empty list.
    """
    with open(filepath, "r") as f:
        if not line_by_line:
            mol_list = f.read().split("$$$$\n")
        else:
            mol_list = []
            current_block = []  # Lines of the record that is currently being collected.
            for line in f:
                if line == "$$$$\n":
                    mol_list.append("".join(current_block))
                    current_block = []
                else:
                    current_block.append(line)
            # Last record without a tailing separator.
            if current_block:
                mol_list.append("".join(current_block))
    # Check if there was tailing $$$$ with nothing to follow.
    # Split will make empty string at the end, which does not match actual number of mol blocks.
    # The list itself can be empty (empty file read line by line), so guard the index access.
    if mol_list and len(mol_list[-1]) == 0:
        mol_list = mol_list[:-1]
    return mol_list
def read_smiles_file(file_path):
    """Load a smiles file that holds one smiles string per line.

    Trailing whitespace, including the line break, is removed from every line. Lines are not
    filtered, so an empty line gives an empty string in the result.

    Args:
        file_path (str): File path for smiles file.

    Returns:
        list: List of smiles.
    """
    smile_list = []
    with open(file_path, "r") as smiles_file:
        for raw_line in smiles_file:
            smile_list.append(raw_line.rstrip())
    return smile_list
def write_smiles_file(file_path, smile_list):
    """Simply python function to write smiles to file.

    One smiles string is written per line. No line break is written after the last entry.

    Args:
        file_path (str): File path for smiles file.
        smile_list (list): List of smiles to write to file. Any iterable of strings is accepted.

    Returns:
        None
    """
    with open(file_path, "w+") as f:
        # Joining puts the separator only between entries, so the last line stays unterminated.
        f.write("\n".join(smile_list))