-
Notifications
You must be signed in to change notification settings - Fork 1
/
leaf.py
158 lines (137 loc) · 5.67 KB
/
leaf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
""" Local environment-based atomic features
"""
import os
import sys
import numpy as np
import pandas as pd
from ase.data import chemical_symbols
from pymatgen.analysis.local_env import *
from pymatgen.io.cif import CifParser
from matminer.featurizers.site.fingerprint import OPSiteFingerprint, VoronoiFingerprint
class Leaf:
    """Collect local-environment features per chemical element.

    ``self.onehot`` maps every symbol in ``chemical_symbols`` to a dict of
    ``'<feature name>_<value>'`` -> number of occurrences. It is filled by
    :meth:`expand_onehot` and converted to arrays by :meth:`sort_features`.
    The features come from site fingerprints (``OPSiteFingerprint``, called
    "lostops" in this file, and ``VoronoiFingerprint``).
    """

    def __init__(self, featurizer=None):
        """Set up the featurizer and the empty per-element tables.

        Args:
            featurizer: object exposing ``featurize(structure, site_index)``.
                Defaults to a new ``OPSiteFingerprint()``. It is created here
                rather than in the signature so that merely defining the
                class does not build a featurizer and instances do not
                share a single default object.
        """
        if featurizer is None:
            featurizer = OPSiteFingerprint()
        self.featurizer = featurizer
        self.onehot = {atom: {} for atom in chemical_symbols}
        self.features_names = self.get_features_names()

    def get_features_names(self):
        """Return the OPSiteFingerprint labels followed by the Voronoi labels.

        Voronoi labels containing ``'std'`` are left out.
        """
        lostops = list(OPSiteFingerprint().feature_labels())
        voronoi = [
            label
            for label in VoronoiFingerprint().feature_labels()
            if 'std' not in label
        ]
        return lostops + voronoi

    @staticmethod
    def get_species(site):
        """Return the element symbols found in ``str(site.species)``.

        Every whitespace-separated token is reduced to its alphabetic
        characters and kept only when the result is a known chemical symbol.
        """
        symbols = []
        for token in str(site.species).split():
            letters = ''.join(ch for ch in token if ch.isalpha())
            if letters in chemical_symbols:
                symbols.append(letters)
        return symbols

    @staticmethod
    def readfile(list_cifs):
        """Read a text file listing one CIF path per line (e.g. ``1.dat``).

        Args:
            list_cifs: path of the list file.

        Returns:
            ``(cifs, order)``: the stripped, non-empty lines of the file, and
            the part of the file's base name before its first dot. ``order``
            is used by the callers to tag their output file names, so the
            directory part of ``list_cifs`` is deliberately not included.
        """
        with open(list_cifs, 'r') as handle:
            cifs = [line.strip() for line in handle if line.strip()]
        order = os.path.basename(list_cifs).split('.')[0]
        return cifs, order

    @staticmethod
    def select_features(i, s):
        """Return a subset of the Voronoi fingerprint of site ``i`` in ``s``.

        From all voronoi features the intended selection is:
        'Voro_vol_sum', 'Voro_area_sum', 'Voro_vol_mean', 'Voro_vol_minimum',
        'Voro_vol_maximum', 'Voro_area_mean', 'Voro_area_minimum',
        'Voro_area_maximum', 'Voro_dist_mean', 'Voro_dist_minimum',
        'Voro_dist_maximum'

        NOTE(review): the slice positions below are hard-coded for one
        particular layout of ``VoronoiFingerprint().featurize`` output --
        confirm them against ``feature_labels()`` of the installed matminer.

        Args:
            i: index of the site inside the structure.
            s: the structure containing the site.

        Returns:
            numpy array with the selected values.
        """
        values = list(VoronoiFingerprint().featurize(s, i))
        picked = values[16:19] + values[20:23] + values[24:27] + values[28:31]
        return np.array(picked)

    def average_features_cifs(self, list_cifs):
        """Accumulate the selected Voronoi features per element over CIFs.

        For every structure that can be parsed, the selected Voronoi features
        of each site are added to the running sum of every element found on
        that site, and that element's occurrence count is incremented. Sums
        and counts are written separately (no division is done here):

        * ``Vleaf_<order>.pickle``  -- per-element feature sums
        * ``vnleaf_<order>.pickle`` -- per-element occurrence counts
        * ``icsd_<order>.pickle``   -- composition string and structure of
          every parsed CIF

        The Voronoi values come from :meth:`select_features`;
        ``self.featurizer`` is not used by this method.

        Args:
            list_cifs: path of a file listing CIF paths (see ``readfile``).
        """
        cifs, order = self.readfile(list_cifs)
        icsd = {'composition': [], 'structure': []}
        # Length of the vector accumulated per element (the Voronoi
        # selection); accumulating "lostops" instead would need 37.
        n_features = 11
        leaf = {atom: np.zeros(n_features) for atom in chemical_symbols}
        nleaf = {atom: 0 for atom in chemical_symbols}
        for cif in cifs:
            try:
                structure = CifParser(cif).get_structures()[0]
            except Exception:
                # Best effort: skip CIFs that cannot be read or parsed.
                continue
            icsd['composition'].append(str(structure.composition))
            icsd['structure'].append(structure)
            for i, s in enumerate(structure):
                species = self.get_species(s)
                if not species:
                    continue
                # The fingerprint depends on the site only, so compute it
                # once and reuse it for every element on the site.
                try:
                    features = self.select_features(i, structure)
                except Exception:
                    # Best effort: skip sites whose tessellation fails.
                    continue
                if features.shape != (n_features,):
                    # Cannot be added to the accumulator; skip the site.
                    continue
                for element in species:
                    leaf[element] += features
                    nleaf[element] += 1
        pd.DataFrame(leaf).to_pickle(f'Vleaf_{order}.pickle')
        pd.DataFrame(icsd).to_pickle(f'icsd_{order}.pickle')
        pd.DataFrame(nleaf, index=[order]).to_pickle(f'vnleaf_{order}.pickle')

    def get_features(self, list_cifs):
        """Featurize every site of every CIF; pickle one row per element.

        Each row holds an element symbol and the ``self.featurizer`` output
        for the site followed by the selected Voronoi features, every value
        rounded to 3 decimals. When the Voronoi step fails for a site, its
        rows carry the featurizer values only. Rows are sorted by element and
        written to ``test_features_<order>.pickle``. The composition of each
        parsed structure is printed as progress output.

        Args:
            list_cifs: path of a file listing CIF paths (see ``readfile``).
        """
        cifs, order = self.readfile(list_cifs)
        features = []
        elements = []
        for cif in cifs:
            try:
                structure = CifParser(cif).get_structures()[0]
            except Exception:
                # Best effort: skip CIFs that cannot be read or parsed.
                continue
            print(structure.composition)
            for i, s in enumerate(structure):
                species = self.get_species(s)
                if not species:
                    continue
                try:
                    vfeat = [round(f, 3)
                             for f in self.select_features(i, structure)]
                except Exception:
                    # Best effort: keep the site without Voronoi features.
                    vfeat = []
                feats = [round(f, 3)
                         for f in self.featurizer.featurize(structure, i)]
                feats += vfeat
                for element in species:
                    # Copy so that rows never share one list object.
                    features.append(list(feats))
                    elements.append(element)
        df = pd.DataFrame({'elements': elements, 'features': features})
        # sort_values returns a new frame; keep it so the order is saved.
        df = df.sort_values(by=['elements'])
        df.to_pickle(f'test_features_{order}.pickle')

    def expand_onehot(self, element, features):
        """Count the feature values observed for ``element``.

        For the i-th value ``v`` in ``features`` the column
        ``'<features_names[i]>_<v>'`` is incremented for ``element``. A
        column seen for the first time is created with count 0 for every
        atom, so all atoms always share the same set of columns.

        Args:
            element: key of ``self.onehot`` to update.
            features: values aligned with ``self.features_names``.

        Raises:
            ValueError: if there are more values than feature names.
        """
        names = self.features_names
        if len(features) > len(names):
            raise ValueError(
                f'got {len(features)} feature values '
                f'for {len(names)} feature names')
        for name, value in zip(names, features):
            feature = name + '_' + str(value)
            if feature not in self.onehot[element]:
                for counts in self.onehot.values():
                    counts.setdefault(feature, 0)
            self.onehot[element][feature] += 1

    def sort_features(self):
        """Sort every atom's columns by name and return the counts.

        ``self.onehot`` is rewritten so that each per-atom dict is ordered by
        column name.

        Returns:
            dict mapping each atom to a 1-D numpy array of its counts, in
            column-name order.
        """
        for atom in self.onehot:
            # Keys are unique, so sorting the items sorts by column name.
            self.onehot[atom] = dict(sorted(self.onehot[atom].items()))
        # list(...) is required: np.array on a dict view gives a 0-d object.
        return {a: np.array(list(self.onehot[a].values()))
                for a in self.onehot}
if __name__ == "__main__":
    # Script entry point: the single argument is a text file that lists one
    # CIF path per line (e.g. 1.dat).
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of a bare IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} <file listing CIF paths, e.g. 1.dat>")
    fname = sys.argv[1]
    # Alternative run, kept for reference:
    #   leaf = Leaf(VoronoiFingerprint())
    #   leaf.average_features_cifs(fname)
    leaf = Leaf()
    leaf.get_features(fname)