-
Notifications
You must be signed in to change notification settings - Fork 15
/
process_db.py
97 lines (73 loc) · 2.5 KB
/
process_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import numpy as np
import scipy.sparse
import h5py
import sys
import os
def h5_reader(file_name):
    """Open an HDF5 file read-only and return its ``matrix/data`` dataset.

    The dataset's shape is printed as a progress/diagnostic message.

    Args:
        file_name: Path of the HDF5 file to open.

    Returns:
        The object stored under the key ``'matrix/data'`` in the file.

    Note:
        The file is not closed here; the returned dataset is read from the
        still-open file by the caller.
    """
    dataset = h5py.File(file_name, 'r')['matrix/data']
    print('H5 dataset shape:', dataset.shape)
    return dataset
def to_sparse_mat(file_name):
    """Convert the ``matrix/data`` dataset of an HDF5 file to a sparse ``.npz``.

    The dataset is loaded into memory, transposed, stored as a SciPy CSR
    matrix and saved next to the input file, with the input's extension
    replaced by ``.npz`` (``dir/sample.h5`` -> ``dir/sample.npz``).

    Args:
        file_name: Path of the HDF5 file to convert.
    """
    h5_data = h5_reader(file_name)
    sparse_data = scipy.sparse.csr_matrix(np.array(h5_data).transpose())
    # Swap only the file extension. A plain str.replace('h5', 'npz') would
    # also rewrite any 'h5' occurring in a directory or base name and send
    # the output to an unrelated (possibly non-existent) location.
    root, _ext = os.path.splitext(file_name)
    scipy.sparse.save_npz(root + '.npz', sparse_data)
def data_parsing(rna_h5_files, atac_h5_files):
    """Convert every listed HDF5 file to a sparse ``.npz`` file.

    All RNA files are processed first, then all ATAC files; each one is
    handed to ``to_sparse_mat``.

    Args:
        rna_h5_files: Iterable of RNA HDF5 file paths.
        atac_h5_files: Iterable of ATAC HDF5 file paths.
    """
    for h5_group in (rna_h5_files, atac_h5_files):
        for h5_path in h5_group:
            to_sparse_mat(h5_path)
def _read_labels(label_file):
    """Return the label column (second field) of a label CSV, in row order.

    The first line is treated as a header and skipped. Completely empty
    lines are ignored. Double-quote characters are removed from the label.
    """
    import csv  # local import: keeps this block independent of the file header

    labels = []
    with open(label_file, newline='') as fp:
        reader = csv.reader(fp)
        next(reader, None)  # skip the header row (if any)
        for row in reader:
            if not row:
                # Blank line (e.g. trailing newline at end of file).
                continue
            # csv.reader keeps quoted fields containing commas intact;
            # remaining quote characters are dropped as before.
            labels.append(row[1].replace('"', ''))
    return labels


def _label_txt_path(label_file):
    """Return ``label_file`` with its extension replaced by ``.txt``."""
    root, _ext = os.path.splitext(label_file)
    return root + '.txt'


def label_parsing(rna_label_files, atac_label_files):
    """Translate text labels in CSV files into integer index files.

    The label vocabulary is built from the RNA label files only: the unique
    labels are sorted and numbered from 0. The following files are written:

    * ``label_to_idx.txt`` in the directory of the first RNA label file,
      one ``"<label> <index>"`` line per label (skipped when there are no
      RNA label files);
    * for every RNA and ATAC label file, a sibling ``.txt`` file holding
      one index per data row. ATAC labels that do not occur in the RNA
      vocabulary are written as ``-1``.

    Args:
        rna_label_files: Sequence of RNA label CSV paths (header line first,
            label in the second column).
        atac_label_files: Sequence of ATAC label CSV paths, same layout.
    """
    # Read every RNA file once and reuse the result for both the vocabulary
    # and the per-file index output.
    rna_labels_per_file = [_read_labels(path) for path in rna_label_files]
    total_rna_labels = [
        label for labels in rna_labels_per_file for label in labels
    ]

    # label to idx
    label_idx_mapping = {}
    for i, name in enumerate(np.unique(total_rna_labels)):
        label_idx_mapping[name] = i
    print(label_idx_mapping)

    if rna_label_files:
        # os.path.join keeps a bare file name relative to the working
        # directory; string concatenation with "/" would point at the
        # filesystem root when dirname() is empty.
        mapping_path = os.path.join(
            os.path.dirname(rna_label_files[0]), "label_to_idx.txt"
        )
        with open(mapping_path, "w") as fp:
            for key in sorted(label_idx_mapping):
                fp.write(key + " " + str(label_idx_mapping[key]) + '\n')

    # gen rna label files
    for label_file, labels in zip(rna_label_files, rna_labels_per_file):
        with open(_label_txt_path(label_file), 'w') as rna_label_f:
            for label in labels:
                rna_label_f.write(str(label_idx_mapping[label]) + '\n')

    # gen atac label files
    for label_file in atac_label_files:
        labels = _read_labels(label_file)
        with open(_label_txt_path(label_file), 'w') as atac_label_f:
            for label in labels:
                # give an unknown label (-1) to anything not seen in RNA
                atac_label_f.write(str(label_idx_mapping.get(label, -1)) + '\n')
if __name__ == '__main__':
    # Input file lists; they are empty here and must be filled in with
    # paths before the script does any work.

    # HDF5 inputs converted by the first data_parsing() call.
    rna_h5_files = []
    atac_h5_files = []

    # Label inputs consumed by label_parsing().
    rna_label_files = []  # csv file
    atac_label_files = []

    # HDF5 inputs converted by the second data_parsing() call.
    rna_protein_files = []  # h5 file
    atac_protein_files = []

    # Order of operations: matrices -> labels -> protein matrices.
    data_parsing(rna_h5_files, atac_h5_files)
    label_parsing(rna_label_files, atac_label_files)
    data_parsing(rna_protein_files, atac_protein_files)