-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalyze_and_compute_scores.py
206 lines (171 loc) · 8.41 KB
/
analyze_and_compute_scores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import pickle
import math
import unicodedata
import os
import pandas as pd
import numpy as np
from statistics import mode
from collections import Counter
import sklearn
import math
def get_first_mode(a):
    """Return the most frequent element of ``a``, breaking ties by first occurrence.

    Args:
        a: Iterable of hashable items. One-shot iterators (generators,
            ``iter(...)``) are supported because the input is traversed
            only once.

    Returns:
        The item with the highest count. When several items share the
        highest count, the one that appears earliest in ``a`` is returned.

    Raises:
        ValueError: If ``a`` contains no elements.
    """
    counts = Counter(a)
    if not counts:
        raise ValueError("get_first_mode() requires at least one element")
    highest = max(counts.values())
    # Counter keeps its keys in first-insertion order, so the first key that
    # reaches the highest count is also the earliest such item in the input.
    return next(item for item, count in counts.items() if count == highest)
def BetaScore(x, n):
    """Return ``(a - 1) / (a + b - 2)`` for ``a = 2 + x`` and ``b = 2 + n - x``.

    This expression is the mode of a Beta(a, b) distribution; algebraically
    it equals ``(x + 1) / (n + 2)``.

    Args:
        x: Count added to the first shape parameter (presumably the number
            of positive observations; confirm against callers).
        n: Total count; ``n - x`` is added to the second shape parameter.

    Returns:
        The score as a float; 0.5 when ``x`` is exactly half of ``n``.
    """
    prior = 2
    alpha = prior + x
    beta = prior + n - x
    return (alpha - 1) / (alpha + beta - 2)
def initialize_results():
    """Build the empty container that collects per-drug results.

    Returns:
        dict mapping every result-table name to its own fresh, empty dict.
        The tables are filled later, keyed by drug.
    """
    table_names = (
        'drugs2direction_all',
        'drugs2direction_clinical',
        'drugs2points_RS',
        'drugs2points_RS_clinical',
        'drugs2negative_clinical',
        'drugs2direction_CS_clinical',
        'drugs2negative',
        'drugs2direction_CS',
        'drugs2positive_clinical',
        'drugs2tot_clinical',
        'drugs2tot',
        'drugs2positive',
        'drugs2num_positive',
        'drugs2num_negative',
        'drugs2num_positive_clinical',
        'drugs2num_negative_clinical',
        'drugs2direction_clinistr',
    )
    # A comprehension creates a distinct dict per table (no shared state).
    return {name: {} for name in table_names}
def parse_pointstr(pointstr):
    """Parse an ``id:points;id:points`` string into an ``{id: points}`` dict.

    Args:
        pointstr: Semicolon-separated ``id:points`` pairs. The values
            ``'NA'``, ``''`` and ``None`` all mean "no points".

    Returns:
        dict mapping each id (str) to its points (int). Empty when there
        are no points. If an id is repeated, the last occurrence wins.

    Raises:
        IndexError: If a non-empty segment contains no ``:``.
        ValueError: If the points part of a segment is not an integer.
    """
    if pointstr is None or pointstr == 'NA':
        return {}
    id2points = {}
    for segment in pointstr.split(';'):
        # Skip blank segments so '' and a trailing or doubled ';' do not crash.
        if not segment.strip():
            continue
        fields = segment.split(':')
        id2points[fields[0]] = int(fields[1])
    return id2points
def calculate_relevance_score(all_points):
    """Combine per-item points into one score: ``1 - prod(1 - p / 100)``.

    Args:
        all_points: Iterable of numeric points. Each value is divided by
            100 before use (presumably percentages; confirm with the
            producer of the data). Zero entries are ignored.

    Returns:
        The combined score. When there are no non-zero points the result
        is the integer 0.
    """
    remaining = 1
    for points in all_points:
        if points == 0:
            continue
        remaining *= 1 - points / 100
    return 1 - remaining
# Function to calculate the confidence score
def calculate_scores(pos, neg, tot):
    """Compute counts, direction and confidence score for one drug.

    Args:
        pos: Sized collection of items counted as positive.
        neg: Sized collection of items counted as negative.
        tot: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(x, xneg, n, direction, score)`` where ``x`` and ``xneg``
        are the sizes of ``pos`` and ``neg``, ``n`` is their sum,
        ``direction`` is ``'+'``, ``'-'`` or ``'0'`` depending on which
        collection is larger, and ``score`` is ``BetaScore(x, n)`` replaced
        by ``1 - score`` whenever it is below 0.5.
    """
    x = len(pos)
    xneg = len(neg)
    n = x + xneg
    scoreBayes_all = BetaScore(x, n)
    # Report the confidence in the winning direction, whichever it is.
    if scoreBayes_all < 0.5:
        scoreBayes_all = 1 - scoreBayes_all
    # The direction depends only on the counts; earlier score-based
    # assignments were always overwritten by this comparison.
    if x > xneg:
        direction = '+'
    elif x < xneg:
        direction = '-'
    else:
        direction = '0'
    return x, xneg, n, direction, scoreBayes_all
# If the drug has no clinical IDs, reset its clinical entries to 0 or empty
def reset_clinical_results(drug, results):
    """Store neutral clinical values for ``drug`` in ``results``.

    Args:
        drug: Key under which the values are stored.
        results: Mapping of table name to per-drug dict; modified in place.

    Direction becomes ``'0'``, the score and counts become ``0``, and the
    positive/negative id strings become empty.
    """
    neutral_values = (
        ('drugs2direction_clinical', '0'),
        ('drugs2direction_CS_clinical', 0),
        ('drugs2num_positive_clinical', 0),
        ('drugs2num_negative_clinical', 0),
        ('drugs2tot_clinical', 0),
        ('drugs2negative_clinical', ''),
        ('drugs2positive_clinical', ''),
    )
    for table, neutral in neutral_values:
        results[table][drug] = neutral
# Function to update dataframe with results
def update_dataframe(df, results, clinical):
    """Add score columns to ``df`` by mapping its ``Drug`` column.

    Args:
        df: DataFrame with a ``Drug`` column; modified in place.
        results: Mapping of table name to per-drug dict.
        clinical: If truthy, use the clinical tables and write the total
            to ``clinicalTot``; otherwise use the all-paper tables and
            write the total to ``num_paperTot``.
    """
    clinical_tables = {
        "Direction": 'drugs2direction_clinical',
        "RS": 'drugs2points_RS_clinical',
        "CS": 'drugs2direction_CS_clinical',
        "NegativeTot": 'drugs2num_negative_clinical',
        "PositiveTot": 'drugs2num_positive_clinical',
        "clinicalTot": 'drugs2tot_clinical',
    }
    all_paper_tables = {
        "Direction": 'drugs2direction_all',
        "RS": 'drugs2points_RS',
        "CS": 'drugs2direction_CS',
        "NegativeTot": 'drugs2num_negative',
        "PositiveTot": 'drugs2num_positive',
        "num_paperTot": 'drugs2tot',
    }
    column_to_table = clinical_tables if clinical else all_paper_tables
    # Dicts iterate in insertion order, so columns are added in the order above.
    for column, table in column_to_table.items():
        df[column] = df["Drug"].map(results[table])
def save_to_excel(df, filepath, results, clinical):
    """Write ``df`` to an Excel workbook with one sheet per direction.

    Args:
        df: DataFrame with a ``Direction`` column.
        filepath: Destination path of the workbook.
        results: Unused in this function.
        clinical: Unused in this function.

    Each distinct value of ``Direction`` gets a sheet named after it that
    holds the matching rows, written without the index.
    """
    # NOTE(review): a NaN direction never compares equal to itself, so rows
    # with a missing direction would land in no sheet; confirm this is intended.
    with pd.ExcelWriter(filepath) as writer:
        for direction in df["Direction"].unique():
            matches = df["Direction"] == direction
            df[matches].to_excel(writer, sheet_name=f"{direction}", index=False)
def load_data_if_none(drugs2posneg, drug2pubid_clinical, df_clinical, df_allDrug, fp_dict, fpath_txt):
    """Load from disk every input that was passed as ``None``.

    Args:
        drugs2posneg: Object to use, or ``None`` to unpickle the file
            ``drugs2posneg`` from ``fp_dict``.
        drug2pubid_clinical: Object to use, or ``None`` to unpickle the
            file ``drug2article_clinical`` from ``fp_dict``.
        df_clinical: DataFrame to use, or ``None`` to read
            ``Drugs_Clinical_papers_pubmedID.xlsx`` from ``fpath_txt``.
        df_allDrug: DataFrame to use, or ``None`` to read
            ``Drugs_papers_pubmedID.xlsx`` from ``fpath_txt``.
        fp_dict: Directory containing the pickle files.
        fpath_txt: Directory containing the Excel files.

    Returns:
        Tuple ``(drugs2posneg, drug2pubid_clinical, df_clinical,
        df_allDrug)``; arguments that were not ``None`` are returned
        unchanged.
    """
    # SECURITY: unpickling can execute arbitrary code, so these files must
    # come from a trusted source.
    if drugs2posneg is None:
        # The context manager closes the file even if unpickling fails.
        with open(os.path.join(fp_dict, 'drugs2posneg'), 'rb') as handle:
            drugs2posneg = pickle.load(handle)
    if drug2pubid_clinical is None:
        with open(os.path.join(fp_dict, 'drug2article_clinical'), 'rb') as handle:
            drug2pubid_clinical = pickle.load(handle)
    if df_clinical is None:
        df_clinical = pd.read_excel(os.path.join(fpath_txt, "Drugs_Clinical_papers_pubmedID.xlsx"))
    if df_allDrug is None:
        df_allDrug = pd.read_excel(os.path.join(fpath_txt, "Drugs_papers_pubmedID.xlsx"))
    return drugs2posneg, drug2pubid_clinical, df_clinical, df_allDrug
def main(drugs2posneg=None, drug2pubid_clinical=None, df_clinical=None, df_allDrug=None):
    """Score every drug and write the clinical and all-paper Excel reports.

    Inputs passed as ``None`` are loaded from the ``dictionary`` and
    ``txt`` folders under the current working directory. Both reports are
    written to the ``txt`` folder.

    Args:
        drugs2posneg: Mapping of drug to a sequence whose first four items
            are the positive ids, the negative ids, the points string and
            a total; drugs whose total is 0 are skipped.
        drug2pubid_clinical: Mapping of drug to a comma-separated string
            of clinical ids; non-string values are ignored.
        df_clinical: DataFrame with ``Drug`` and ``#paper`` columns.
        df_allDrug: DataFrame with a ``Drug`` column.
    """
    path_base = os.getcwd()
    fpath_txt = os.path.join(path_base, 'txt')
    fp_dict = os.path.join(path_base, 'dictionary')
    drugs2posneg, drug2pubid_clinical, df_clinical, df_allDrug = load_data_if_none(
        drugs2posneg, drug2pubid_clinical, df_clinical, df_allDrug, fp_dict, fpath_txt)
    results = initialize_results()

    for drug, lists in drugs2posneg.items():
        pos, neg, pointstr, tot = lists[0], lists[1], lists[2], lists[3]
        if tot == 0:
            continue
        id2points = parse_pointstr(pointstr)

        # Clinical scores: only ids that also appear in id2points are used.
        if drug in drug2pubid_clinical:
            clinical_ids = drug2pubid_clinical[drug]
            if isinstance(clinical_ids, str):
                clinical_ids_split = clinical_ids.split(',')
                all_points_clin = [id2points[clin] for clin in clinical_ids_split if clin in id2points]
                results['drugs2points_RS_clinical'][drug] = calculate_relevance_score(all_points_clin)
                clinical_ids_w_abst = set(clinical_ids_split).intersection(id2points)
                if clinical_ids_w_abst:
                    pos_clin = clinical_ids_w_abst.intersection(pos)
                    neg_clin = clinical_ids_w_abst.intersection(neg)
                    x_clinical = len(pos_clin)
                    x_clinical_neg = len(neg_clin)
                    n_clinical = x_clinical + x_clinical_neg
                    scoreBayes_clinical = BetaScore(x_clinical, n_clinical)
                    if scoreBayes_clinical == 0.5:
                        direction_clin = '0'
                    elif scoreBayes_clinical > 0.5:
                        direction_clin = '+'
                    else:
                        direction_clin = '-'
                    results['drugs2direction_clinical'][drug] = direction_clin
                    # NOTE(review): unlike calculate_scores, the clinical score is
                    # stored as-is and not replaced by 1 - score when below 0.5;
                    # confirm this difference is intended.
                    results['drugs2direction_CS_clinical'][drug] = scoreBayes_clinical
                    results['drugs2num_positive_clinical'][drug] = x_clinical
                    results['drugs2num_negative_clinical'][drug] = x_clinical_neg
                    results['drugs2tot_clinical'][drug] = n_clinical
                    results['drugs2negative_clinical'][drug] = ','.join(neg_clin)
                    results['drugs2positive_clinical'][drug] = ','.join(pos_clin)
                else:
                    # NOTE(review): this branch is assumed to belong to the
                    # "no usable clinical ids" test above, because the clinical
                    # relevance score is already stored at this point and the
                    # reset does not set it; confirm against the upstream file.
                    reset_clinical_results(drug, results)

        # Scores over all papers of the drug.
        all_points = list(id2points.values())
        results['drugs2points_RS'][drug] = calculate_relevance_score(all_points)
        x, xneg, n, direction, scoreBayes_all = calculate_scores(pos, neg, tot)
        results['drugs2direction_all'][drug] = direction
        results['drugs2direction_CS'][drug] = scoreBayes_all
        results['drugs2num_positive'][drug] = x
        results['drugs2num_negative'][drug] = xneg
        results['drugs2tot'][drug] = n
        results['drugs2negative'][drug] = ','.join(neg)
        results['drugs2positive'][drug] = ','.join(pos)

    update_dataframe(df_clinical, results, clinical=True)
    update_dataframe(df_allDrug, results, clinical=False)
    # Only drugs with at least one paper are kept in the clinical report.
    df_clinical = df_clinical[df_clinical['#paper'] > 0]
    save_to_excel(df_clinical, os.path.join(fpath_txt, "Clinical_Drugs_scores.xlsx"), results, clinical=True)
    save_to_excel(df_allDrug, os.path.join(fpath_txt, "all_Drugs_Scores.xlsx"), results, clinical=False)
    print( "Clinical_Drugs_scores.xlsx and all_Drugs_Scores.xlsx have been saved to the path \n"+ fpath_txt)