-
Notifications
You must be signed in to change notification settings - Fork 1
/
functs.py
212 lines (196 loc) · 9.37 KB
/
functs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import numpy as np
import pandas as pd
import math
import networkx as nx
import csv
from statistics import median, mean
import queue as Q
import threading
from numba import autojit
# function to open dicts we saved
def open_dict(vocabularyFile):
cats = open(vocabularyFile, 'r', encoding = "utf8").readlines()
cats2 = {}
for cat in cats:
templ = []
for x in cat.split():
templ.append(x.strip("[").strip("]").strip("'").strip(",").rstrip("''").rstrip('\n'))
try:
int(templ[1])
cats2[templ[0]] = templ[1:]
except:
cats2[templ[0]] = " ".join(templ[1:])
return cats2
# function to save our vocabulary file to disk
def save_dict(vocabulary,fileName="output.csv"):
with open(fileName,'wb') as vfile:
for i in vocabulary.keys():
vfile.write(str(i).encode())
vfile.write(str('\t').encode())
vfile.write(str(vocabulary[i]).encode())
vfile.write('\n'.encode())
vfile.close()
# function to look for the shortest path
def bfs(graph, inp_cat, inp_node, dest_cat, out_q):
# creating a queue
queue = Q.Queue()
#putting the current node in the queue
queue.put([inp_node, 0])
# defining a dictionary to check if we already visited the nodes of graph
visited = {}
# setting the distance of the current node to infinity as a default
sh_path = np.inf
# setting visited to False for every node in the graph
for x in graph.nodes:
visited[x] = False
# check for shortest paths as long as the queue is not empty
while queue.empty() != True:
# get the node we put in first
current = queue.get()
# check if the entry we got from the queue is in the destinatino category and not in the input category
if graph.node[current[0]][dest_cat] == True and graph.node[current[0]][inp_cat] != True:
# if its true, set visited #uneccessary step as well break after anyway
visited[current[0]]= True
# update the shortest path if we found one, else will stay infinitive
sh_path = current[1]
#print('shortest path from ', inp_node, ' to ', current[0], ' found (dist = ', current[1], ')')
queue.queue.clear()
else:
# get the successors of our current node (as its a directed graph)
for i in graph.successors(current[0]):
# check if the successor is not visited
if visited[i] != True:
# if its not visited, put the found node in the queue,
# together with the information about the distance it has from the starting node
queue.put([i, current[1]+1])
# set the current node to visited
visited[current[0]]= True
# put the result we found
out_q.put([inp_node, sh_path])
# function to execute the bfs
def run_bfs(start_cat, graph, categories):
#creating a list of nodes of our starting category
inp_nodes = [cat_nodes for cat_nodes in graph.nodes if graph.node[cat_nodes][start_cat]== True]
# create a dictionary we want to save the medians and other information to
medians = {}
#iterate over all categories in the list of categories
for cat in categories:
# creating a dictionary we save the information for every node the bfs returnd
sh_paths = {}
# iterate only over the categories that aren't our C0
if cat != start_cat:
# setting the destination category to be passed to our bfs
dest_cat = cat
# creating a queue that contains the nodes we want to pass to the bfs
start_q = Q.Queue()
# creating a queue that we'll pass the results of our bfs to
out_q = Q.Queue()
# adding the nodes of our C0 to the start_q. every node will be passed to our bfs
for start_node in inp_nodes:
start_q.put(start_node)
# while we didn't calculate the shortest distance for every node in our C0, do things
while not start_q.empty():
# as long as the number of running threads is at most 50, add threads
if threading.active_count() <= 50:
# get the current node from our start_q
current_t = start_q.get()
# start a thread with our bfs and the aforementioned parameters
t = threading.Thread(target=bfs, args=(graph, start_cat, current_t, dest_cat, out_q), daemon= True).start()
# tell the start_q that the task with current_t is done
start_q.task_done()
# while we didn't retrieve all values the bfs calculated, keep running
while not out_q.empty():
# get the first result in the queue
out_p = out_q.get()
# add the information to our shortest paths dictionary. the key is the node, the value the distance
sh_paths[out_p[0]] = out_p[1]
# tell the out_q that the job is finished
out_q.task_done()
# tell the start_q that all threads shall be joined
start_q.join()
# setting up variables for the calculation of avererage and counting the infitives in our result
sum_vals = 0
i = 0
inf_count = 0
# iterate over the values we retrieved for the distances from c0 to ci in order to sum the values and count the infinitives
for x in sh_paths.values():
if x != np.inf:
i+=1
sum_vals += x
else:
inf_count += 1
# saving median, mean and infinity count as values in a dictionary. The key is the category to which we calculated the distance from c0
medians[cat] = [median(sh_paths.values()), sum_vals/i, inf_count]
return medians
'''
Functions for scoring:
'''
#function to assign every node only to one category
#@autojit
def key_substraction(cat_dict, org_cat, list_smallest):
return_dict = {}
# Get a list of categorys, sorted by the ascending distance from our C0
# (doesn't include our starting category so we don't have to iterate over it
keys = []
for key in list_smallest:
keys.append(key[0])
# iterating over the categories in a list we sorted by the ascending distance from our C0
for i in range(len(keys)):
if i == 0:
# getting the nodes of our starting category
org_nodes = cat_dict[org_cat]
# iterating over all categories in our list of sorted categories
for key in keys:
# assigning only the values of the current key minus the intersection
# of the values of the current category and our starting category
temp = []
for node in cat_dict[key]:
if node not in org_nodes:
temp.append(node)
return_dict[key] = temp
else:
# iterating over all categories again. but now we're only using the keys of the categories we didn't
# clean up yet. Same as before we're only assigning the values of Ci to Cn minus the intersection Ci-1 and Ci.
for x in range(i,len(keys)):
temp = []
for node in cat_dict[keys[x]]:
if node not in cat_dict[keys[i-1]]:
temp.append(node)
return_dict[keys[x]] = temp
return return_dict
#function to create the score for a node by the in edges
def no_in_edges(node, cat):
x = 0
# getting the number of in edges by counting all predecessors
# (nodes with edges pointing towards the node we're looking at) of a node.
for i in graph.predecessors(node):
#
if graph.node[i][cat] == True:
x +=1
return x
#create the score for every node in a category
def article_score_cat(graph, cat):
#get all the nodes of the current category in a defined graph
nodes = [nodes for nodes in graph.nodes if graph.node[nodes][cat]== True]
for node in nodes:
# set the score to the existing score (in the beginning 0) + the number of in edges of the current node
graph.node[node]['score'] = graph.node[node]['score'] + no_in_edges(node, cat)
def article_score(graph, cat_list):
for i in range(len(cat_list)):
if i == 0:
nodes = [nodes for nodes in graph.nodes if graph.node[nodes][cat_list[i]]== True]
sub_g = graph.subgraph(nodes)
article_score_cat(sub_g, cat_list[i])
else:
cat_nodes = [nodes for nodes in graph.nodes if graph.node[nodes][cat_list[i]]== True]
# sub_g_cat = graph.subgraph(cat_nodes)
# article_score_cat(sub_g_cat, cat_list[i])
for node in cat_nodes:
nodes.append(node)
sub_g = graph.subgraph(nodes)
article_score_cat(sub_g, cat_list[i])
for node in cat_nodes:
for pred in sub_g.predecessors(node):
if graph.node[pred][cat_list[i-1]] == True:
graph.node[node]['score'] = graph.node[node]['score'] + graph.node[pred]['score']
return sub_g