'''
relalational-search
A search algorithm that aims to search beyond the keywords.
Copyright (C) 2024 Kai Broadbent
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. '''
import pickle
import re
from collections import defaultdict, OrderedDict
from dataclasses import dataclass

@dataclass
class datapoint:
    """A single document: a unique name plus a word -> occurrence-count dict."""
    unique_name: str
    matrix: dict

    def name_qury(self):
        return self.unique_name

    def dict_matrix_qury(self):
        return self.matrix

    def word_ocerenses(self, look_word):
        # (count, True) if the word occurs in this document, else (word, False).
        if look_word in self.matrix:
            return (self.matrix[look_word], True)
        return (look_word, False)

    def words(self):
        return tuple(self.matrix.keys())

    def __hash__(self):
        return hash(self.unique_name)

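
# Illustrative usage of datapoint (the document name, words, and counts here
# are made up):
#   >>> p = datapoint(unique_name='doc1', matrix={'romeo': 2, 'juliet': 1})
#   >>> p.word_ocerenses('romeo')
#   (2, True)
#   >>> p.word_ocerenses('hamlet')
#   ('hamlet', False)
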
@dataclass
class mater_dict:
    """Master dictionary: (word, average occurrences per document) pairs."""
    matrix: list

    def words(self):
        return [x[0] for x in self.matrix]

    def word_ocerenses(self, look_word):
        # (average, True, word) if the word is in the dictionary, else (word, False).
        for word, average in self.matrix:
            if word == look_word:
                return (average, True, look_word)
        return (look_word, False)

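
# Sketch of a mater_dict lookup (hypothetical averages, e.g. 'romeo' occurring
# 3 times across 2 documents averages 1.5):
#   >>> md = mater_dict(matrix=[('juliet', 0.5), ('romeo', 1.5)])
#   >>> md.word_ocerenses('romeo')
#   (1.5, True, 'romeo')
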
@dataclass
class byzdis_disubution_map:
    """Distribution map: each entry is the one-element [[name, score]] list
    produced by function_for_map."""
    map: list

    def map_qurry(self):
        return self.map

    def unique_name_lookup(self, unique_name):
        # Returns (score, True, name), or None if the name is not in the map.
        send = None
        for name, score in [x[0] for x in self.map]:
            if name == unique_name:
                send = (float(score), True, unique_name)
        return send

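
# Sketch of the map's nesting (hypothetical scores):
#   >>> m = byzdis_disubution_map(map=[[['doc1', 4.0]], [['doc2', 7.5]]])
#   >>> m.unique_name_lookup('doc2')
#   (7.5, True, 'doc2')
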
class NullIterable:
    """Sentinel returned when a datapoint cannot be scored; iterates as empty."""
    def __iter__(self):
        return iter([])  # an empty iterator

@dataclass
class searchdict:
    mastdict: list

    def keywords(self):
        return [x[0] for x in self.mastdict]

def remove_duplicates(lst):
    # Drop repeated items while preserving first-seen order.
    return list(OrderedDict.fromkeys(lst))

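
# Example, using the test list from the original comments:
#   >>> remove_duplicates([1, 2, 3, 2, 4, 3, 5])
#   [1, 2, 3, 4, 5]
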
def file_to_clean_str(file):
    # Read a file and return its text lowercased, with punctuation stripped
    # and newlines replaced by spaces.
    clean = ''
    with open(file, "r") as openfile:
        for line in openfile:
            lineclean = re.sub(r'[^\w\s]', '', line)
            lineclean = lineclean.replace("\n", " ")
            lineclean = lineclean.lower()
            clean = clean + lineclean
    return clean

def string_to_dict(fullstring, numofdoc):
    # fullstring must contain every word and term of the data set!
    data_list_split = fullstring.split()  # split the string into a list of words
    # Count the occurrences of each word using a defaultdict
    word_counts = defaultdict(int)
    for word in data_list_split:
        word_counts[word] += 1
    data_list_singal = set(data_list_split)  # remove duplicates
    sorted_data_list = sorted(data_list_singal)  # sort the unique words
    matrix = [(word, float(word_counts[word]) / numofdoc) for word in sorted_data_list]
    return mater_dict(matrix=matrix)

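
# Sketch of the averaging (hypothetical input): 'romeo romeo juliet' with
# numofdoc=2 yields mater_dict(matrix=[('juliet', 0.5), ('romeo', 1.0)]).
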
def list_of_data_points_to_dict(point_list: list):
    # Build a master dictionary (word -> average occurrences per document)
    # from a list of datapoints.
    number_of_docs = len(point_list)
    wordlist = []
    for pnt in point_list:
        wordlist += list(pnt.words())
    words = sorted(set(wordlist))
    mast_dict_as_list = []
    for word in words:
        wordocer = 0
        for point in point_list:
            count, found = point.word_ocerenses(word)
            if found:
                wordocer += count
        mast_dict_as_list.append((word, wordocer / number_of_docs))
    return mater_dict(matrix=mast_dict_as_list)

def function_for_map(point1: datapoint, word_dict):
    # Score a document against the master dictionary: each word contributes
    # (occurrences in this document - average occurrences in the data set) + 1
    # to a running total, and the running totals are summed.
    relation_fromword = 0
    p1_word_list = list(point1.words())
    distubutinsumaslist = []
    finall_map = []
    if p1_word_list:
        for word in p1_word_list:
            if not word_dict.word_ocerenses(word)[1]:
                print('warning: word missing from master dictionary:', word_dict.word_ocerenses(word))
            else:
                ocrents_in_doc = int(point1.word_ocerenses(word)[0])
                averge_occents_in_data = word_dict.word_ocerenses(word)[0]
                relation_fromword += (ocrents_in_doc - averge_occents_in_data) + 1
                distubutinsumaslist.append(relation_fromword)
        finall_map.append([point1.unique_name, sum(distubutinsumaslist)])
        return finall_map
    else:
        # A datapoint with no words cannot be scored.
        return NullIterable()

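
# Worked sketch of the scoring (hypothetical numbers): with two words whose
# contributions are 2.5 and 1.0, the running totals are [2.5, 3.5], so the
# document's score is sum([2.5, 3.5]) = 6.0 (earlier words are counted again
# in later running totals).
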
def string_in_dataset_to_matrix(string_in_data):
    # Turn a document's text into a {word: occurrence count} dict.
    data_list_split = string_in_data.split()  # split the string into a list of words
    word_counts = defaultdict(int)
    for word in data_list_split:
        word_counts[word] += 1
    data_list_singal = set(data_list_split)  # remove duplicates
    sorted_data_list = sorted(data_list_singal)  # sort the unique words
    matrix = [(word, word_counts[word]) for word in sorted_data_list]
    return dict(matrix)  # the word -> occurrence mapping

def string_to_datapoint_without_relations(name: str, data_as_string: str):
    point = datapoint(unique_name=name, matrix=string_in_dataset_to_matrix(data_as_string))
    return point

def list_of_poins_to_map(listofpoints, mat_dict):
    relatiin_mapp_full = byzdis_disubution_map(map=[])
    for pointstk in listofpoints:
        scored = function_for_map(pointstk, mat_dict)
        if not isinstance(scored, NullIterable):
            # Each map entry is [[name, score]], so unwrap twice for the name.
            names_in_map = [x[0][0] for x in relatiin_mapp_full.map_qurry()]
            if pointstk.unique_name not in names_in_map:
                relatiin_mapp_full.map.append(scored)
    return relatiin_mapp_full

def searchalgo(search_terms: str, compleat_data_as_points_list: list, master_ditionary: mater_dict, byzdisrubution_map: byzdis_disubution_map, *term_filtering_stranth: int):
    list_of_datapoint_with_term = []
    unsorted_list_of_datapoint_names_with_term = []
    # Filtering is optional: it is on only when a strength was passed, and it
    # skips terms whose average occurrence is at or above that strength.
    use_term_fitering = bool(term_filtering_stranth)
    for term in search_terms.split():
        if term in master_ditionary.words():
            if not use_term_fitering:
                list_of_datapoint_with_term += [x for x in compleat_data_as_points_list if term in x.words()]
            elif master_ditionary.word_ocerenses(term)[0] < term_filtering_stranth[0]:
                list_of_datapoint_with_term += [x for x in compleat_data_as_points_list if term in x.words()]
    master_ditionary_from_points_with_terms = list_of_data_points_to_dict(list_of_datapoint_with_term)
    map_of_points_with_terms = list_of_poins_to_map(list_of_datapoint_with_term, master_ditionary_from_points_with_terms)
    for name_dis_number_pair in map_of_points_with_terms.map:
        lookup_name = name_dis_number_pair[0][0]
        local_before_filter = byzdisrubution_map.unique_name_lookup(lookup_name)[0]
        change_in_local = local_before_filter - name_dis_number_pair[0][1]
        unsorted_list_of_datapoint_names_with_term.append([name_dis_number_pair[0][0], change_in_local])
    # Rank by the change in distribution score rather than alphabetically.
    unsorted_list_of_datapoint_names_with_term.sort(key=lambda pair: pair[1])
    return [name[0] for name in unsorted_list_of_datapoint_names_with_term]
    # else:
    #     return "please put a space between each word and number sequence, or try different search terms"
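
# Sketch of a standalone query against the artifacts pickled by main() below
# (assumes main() has already been run once in this directory):
#   with open("points.pickle", "rb") as f:
#       points = pickle.load(f)
#   with open("dict.pickle", "rb") as f:
#       mat_dict = pickle.load(f)
#   with open("relationmap.pickle", "rb") as f:
#       dismap = pickle.load(f)
#   print(searchalgo('romeo juliet', points, mat_dict, dismap))
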
def main():
    # Read input.txt, chunk it into documents of chunk_size lines each, build
    # the master dictionary and distribution map, run a sample search, and
    # pickle the results.
    chunk_size = 2000
    clean = ''
    count = 0
    full_str = ''
    list_of_point_names = []
    number_of_docs = 0
    list_of_points_before_relatin_maping = []
    with open('input.txt', "r") as openfile:
        for line in openfile:
            count += 1
            lineclean = re.sub(r'[^\w\s]', '', line)
            lineclean = lineclean.replace("\n", " ")
            lineclean = lineclean.lower()
            clean = clean + lineclean
            full_str = full_str + lineclean
            if count % chunk_size == 0:
                point = string_to_datapoint_without_relations(f'lines{count-chunk_size}-{count}', clean)
                list_of_points_before_relatin_maping.append(point)
                list_of_point_names.append(point.unique_name)
                number_of_docs += 1
                clean = ''
    # Flush any trailing lines that did not fill a whole chunk.
    if clean:
        point = string_to_datapoint_without_relations(f'lines{count - count % chunk_size}-{count}', clean)
        list_of_points_before_relatin_maping.append(point)
        list_of_point_names.append(point.unique_name)
        number_of_docs += 1
    mat_dict = string_to_dict(full_str, number_of_docs)
    bizdismap = list_of_poins_to_map(list_of_points_before_relatin_maping, mat_dict)
    print(bizdismap)
    print(searchalgo('romeo', list_of_points_before_relatin_maping, mat_dict, bizdismap))
    with open("points.pickle", "wb") as file:
        pickle.dump(list_of_points_before_relatin_maping, file)
    with open("dict.pickle", "wb") as file:
        pickle.dump(mat_dict, file)
    with open("relationmap.pickle", "wb") as file:
        pickle.dump(bizdismap, file)  # dump the computed map, not an empty one
    print("done")

if __name__ == '__main__':
    main()