-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
157 lines (115 loc) · 4.5 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import math
import os
import re

from flask import Flask, jsonify
from flask import Flask, render_template, request, jsonify
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField
def load_vocab(vocab_path="vocab.txt", idf_path="idf-values.txt"):
    """Build the term -> integer table from two line-aligned text files.

    Line *i* of ``vocab_path`` holds a term and line *i* of ``idf_path``
    holds the integer stored for that term.

    Args:
        vocab_path: File with one term per line.
        idf_path: File with one integer per line, aligned with ``vocab_path``.

    Returns:
        dict mapping each term (trailing whitespace removed) to its integer.
        If a term appears more than once, the last occurrence wins.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a line of ``idf_path`` is not a valid integer.
    """
    with open(vocab_path, "r") as terms_file, open(idf_path, "r") as values_file:
        # zip() stops at the shorter file, so surplus lines in the longer
        # one are ignored rather than raising.
        return {
            term.rstrip(): int(value.rstrip())
            for term, value in zip(terms_file, values_file)
        }
def load_document(path="document.txt"):
    """Read the corpus file and return its lines.

    Args:
        path: Text file to read; each line becomes one list entry.

    Returns:
        list of str, one per line, with line endings left in place.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r") as corpus_file:
        return corpus_file.readlines()
def load_inverted_index(path="inverted_index.txt"):
    """Parse the inverted-index file into a term -> postings mapping.

    The file is read as consecutive pairs of lines: a term, followed by a
    line of whitespace-separated document ids for that term.

    Args:
        path: Inverted-index text file to read.

    Returns:
        dict mapping each term (surrounding whitespace removed) to the list
        of document-id strings found on the following line.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r") as index_file:
        rows = index_file.readlines()
    # Pair even rows (terms) with odd rows (postings). zip() drops a dangling
    # final row, so an odd line count (e.g. a trailing blank line) no longer
    # raises IndexError.
    return {
        term_row.strip(): postings_row.strip().split()
        for term_row, postings_row in zip(rows[::2], rows[1::2])
    }
def load_link_of_qs(path="Scrapper Master/Qdata/Qlink.txt"):
    """Read the question-link file and return its lines.

    Args:
        path: Text file to read; each line becomes one list entry.

    Returns:
        list of str, one per line, with line endings left in place.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r") as links_file:
        return links_file.readlines()
# Load every lookup table once, at import time; the scoring functions below
# read these module-level names directly.
vocab = load_vocab()  # term -> integer parsed from idf-values.txt
document = load_document()  # lines of document.txt
inverted_index = load_inverted_index()  # term -> list of document-id strings
Qlink = load_link_of_qs()  # lines of Qlink.txt
def get_tf_dict(term):
    """Return the term frequency of ``term`` for each document listing it.

    The postings in the module-level ``inverted_index`` are counted per
    document id, then each count is divided by the number of words in the
    matching entry of the module-level ``document`` list.

    Args:
        term: Term to look up in ``inverted_index``.

    Returns:
        dict mapping document id (str, as stored in the index) to its term
        frequency. Empty when the term is not in the index. If a document's
        length cannot be determined, the error is printed and that document
        keeps its raw occurrence count.
    """
    tf_dict = {}
    for doc in inverted_index.get(term, ()):
        tf_dict[doc] = tf_dict.get(doc, 0) + 1

    for doc in tf_dict:
        try:
            # split() makes the denominator a word count; len() of the raw
            # line would count characters (including the newline).
            # NOTE(review): ids are used 0-based here but 1-based when
            # indexing Qlink in calc_docs_sorted_order - confirm which is
            # intended.
            tf_dict[doc] /= len(document[int(doc)].split())
        except (ZeroDivisionError, ValueError, IndexError) as e:
            print(e)
            print(doc)
    return tf_dict
def get_idf_value(term):
    """Return the smoothed inverse document frequency of ``term``.

    Computed as ``log((1 + N) / (1 + v))`` where ``N`` is the number of
    entries in the module-level ``document`` list and ``v`` is the integer
    stored for ``term`` in the module-level ``vocab``.

    Raises:
        KeyError: If ``term`` is not in ``vocab``.
    """
    corpus_size = len(document)
    stored_count = vocab[term]
    ratio = (1 + corpus_size) / (1 + stored_count)
    return math.log(ratio)
def calc_docs_sorted_order(q_terms):
    """Rank documents against the query terms by averaged TF-IDF score.

    For every query term present in the module-level ``vocab``, the term's
    TF-IDF contribution is added to each document that contains it. Each
    document's total is then divided by the number of query terms, counting
    terms that were skipped for not being in ``vocab``.

    Args:
        q_terms: Sized collection of query terms (e.g. a list of str).

    Returns:
        list of ``{"Question Link": str, "Score": float}`` dicts in
        descending score order. Empty when no document matches, in which
        case a message is also printed.
    """
    scores = {}
    for term in q_terms:
        if term not in vocab:
            continue
        tf_by_doc = get_tf_dict(term)
        idf_value = get_idf_value(term)
        for doc, tf_value in tf_by_doc.items():
            scores[doc] = scores.get(doc, 0) + tf_value * idf_value

    if not scores:
        print("No matching question found. Please search with more relevant terms.")
        return []

    term_count = len(q_terms)
    ranked = sorted(
        ((doc, total / term_count) for doc, total in scores.items()),
        key=lambda item: item[1],
        reverse=True,
    )
    # Document ids are treated as 1-based positions into Qlink. [:-2] drops
    # the last two characters of the stored line - presumably the newline
    # plus one trailing character; verify against the Qlink file format.
    return [
        {"Question Link": Qlink[int(doc) - 1][:-2], "Score": score}
        for doc, score in ranked
    ]
app = Flask(__name__)
# The secret key signs the session and the CSRF tokens of FlaskForm, so it
# must not be a value published in source control. Read it from the
# environment; the placeholder stays only as a fallback so existing local
# setups keep working when SECRET_KEY is unset.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-secret-key')
# query = input('Enter your query: ')
# q_terms = [term.lower() for term in query.strip().split()]
# print(q_terms)
# print(calc_docs_sorted_order(q_terms)[0])
# print(len(calc_docs_sorted_order(q_terms)))
class SearchForm(FlaskForm):
    """Search form with a single free-text field and a submit button."""

    # Free-text query typed by the user.
    search = StringField(label='Enter your search term')
    # Button that posts the form.
    submit = SubmitField(label='Search')
@app.route("/<query>")
def return_links(query):
    """Return the top 20 ranked results for ``query`` as JSON.

    The query taken from the URL path is split on whitespace and each piece
    is lower-cased before ranking.
    """
    lowered_terms = list(map(str.lower, query.strip().split()))
    ranked = calc_docs_sorted_order(lowered_terms)
    return jsonify(ranked[:20])
@app.route("/", methods=['GET', 'POST'])
def home():
    """Render the search page, with up to 10 results after a valid submit.

    When the form has not been submitted or fails validation, the page is
    rendered with an empty result list.
    """
    form = SearchForm()
    if not form.validate_on_submit():
        return render_template('index.html', form=form, results=[])

    lowered_terms = list(map(str.lower, form.search.data.strip().split()))
    top_hits = calc_docs_sorted_order(lowered_terms)[:10]
    return render_template('index.html', form=form, results=top_hits)
# if __name__ == '__main__':
# app.run()