#include "search.h"

struct document_rec{
    long docid;
    /* Used for merged results struct */
    double tf;
    /* Used for loading in the overall WC file */
    int count;
};

/*
 * Loads the htable.
 *
 * Returns: htable
 *
 * Procedure: loads the hash table saved at index/dictionary.
 */
htable search_load_index(){
    htable dict;
    FILE *dictionary_file = fopen("index/dictionary", "r");
    if (!dictionary_file){
        fprintf(stderr, "Unable to open index!\n"
                "Please ensure that the \"index\" directory is relative to where you're executing.\n");
        return NULL;
    }
    dict = htable_load_from_file(dictionary_file, NUM_WORDS);
    fclose(dictionary_file);
    return dict;
}
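
/*
 * Hypothetical usage sketch (not part of this file): how these functions
 * appear intended to compose, assuming a main() elsewhere that treats each
 * command-line argument as a query term. htable_free is an assumed cleanup
 * function; substitute whatever the htable module actually provides.
 *
 *     int main(int argc, char **argv){
 *         htable dict = search_load_index();
 *         if(dict == NULL){
 *             return EXIT_FAILURE;
 *         }
 *         search_for_terms(dict, argv + 1, argc - 1);
 *         htable_free(dict);
 *         return EXIT_SUCCESS;
 *     }
 */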

/*
 * Loads the word count file.
 *
 * Returns: array of documents, or NULL if the file cannot be opened.
 *
 * Procedure: loads the word count file saved at index/wordcount.
 */
document* load_word_count(){
    int counter = 0;
    int index = 0;
    int max_len = 20; /* No document ID and word count pair exceeds this */
    long doc_id = 0;
    int word_count;
    char buffer[max_len];
    char* token;
    document* all_document_wcs = emalloc(NUM_DOCS_EXACT * sizeof(document));
    FILE *wordcount_file = fopen("index/wordcount", "r");
    if (!wordcount_file){
        fprintf(stderr, "Unable to open index!\n"
                "Please ensure that the \"index\" directory is relative to where you're executing.\n");
        free(all_document_wcs);
        return NULL;
    }
    while(fgets(buffer, max_len, wordcount_file)){
        token = strtok(buffer, " ");
        while(token != NULL){
            if(counter % 2 == 0){
                /* even tokens are docids */
                doc_id = atol(token);
            }
            else{
                /* odd tokens are word counts: store the completed pair */
                word_count = atoi(token);
                all_document_wcs[index].docid = doc_id;
                all_document_wcs[index++].count = word_count;
            }
            counter++;
            token = strtok(NULL, " ");
        }
    }
    fclose(wordcount_file);
    return all_document_wcs;
}
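
/*
 * Expected index/wordcount layout (an assumption inferred from the parser
 * above): one "docid wordcount" pair per line, e.g.
 *
 *     1001 523
 *     1002 87
 *     1005 1340
 *
 * The pairs must be sorted by docid, since documents_get_wordcount()
 * binary-searches the resulting array.
 */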

/* The Big Kahuna: this is where we do the search itself.
 *
 * Parameters: dict, the loaded hash table
 *             terms, an array of strings
 *             term_count, the number of terms
 *
 * Procedure: For each term, we look up its hash. If no hash is found for
 * any of the terms, we return from the function.
 *
 * For each hash that is found, we obtain all the listings for that term and
 * save them into an array called all_listings, indexed by the position of
 * the term within the query. We also store the IDF of each term in an
 * array indexed the same way.
 *
 * Next we determine the smallest listing length and the index of that term.
 * We are using AND logic, so if a document does not appear in the smallest
 * listing it cannot appear in the merged results.
 *
 * For each listing in the smallest listing array, we check whether it exists
 * in the other listing arrays; if it does, it is added to the merged_results
 * array, calculating and storing its combined tf_idf from the multiple terms
 * as we go.
 *
 * We then quicksort this merged_results array on the tf_idf score, print,
 * and free.
 */
void search_for_terms(htable dict, char** terms, int term_count){
    int i = 0;
    int term_i;
    int merged_i = 0;
    int smallest_length;
    int smallest_listing_index;
    unsigned int hash;
    flexarray all_listings[term_count];
    double search_term_idfs[term_count];
    flexarray merged_results = flexarray_new();
    document* all_document_wcs = load_word_count();
    if(all_document_wcs == NULL){
        flexarray_free(merged_results);
        return;
    }
    /* retrieve the listings arrays */
    for(term_i = 0; term_i < term_count; term_i++){
        /* for each term, search the hash table and get the listings */
        hash = htable_search(dict, terms[term_i]);
        if(hash){
            all_listings[term_i] = search_get_listings(htable_get_pos(dict, hash),
                                                       htable_get_len(dict, hash));
        }
        else{
            /* AND semantics: one missing term means no results at all */
            printf("%s Not Found!\n", terms[term_i]);
            for(i = 0; i < term_i; i++){
                flexarray_free(all_listings[i]);
            }
            flexarray_free(merged_results);
            free(all_document_wcs);
            return;
        }
        /* Add each term's IDF to the search_term_idfs array */
        search_term_idfs[term_i] = log((double) NUM_DOCS_EXACT / (double) htable_get_freq(dict, hash));
    }

    /* determine the smallest listing array */
    for(term_i = 0; term_i < term_count; term_i++){
        if(term_i == 0 || all_listings[term_i]->num_docs < smallest_length){
            smallest_length = all_listings[term_i]->num_docs;
            smallest_listing_index = term_i;
        }
    }

    /*
     * Iterate through the smallest listing array: because this is an AND,
     * a result cannot appear without appearing in this array, and it is
     * the shortest.
     */
    for(i = 0; i < smallest_length; i++){
        int temp_wc;
        int curr_doc_wc[term_count];
        int found_in_all = 1;
        long doc_id = all_listings[smallest_listing_index]->listings[i].doc_id;
        /*
         * Check every listing array for this docid, storing the word counts
         * in a local array. If any listing array does not contain the docid,
         * we exit the loop early.
         */
        for(term_i = 0; term_i < term_count && found_in_all == 1; term_i++){
            temp_wc = flexarray_get_wordcount(doc_id, all_listings[term_i], 0,
                                              all_listings[term_i]->num_docs);
            if(temp_wc == -1){
                found_in_all = 0;
            }
            else{
                curr_doc_wc[term_i] = temp_wc;
            }
        }
        /*
         * The docid was found in all listing arrays, so save it to the
         * merged_results flexarray, storing the doc_id and tf_idf. If the
         * doc_id already exists in the flexarray, we add to its tf_idf.
         */
        if(found_in_all == 1){
            for(term_i = 0; term_i < term_count; term_i++){
                /* finish is an inclusive index, hence NUM_DOCS_EXACT - 1 */
                double tf = (double) curr_doc_wc[term_i] /
                    (double) documents_get_wordcount(doc_id, all_document_wcs, 0, NUM_DOCS_EXACT - 1);
                double tf_idf = tf * search_term_idfs[term_i];
                if(term_i == 0){
                    /* this is the first instance of this doc_id */
                    flexarray_append_merged_result(merged_results, merged_i, doc_id, tf_idf);
                }
                else{
                    flexarray_update_merged_result(merged_results, merged_i, tf_idf);
                }
            }
            merged_i++;
        }
    }

    if(merged_i > 0){
        qsort(merged_results->listings, merged_results->num_docs, sizeof(listing),
              flexarray_compare_tf_idf);
        flexarray_print_merged_results(merged_results);
    }
    /* Free everything (even when there were no results) */
    for(i = 0; i < term_count; i++){
        flexarray_free(all_listings[i]);
    }
    flexarray_free(merged_results);
    free(all_document_wcs);
    printf("\n");
}
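
/*
 * Worked tf_idf example with illustrative numbers (the real value of
 * NUM_DOCS_EXACT comes from search.h; 1000 is assumed here): a term
 * appearing in 10 of 1000 documents has idf = log(1000/10) = log(100),
 * roughly 4.61 under the natural log used above. A 200-word document
 * containing that term 5 times has tf = 5/200 = 0.025, so its tf_idf
 * contribution is roughly 0.115. With multiple query terms, these
 * contributions are summed per document before sorting.
 */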

/*
 * Retrieves the flexarray associated with a specific dictionary word
 * from the index listings file.
 *
 * Parameters: pos, the position of the flexarray within the file.
 *             len, the length of the flexarray within the file.
 *
 * Returns: a flexarray of listings containing the docids.
 *
 * Procedure: fseek to the exact location within the file and read the
 * correct length. Use flexarray_append_count_known as we only call it
 * once per document within the array.
 */
flexarray search_get_listings(long pos, int len){
    /* Max entry length tested with: awk '{print length}' listings | sort -rn | head -1 */
    FILE *listings_file = fopen("index/listings", "r");
    int i = 0;
    long first_docid;
    long current_docid;
    int times_word_found = 0;
    char buffer[len];
    char* token;
    flexarray f;
    if (!listings_file){
        fprintf(stderr, "Unable to open index/listings!\n");
        return NULL;
    }
    f = flexarray_new();
    fseek(listings_file, pos, SEEK_SET);
    fread(buffer, 1, len, listings_file);
    /* terminate the buffer; the final byte is assumed to be a separator */
    buffer[len - 1] = '\0';
    token = strtok(buffer, " ");
    while(token != NULL){
        if(i == 0){
            /* the first docid is stored verbatim */
            sscanf(token, "%ld", &first_docid);
            current_docid = first_docid;
        }
        else if(i % 2 == 0){
            /* subsequent docids are stored as deltas from the first docid */
            sscanf(token, "%ld", &current_docid);
            current_docid += first_docid;
        }
        else{
            sscanf(token, "%d", &times_word_found);
            flexarray_append_count_known(f, current_docid, times_word_found);
        }
        token = strtok(NULL, " ");
        i++;
    }
    fclose(listings_file);
    return f;
}
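
/*
 * Listings on-disk format (an assumption inferred from the parser above):
 * space-separated "docid count" pairs, where every docid after the first is
 * stored as a delta from the FIRST docid. For example, the bytes
 *
 *     1001 3 4 1 9 2
 *
 * decode to: doc 1001 (3 occurrences), doc 1005 (1 occurrence),
 * doc 1010 (2 occurrences).
 */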

/*
 * Binary search of the document array, used for getting the total word
 * count of a specific document. Needed for the tf_idf calculation.
 * start and finish are inclusive indices; returns -1 if docid is absent.
 */
int documents_get_wordcount(long docid, document* documents, int start, int finish){
    int midpoint = (finish + start) / 2;
    if(finish < start){
        return -1;
    }
    else if(documents[midpoint].docid > docid){
        return documents_get_wordcount(docid, documents, start, midpoint - 1);
    }
    else if(documents[midpoint].docid < docid){
        return documents_get_wordcount(docid, documents, midpoint + 1, finish);
    }
    else {
        return documents[midpoint].count;
    }
}
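
/*
 * Usage sketch (docid 1005 is an illustrative value):
 *
 *     int wc = documents_get_wordcount(1005, all_document_wcs, 0, NUM_DOCS_EXACT - 1);
 *
 * wc is -1 if the docid is absent, which should not happen for docids
 * drawn from the listings file.
 */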