forked from Yun-Mao/search_hash_rbtree
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.cpp
259 lines (239 loc) · 5.96 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
extern "C" {
#include"hashTable.h"
#include"rbtree.h"
#include"List.h"
}
#include<iostream>
#include <fstream>
#include<algorithm>
#include<vector>
#include<string>
#define TRUE 1
#define FALSE 0
using namespace std;
vector<string> stopword; //stopword向量(有一些词出现次数太多,得不偿失,意义不大如 a the)
void Insertword(string voc, string filename, int address, hashTable * Index);//建立索引时要插入元素
void BuildIndex(hashTable *Index);// 建立索引
void collect_stopword();//把stopword加入向量
bool isstopword(string W);//判断stopword
string Filename(int N, int M);//用这种方式获取一个文件名
bool isword(char ch);//判断这个char是否合法
void collect_stopword()
{
ifstream fp;
string stop;
char ch;
fp.open("stopword.txt", ios_base::in | ios_base::binary);
if (!fp)
{
cout << "读取屏蔽词出错!" << endl;
return;
}
fp.get(ch);
while (fp)
{
stop = "";
while (!isword(ch) && !fp.eof())
{
fp.get(ch);
}
if (!fp)
break;
while (isword(ch) && !fp.eof())
{
stop += ch;
fp.get(ch);
}
transform(stop.begin(), stop.end(), stop.begin(), tolower);
stopword.push_back(stop);
}
}
//判断是不是stopword——从上一个函数中取得了所有stopword拼成的字符串,判断是否能在里面找到就行
bool isstopword(string W)
{
if (find(stopword.begin(), stopword.end(), W) == stopword.end())
return 0;
else
return 1;
}
//判断是否是单词
bool isword(char ch)
{
//标准:传入ch在A-Z或者a-z中间或者是\(配合n吧)就算
int D = ((ch >= 'a' && ch <= 'z') || (ch >= 'A'&& ch <= 'Z') || (ch == '\''));
return ((D == 1) ? TRUE : FALSE);
}
//构造文件名的函数(因为示范文件是10xx x的格式所以这么构造)
string Filename(int N, int M)
{
string res;
char file[15];
char chap[10];
_itoa_s(N, file, 10);
_itoa_s(M, chap, 10);
strcat_s(file, 15, " ");
strcat_s(file, 15, chap);
strcat_s(file, 15, ".txt");
res = file;
return res;
}
int hashfile(char*p)
{
int i = strlen(p);
int hash=0;
int j;
for (j = 0; j < i; j++)
{
if (p[j] >= '0'&&p[j] <= '9')
{
hash = hash * 10 + p[j] - '0';
}
}
return hash;
}
//整理扫描到的当前单词信息(在哪个文件的哪里),再插入 , address标志了单词位置
void Insertword(string voc, string filename, int address, hashTable* t)
{
char p[50];
strcpy(p, voc.c_str());
char *fname = (char*)filename.c_str();
if (findValue(t,p)==NULL)//假如索引里头没有这个词我们就把他加进去
{
LinkList L;
InitList(&L);
ListInsert(&L, 1, address);
RBRoot *root;
root = create_rbtree();
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
strcpy(leaf->filename, filename.c_str());
leaf->hashfile = hashfile(fname);
leaf->index = L;
insert_rbtree(root, *leaf);
insertEntry(t, p, root);
//cout << "haha" << endl;
//inorder_rbtree(t->bucket[hashfunc(p,strlen(p))].value);
}
else//有的话就得更新它的数据
{
RBRoot *tempRB=findValue(t, p);
//如果小向量红黑树sour中没有找到当前文件名(意思是现在找到的词是本文件中第一次出现)
if (rbtree_search(tempRB,hashfile(fname))==NULL)
{
LinkList L;
InitList(&L);
ListInsert(&L, 1, address);
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
strcpy(leaf->filename, filename.c_str());
leaf->hashfile = hashfile(fname);
leaf->index = L;
insert_rbtree(tempRB, *leaf);
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
//insertEntry(t, p, tempRB);
}
//如果小向量sour中找到了当前文件名(意思是现在找到的词在本文件中出现过,NA向量已经建立过了,直接加就行)
else
{
RBleaf*templeaf = rbtree_search(tempRB, hashfile(fname));
ListInsert(&templeaf->index, 1, address);
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
}
}
}
void BuildIndex(hashTable *Index)
{
int cou;
int num;
string filename = "";
ifstream fp;
string voc = "";
int address;
char ch;
//这个双层循环是配合Filename函数(自己写的)完成->读所有文件的任务
for (cou = 1001; cou <= 1042; cou++)
{
for (num = 1; num <= 153; num++)
{
filename = Filename(cou, num);
fp.open("Shakespeare/" + filename, ios_base::in | ios_base::binary);
if (!fp.is_open())
continue;
cout << filename << endl;
address = 0;
fp.get(ch);
while (fp)//以前学过的逐个单词读
{
voc = "";
//不是单词且没到结尾就继续
while (!isword(ch) && !fp.eof())
{
fp.get(ch);
}
//到结尾就退出(去下一个文件)
if (!fp)
break;
//是单词的操作就在String vocabulary中连缀上它
while (isword(ch) && !fp.eof())
{
voc += ch;
fp.get(ch);
}
transform(voc.begin(), voc.end(), voc.begin(), tolower); //转化成小写字母
//下标移动(数在第几个位置嘛)
address++;
//假如不是屏蔽词汇,就整理单词信息再插入索引(用之前的Insert函数)
if (!isstopword(voc))
{
Insertword(voc, filename, address, Index);
//getchar();
}
}
fp.close();
}
}
}
void test() //测试功能专用
{
LinkList L;
InitList(&L);
ListInsert(&L, 1, 10);
ListInsert(&L, 1, 30);
hashTable t;
initHashTable(&t);
RBRoot *root = NULL;
root = create_rbtree();
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
//leaf->filename = "hahah";
leaf->hashfile = 20;
leaf->index = L;
insert_rbtree(root, *leaf);
leaf = (RBleaf*)malloc(sizeof(RBleaf));
//leaf->filename = "hahaha";
leaf->hashfile = 10;
leaf->index = L;
insert_rbtree(root, *leaf);
insertEntry(&t, "son", root);
insertEntry(&t, "on", root);
RBRoot *b = findValue(&t, "son");
inorder_rbtree(b);
getchar();
}
int main(){
//test();
hashTable t;
initHashTable(&t);
cout << "屏蔽词载入中..." << endl;
collect_stopword();
cout << "屏蔽词库建立完成" << endl;
cout << "倒排索引建立中..." << endl;
BuildIndex(&t);
cout << "倒排索引建立完成" << endl;
while (1) {
string s;
cin >> s;
char *p = (char *)s.c_str();
inorder_rbtree(t.bucket[hashfunc(p, strlen(p))].value);
}
//inorder_rbtree((RBRoot*)t.bucket[hashfunc(p,strlen(p))].value);
getchar();
getchar();
}