-
Notifications
You must be signed in to change notification settings - Fork 1
/
corpus_dictionary.py
70 lines (55 loc) · 2.45 KB
/
corpus_dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from gensim import corpora
from pymongo import MongoClient
from preprocess_text import clean
# Module-level MongoDB handles, created as a side effect of importing this
# module. MongoClient() is called with no arguments, so the connection target
# is whatever pymongo uses by default.
client = MongoClient()
# The database and the collection inside it share the name 'crawled_news'.
db = client['crawled_news']
# `collection` is the handle read by all_of_words() and CompleteCorpus.
collection = db['crawled_news']
class CustomCorpus(object):
    """Bag-of-words corpus over the documents matching a MongoDB query.

    The vocabulary is built once, at construction time, from the token lists
    produced by ``all_of_words``. Each pass over the corpus calls
    ``all_of_words`` again and yields one ``doc2bow`` result per document.

    Attributes:
        query: filter forwarded to ``all_of_words``; ``{}`` when none is given.
        dictionary: the ``corpora.Dictionary`` built from the matching
            documents, after pruning with ``filter_extremes``.
    """

    def __init__(self, query=None):
        """Build the dictionary for the documents selected by ``query``.

        Args:
            query: filter forwarded to ``all_of_words``. ``None`` (the
                default) is stored as a new empty dict.
        """
        # A fresh dict per instance: a ``{}`` default in the signature would
        # be one object shared by every instance created without a query.
        self.query = {} if query is None else query
        self.dictionary = corpora.Dictionary(all_of_words(query=self.query))
        # Only no_below and keep_n are passed; every other pruning parameter
        # (e.g. no_above) is left at the library default.
        self.dictionary.filter_extremes(no_below=1, keep_n=30000)

    def __iter__(self):
        """Yield ``self.dictionary.doc2bow(tokens)`` for each matching document."""
        for tokens in all_of_words(query=self.query):
            yield self.dictionary.doc2bow(tokens)
class SubCorpus(object):
    """Bag-of-words corpus over a caller-supplied iterable of documents.

    Every document is indexed with ``["content"]``; that text is passed
    through ``clean`` and split on whitespace to obtain its tokens.

    ``collection`` is iterated once in ``__init__`` and once more on every
    pass over the corpus, so it has to support repeated iteration.

    Attributes:
        collection: the iterable handed to the constructor, stored as given.
        dictionary: ``corpora.Dictionary`` built from all token lists, after
            pruning with ``filter_extremes``.
    """

    @staticmethod
    def _tokens(document):
        """Return the cleaned, whitespace-split ``content`` of one document."""
        return clean(document["content"]).split()

    def __init__(self, collection):
        """Store ``collection`` and build the pruned dictionary from it."""
        self.collection = collection
        # All token lists are materialised before the dictionary is built.
        token_lists = [self._tokens(document) for document in collection]
        self.dictionary = corpora.Dictionary(token_lists)
        self.dictionary.filter_extremes(no_below=1, keep_n=30000)

    def __iter__(self):
        """Yield one ``doc2bow`` result per document in ``self.collection``."""
        for document in self.collection:
            yield self.dictionary.doc2bow(self._tokens(document))
class CompleteCorpus(object):
    """Bag-of-words corpus over every document in the module-level collection.

    ``collection.find()`` is called with no filter, once while building the
    dictionary and once per pass over the corpus. Each document's ``content``
    is passed through ``clean`` and split on whitespace.

    Attributes:
        dictionary: ``corpora.Dictionary`` built from all documents, after
            pruning with ``filter_extremes``.
    """

    @staticmethod
    def _tokenize(documents):
        """Yield the cleaned, whitespace-split ``content`` of each document."""
        for document in documents:
            yield clean(document["content"]).split()

    def __init__(self):
        """Stream the whole collection into a dictionary and prune it."""
        vocabulary = corpora.Dictionary(self._tokenize(collection.find()))
        vocabulary.filter_extremes(no_below=1, keep_n=30000)
        self.dictionary = vocabulary

    def __iter__(self):
        """Yield one ``doc2bow`` result per document in the collection."""
        for tokens in self._tokenize(collection.find()):
            yield self.dictionary.doc2bow(tokens)
def iterate_collection(collection):
    """Lazily yield the token list of each document in ``collection``.

    For every document, ``document["content"]`` is passed through ``clean``
    and the result is split on whitespace.

    Args:
        collection: iterable of documents that can be indexed with
            ``"content"``.

    Yields:
        The list returned by ``str.split()`` on the cleaned content.
    """
    for document in collection:
        yield clean(document["content"]).split()
def all_of_words(query=None):
    """Yield the whitespace-split ``content`` of each document matching ``query``.

    Args:
        query: filter passed to ``collection.find``. ``None`` (the default)
            is replaced by an empty filter.

    Yields:
        ``doc["content"].split()`` for every document returned by the query.
    """
    # NOTE(review): unlike iterate_collection(), the content is not passed
    # through clean() here -- confirm whether that is intentional.
    # A new dict per call instead of a shared mutable default argument.
    mongo_filter = {} if query is None else query
    for doc in collection.find(mongo_filter):
        yield doc["content"].split()
def recreate_dictionary(dict_path='dictionary/all_of_words.dict',
                        corpus_path='corpus/all_of_words.mm',
                        query=None):
    """Rebuild the dictionary and the serialized corpus and write both to disk.

    Args:
        dict_path: destination passed to ``Dictionary.save``.
        corpus_path: destination passed to ``corpora.MmCorpus.serialize``.
        query: filter selecting the documents to include; ``None`` (the
            default) uses an empty filter.
    """
    # `query` is an explicit, optional parameter: the body previously read a
    # name that this file never defines, so every call raised NameError.
    corpus = CustomCorpus({} if query is None else query)
    corpus.dictionary.save(dict_path)
    corpora.MmCorpus.serialize(corpus_path, corpus)
def custom_corpus(collection):
    """Wrap ``collection`` in a ``SubCorpus`` and return it.

    The corpus is returned as-is; nothing is written to disk.

    Args:
        collection: iterable of documents, forwarded unchanged to
            ``SubCorpus``.

    Returns:
        The newly constructed ``SubCorpus``.
    """
    return SubCorpus(collection)
if __name__ == "__main__":
    # Script entry point: build the dictionary over the whole collection.
    # Nothing is persisted here. Writing the results out would take
    # corpus.dictionary.save('dictionary/all_of_words.dict') followed by
    # corpora.MmCorpus.serialize('corpus/all_of_words.mm', corpus).
    corpus = CompleteCorpus()