-
Notifications
You must be signed in to change notification settings - Fork 0
/
document.py
69 lines (52 loc) · 1.16 KB
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Shared Porter stemmer instance, reused for every document that is vectorised.
ps = PorterStemmer()

# English stop words from the NLTK corpus, held in a set so that the
# per-token membership test in the vectorisers is a hash lookup.
stop_words = set(stopwords.words('english'))
class DocRef():
    """Reference to a text document on disk, plus a caller-managed length.

    The class stores a file path and a numeric ``length`` and can turn the
    file's contents into a bag-of-words term-frequency dictionary, either
    with raw tokens (``toVector1``) or Porter-stemmed tokens (``toVector``).
    """

    # Class-level defaults; ``__init__`` always overrides them per instance.
    file = ''
    length = 0.0

    # Matches every character that is not an ASCII letter.  The previous
    # pattern ``[^a-z^A-Z]`` treated the second ``^`` as a literal caret, so
    # carets in the document survived cleaning and were counted as terms.
    _NON_LETTERS = re.compile(r'[^a-zA-Z]')

    def __init__(self, file='', length=0.0):
        """Store the document path and its associated length.

        Args:
            file: Path of the document to read when vectorising.
            length: Numeric length associated with the document.
        """
        self.file = file
        self.length = length

    def get_path(self):
        """Return the path of the referenced file."""
        return self.file

    def get_length(self):
        """Return the stored length value."""
        return self.length

    def set_length(self, length):
        """Replace the stored length value."""
        self.length = length

    def _read_terms(self):
        """Read the file and return its lower-cased, non-stop-word tokens.

        Every non-letter character is replaced by a space before the text is
        handed to ``word_tokenize``.  Tokens are returned in document order.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        # ``with`` guarantees the handle is closed even if ``read`` raises.
        with open(self.file, 'r') as handle:
            text = handle.read()
        text = self._NON_LETTERS.sub(' ', text)
        lowered = (token.lower() for token in word_tokenize(text))
        return [term for term in lowered if term not in stop_words]

    @staticmethod
    def _count_terms(terms):
        """Return a plain dict mapping each term to its number of occurrences.

        Keys appear in order of first occurrence in ``terms``.
        """
        counts = {}
        for term in terms:
            counts[term] = counts.get(term, 0) + 1
        return counts

    def toVector1(self):
        """Return term frequencies of the document without stemming.

        Returns:
            dict: lower-cased token -> occurrence count, stop words excluded.
        """
        return self._count_terms(self._read_terms())

    def toVector(self):
        """Return term frequencies of the document using Porter stems.

        Stop words are removed before stemming, so a token is filtered on its
        surface form rather than on its stem.

        Returns:
            dict: stemmed token -> occurrence count, stop words excluded.
        """
        return self._count_terms(ps.stem(term) for term in self._read_terms())