findfeatures.py

import codecs
import os

import Orange
from orangecontrib.associate.fpgrowth import frequent_itemsets, OneHot


def create_txt(file_path, content):
    """Write each item of `content` on its own line, replacing any existing file."""
    if os.path.exists(file_path):
        os.remove(file_path)
    with codecs.open(file_path, 'w', 'utf-8') as f:
        for x in content:
            f.write(x)
            f.write('\n')


def write_feature(file_path, content):
    """Write one itemset per line, its words concatenated, replacing any existing file."""
    if os.path.exists(file_path):
        os.remove(file_path)
    with codecs.open(file_path, 'w', 'utf-8') as f:
        for x in content:
            for y in x:
                f.write(y)
            f.write('\n')


def readfile(filepath):
    """Read a whitespace-separated word file, deduplicate the words while
    preserving first-seen order, dump the word set to disk, and return it."""
    seen = dict()
    newarr = []
    with codecs.open(filepath, 'r', 'utf-8') as file_object:
        all_text = file_object.read()
        for a in all_text.split():
            if a not in seen:
                newarr.append(a)
                seen[a] = 1
    create_txt("F:/course/sentimentcode/feature/data/wordset", newarr)
    return newarr


def redundencyprune(res, frequency, threshold):
    """Drop single-word features whose support is mostly explained by the
    longer itemsets they belong to.

    For each word, support from its singleton itemset counts positively and
    support from longer itemsets containing it counts negatively; if the net
    score falls below `threshold`, the singleton feature is removed.
    """
    freqdist = dict()
    for i in range(len(res)):
        wordarr = res[i]
        l = len(wordarr)
        for j in range(l):
            # Record this occurrence of the word as {itemset length: support}.
            word = wordarr[j]
            if word not in freqdist:
                freqdist[word] = []
            freqdist[word].append({l: frequency[i][j]})
    removewords = []
    for k, v in freqdist.items():
        if len(v) > 1:  # the word occurs in more than one itemset
            score = 0
            for d in v:
                for k1, v1 in d.items():
                    if k1 == 1:
                        score += v1  # singleton support counts for the word
                    else:
                        score -= v1  # support inside longer itemsets counts against it
            if score < threshold:
                removewords.append(k)
                print(k)
    for word in removewords:
        # By downward closure the singleton itemset should always be present,
        # but guard against a missing entry anyway.
        if [word] in res:
            res.remove([word])
    return res
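
# Worked example of the pruning rule above (hypothetical numbers): if
# 'screen' has singleton support 250 but also appears in the pair
# ('screen', 'protector') with support 80, its net score is
# 250 - 80 = 170, which is below the threshold of 200 used below,
# so the lone feature ['screen'] would be dropped.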


# Mine frequent noun itemsets from the basket file and prune redundant
# single-word features.
file_path = 'F:/course/sentimentcode/feature/data/newnoun.basket'
data = Orange.data.Table(file_path)
noun_path = 'F:/course/sentimentcode/feature/data/newnoun'
arr = readfile(noun_path)

# One-hot encode the transactions and mine itemsets at 1% minimum support,
# sorted by ascending support.
X, mapping = OneHot.encode(data, include_class=False)
itemsets = sorted(dict(frequent_itemsets(X, .01)).items(), key=lambda x: x[1])

# Map each one-hot column index back to a word (this assumes the column
# order matches the word order read from `noun_path`).
res = []
frequency = []
for itemset, support in itemsets:
    temp = []
    tempf = []
    for i in itemset:
        temp.append(arr[i])
        tempf.append(support)
    res.append(temp)
    frequency.append(tempf)

res = redundencyprune(res, frequency, 200)
write_feature("F:/course/sentimentcode/feature/data/features", res)
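
# A minimal sanity-check sketch of frequent_itemsets on raw transactions,
# assuming orangecontrib-associate's documented interface: an integer
# min_support is an absolute count, while a float (as used above) is a
# fraction of the transactions.
#
# >>> T = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
# >>> dict(frequent_itemsets(T, 2))[frozenset({2, 5})]
# 3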