-
Notifications
You must be signed in to change notification settings - Fork 12
/
n_gram.py
47 lines (33 loc) · 977 Bytes
/
n_gram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 9 13:26:50 2018
@author: 燃烧杯
Set-of-words model (词集模型) built from n-grams.
"""
import sys

import pandas as pd

from infrastructure.mydict import MyDict

# The n-gram size is the single required command-line argument.
# Validate it up front so a bad invocation fails with a readable message
# instead of an IndexError/ValueError traceback or nonsensical slices.
if len(sys.argv) < 2:
    sys.exit("usage: " + sys.argv[0] + " N   (N = n-gram size, integer >= 1)")
try:
    n = int(sys.argv[1])
except ValueError:
    sys.exit("n-gram size must be an integer, got " + repr(sys.argv[1]))
if n < 1:
    sys.exit("n-gram size must be >= 1, got " + str(n))
print("n = ", n)

# Input table; the "Feature" column holds "|"-separated strings.
# (For a quick local run, point this at "test.csv" instead.)
origin = pd.read_csv("data.csv")

# NOTE(review): MyDict is project-local and not visible here. From its use
# below it presumably keeps one "layer" per input row and records which keys
# were marked in that layer -- confirm against infrastructure/mydict.py.
mdict = MyDict()

# One entry per row: a list of strings after splitting on "|".
feature = origin["Feature"].str.split("|")
total = len(feature)

for i, code in enumerate(feature):
    # Open a layer for every row, including rows skipped below, so the
    # number of layers matches the number of rows in `origin`.
    mdict.newLayer()

    # .str.split leaves non-string cells (e.g. NaN for a missing Feature)
    # as non-list values; there is nothing to slice in that case.
    if not isinstance(code, list):
        continue

    for method in code:
        # Slide a window of width n over the string. When the string is
        # shorter than n the range is empty, so no extra length guard is
        # needed.
        for start in range(len(method) - n + 1):
            mdict.mark(method[start:start + n])

    # Progress report: i is 0-based, so i + 1 rows are done at this point
    # (this matches the percentage printed on the next line).
    print("已完成", i + 1, "个应用,百分比如下:")
    print((i + 1) * 100 / total, "%")

# Dump the collected marks as a table aligned with the input rows.
result = mdict.dict
output_path = "./" + str(n) + "_gram.csv"
pd.DataFrame(result, index=origin.index).to_csv(output_path, index=False)