text_sim.py
import nltk, string
import numpy as np
import re, os
from sklearn.feature_extraction.text import TfidfVectorizer
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
stopwords = nltk.corpus.stopwords.words('english')
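# Note: nltk.word_tokenize / nltk.sent_tokenize and the stopword list rely on the
# NLTK 'punkt' and 'stopwords' data packages; if they are not already installed,
# a one-time nltk.download('punkt') and nltk.download('stopwords') is required.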
def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

def normalize(text):
    '''Remove punctuation, lowercase, and stem the tokens of a text.'''
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words=stopwords)

def cosine_sim(text1, text2):
    '''For each sentence in text1, return (sentence, best cosine similarity
    against any sentence in text2).'''
    lent1 = len(text1)
    text = text1 + text2
    tfidf = vectorizer.fit_transform(text)
    # Rows are L2-normalised by TfidfVectorizer, so the dot products are cosine similarities.
    sims = (tfidf @ tfidf.T).toarray()
    # Keep only the text1-rows vs. text2-columns block of the similarity matrix.
    cross = sims[:lent1, lent1:]
    return [(text1[i], np.amax(cross[i])) for i in range(len(cross))]

def sort_pattern(x):
    return x[1]

def find_uncanny(text1, text2):
    '''Map each sufficiently long sentence of text1 to its best similarity
    score against the sentences of text2.'''
    # Drop very short fragments (fewer than 10 characters) before comparing.
    text1 = [k for k in text1 if len(k) >= 10]
    text2 = [k for k in text2 if len(k) >= 10]
    res = cosine_sim(text1, text2)
    return {r1: r2 for (r1, r2) in res}

def clean_sentence(dct, s):
    '''Strip newlines, digits and repeated spaces from s, and remember the
    original sentence in dct so it can be reported later.'''
    fs = s.replace("\n", "")
    fs = re.sub(r"[0-9]+", "", fs)
    fs = re.sub(r" +", " ", fs)
    dct[fs] = s
    return fs

def preprocess_colons(text):
    '''Split the text into sentences, treating ":" (except after http/https)
    and ",<n>." / ";<n>." enumeration markers as clause boundaries, then glue
    each clause back onto the sentence that introduced it.'''
    # Split on colons that are not part of http/https URLs.
    split_col = list(filter(None, re.split(r"(?<!(HTTP)|(http))(?<!(HTTPS)|(https)) *: *", text)))
    # Further split on enumeration markers such as ", 1." or "; 2.".
    split_col_num = [list(filter(None, re.split(r"[,;] *[0-9]+\.* *", t))) for t in split_col]
    fin_split = [[nltk.sent_tokenize(t) for t in ts] for ts in split_col_num]
    # Prepend the sentence that preceded each colon to the clauses that follow it.
    for i in range(1, len(fin_split)):
        prev_sentence = fin_split[i-1][-1].pop() + " "
        if len(fin_split[i]) == 1:
            end = len(fin_split[i][0]) - 1
        else:
            end = len(fin_split[i][0])
        for j in range(end):
            fin_split[i][0][j] = prev_sentence + fin_split[i][0][j]
    # Flatten the nested lists into a single list of sentences.
    return [item for sublist in fin_split for subsublist in sublist for item in subsublist]

def parse(f1):
    '''Compare the sentences of a license text f1 against the reference
    licenses in ./licenses and return up to five of the least similar
    (i.e. most unusual) sentences.'''
    dct = {}
    text1 = list(map(lambda s: clean_sentence(dct, s), preprocess_colons(f1)))
    # Drop sentences that contain URLs.
    reg = re.compile(r"http\S+|HTTP\S+")
    text1 = [st for st in text1 if not reg.search(st)]
    maxima = {}
    path = os.getcwd()
    for root, _, files in os.walk(os.path.join(path, 'licenses')):
        for file in files:
            if file.endswith(".txt") and file != 'apple_fixed.txt':
                with open(os.path.join(root, file), 'r') as f:
                    text2 = list(map(lambda s: s.replace("\n", ""), nltk.sent_tokenize(f.read())))
                uncanny = find_uncanny(text1, text2)
                # Keep the best similarity score seen so far for each sentence.
                for key in uncanny:
                    if key in maxima:
                        maxima[key] = max(uncanny[key], maxima[key])
                    else:
                        maxima[key] = uncanny[key]
    # Ignore sentences that match some reference license almost exactly.
    fin = [(key, maxima[key]) for key in maxima if abs(maxima[key] - 1) > 10.0**(-6)]
    fin.sort(key=sort_pattern)
    if len(fin) == 0:
        return "Pretty standard legalese here!"
    else:
        # Report the original (uncleaned) form of the five least similar sentences.
        return '\n'.join([dct[f[0]] for f in fin][:5])
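
# Minimal usage sketch (assumptions: "input_license.txt" is a hypothetical input
# file, and a ./licenses directory containing reference *.txt licenses exists
# under the current working directory, as parse() expects).
if __name__ == "__main__":
    with open("input_license.txt", "r") as f:
        print(parse(f.read()))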