-
Notifications
You must be signed in to change notification settings - Fork 0
/
we_sensesim.py
112 lines (99 loc) · 3.67 KB
/
we_sensesim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
import sys
import utils
import os
from collections import defaultdict
from nltk.corpus import wordnet as wn
from scipy.spatial.distance import cosine
from scipy.spatial.distance import correlation
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
from utils import trim
import pdb
"""
Sense embedding format: see https://github.com/sjauhar/SenseRetrofit
Use ',' to seperate Datasets
"""
def run(path, fname):
    """Evaluate sense embeddings on word-similarity datasets.

    Reads sense vectors from the file named by sys.argv[1] (format: see
    https://github.com/sjauhar/SenseRetrofit).  For each dataset file in
    `fname` (located under `path`), computes the Spearman correlation
    between the human similarity scores and two per-pair similarity
    measures: MaxSim (max cosine over all sense pairs) and AvgSim (mean
    cosine over all sense pairs), following Reisinger & Mooney,
    "Multi-Prototype Vector-Space Models of Word Meaning" (NAACL 2010),
    section 3.2.  Results are printed as tab/comma-separated rows.

    path  -- directory containing the evaluation dataset files
    fname -- list of dataset file names; each entry may be a
             comma-separated list, of which only the first is used
    """
    wvs = utils.readWordVecs(sys.argv[1])
    print("Finish reading vector!")
    # wvssen: set of surface words (dict used as a set, values unused);
    # s_list: word -> list of its sense keys ("word%sense_id").
    wvssen = {}
    s_list = defaultdict(list)
    for sense in wvs:
        word = sense.split("%")[0]
        wvssen[word] = ''
        s_list[word].append(sense)
    # Fallback vector for out-of-vocabulary words.  list() is required on
    # Python 3, where dict.values() is a view that np.mean cannot stack.
    mean_vector = np.mean(list(wvs.values()), axis=0)
    spear_score_max = []
    spear_score_avg = []
    for name in fname:
        full_path = os.path.join(path, name)
        filenames = os.path.expanduser(full_path).split(',')
        pairs, scores = utils.readDataset(filenames[0], no_skip=True)
        coefs_max = []
        coefs_avg = []
        missing = 0  # count of pair slots backed off to mean_vector
        for pair in pairs:
            vecs0 = []
            trimed_p0 = trim(pair[0], wvssen)
            if trimed_p0 not in wvssen:
                vecs0.append(mean_vector)
                missing += 1
            else:
                for sense in s_list[trimed_p0]:
                    vecs0.append(wvs[sense])
            vecs1 = []
            trimed_p1 = trim(pair[1], wvssen)
            if trimed_p1 not in wvssen:
                vecs1.append(mean_vector)
                missing += 1
            else:
                for sense in s_list[trimed_p1]:
                    vecs1.append(wvs[sense])
            # Compute all cross-sense cosine similarities once and reuse
            # for both MaxSim and AvgSim (the original built the list twice).
            sims = [1 - cosine(a, b) for a in vecs0 for b in vecs1]
            coefs_max.append(max(sims))
            coefs_avg.append(np.mean(sims))
        spear_max = spearmanr(scores, coefs_max)
        spear_avg = spearmanr(scores, coefs_avg)
        spear_score_max.append(spear_max[0])
        spear_score_avg.append(spear_avg[0])
    # Report: one column per dataset (Python 3 print; the original used
    # Python 2 trailing-comma print statements, a SyntaxError on Python 3).
    print('type \t', end='')
    for name in fname:
        print(name.split('.')[0], end=' ')
    print('\nspear max\t', end='')
    for score in spear_score_max:
        print('%.04f,' % score, end=' ')
    print('\nspear avg\t', end='')
    for score in spear_score_avg:
        print('%.04f,' % score, end=' ')
    print()
if __name__ == "__main__":
run('./eval_data', ['EN-MEN-n.txt', 'EN-MEN-l.txt', 'EN-TRUK.txt', 'EN-RW.txt', 'EN-WS353.txt', 'EN-WS353-s.txt', 'EN-WS353-r.txt'])