-
Notifications
You must be signed in to change notification settings - Fork 0
/
notes.txt
executable file
·68 lines (54 loc) · 2.51 KB
/
notes.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
KenLM vs. SRILM scoring
>>> sentence2 = "Est es un casa"
>>> words = ['<s>'] + sentence2.split() + ['</s>']
>>> for i, (prob, length) in enumerate(klm.full_scores(sentence2)):
print "{} {}:{}".format(prob, length, ' '.join(words[i+2-length:i+2]))
-6.54681587219 2:<s> Est
-0.934829890728 2:Est es
-1.33603322506 2:es un
-5.18935441971 2:un casa
-1.61485874653 2:casa </s>
>>> import srilm
>>> slm.total_logprob_strings(["Est", "es", "un", "casa"])
-14.00703340768814
>>> -6.54681587219 + -0.934829890728 + -1.33603322506 + -5.18935441971 + -1.61485874653
-15.621892154218001
>>> -1.61485874653 + -14.00703340768814
-15.621892154218141
>>>
>>> slm.logprob_strings("</s>", ["casa", "un", "es", "<s>"])
-1.6148587465286255
>>> slm.total_logprob_strings(["es", "un", "casa"])
-8.953945636749268
>>> klm.score("es un casa")
-10.568803787231445
>>> slm.total_logprob_strings(["es", "un", "casa", "</s>"])
-10.568804383277893
>>> slm.total_logprob_strings(["<s>", "es", "un", "casa", "</s>"])
-inf
Results comparisons to keep
version 2 for entities vs version 3
devset
pruiz@pruiz-ubuntu-desktop:~/DATA/projects/Tweet-Norm/tnor2$ diff -y ../results2/devset_run_631_eval.txt ../results2/devset_run_641_eval.txt | grep ^NEG | grep POS | less
testset
630 vs. 640 for version 2 vs. 3
In the case of the testset, we can also compare 618 with 630 for version 1 vs. version 2.
Settings that bring it back to best results
# ISOLATE COMPONENTS -----------------------------------------------------------
generic_workflow = bool(1) # 0 if applying components separately
use_lmall = bool(0) # new lm workflow
no_postprocessing = bool(0)
activate_prepro = bool(1) # 1 if gonna use one of safelist, abbrev, runin, regex
safelist_end = bool(0) # stop after safelist
abbrev_end = bool(0) # stop after abbrev
#runin_end = bool(0) # no need, cos at that point, the function returns anyway
use_regexes = bool(1) ; regex_end = bool(0) # use_regexes to turn on/off. Both 1 means regexes ONLY
trusted_end = bool(0)
trusted_and_regex_end = bool(0)
use_ed = bool(1) # Edit candidates
#edcand_end = bool(0) # Edit candidates only: No need, cos use_entities 0 does it
use_entities = bool(1)
#
Problem with OOV in all caps:
827 vs. 843: now solved; all-caps and initial-caps forms found in the IV dictionary are now resolved.
The other differences between 827 and 843 are due to increment_norm = False in 827.