-
Notifications
You must be signed in to change notification settings - Fork 3
/
create_tag_map.awk
58 lines (52 loc) · 1.44 KB
/
create_tag_map.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Set up tab-separated I/O and emit the header of the generated Python
# module, ending with the opening line of the TAG_MAP dict literal.
#
# NOTE(review): `poses` is not assigned anywhere in this script; presumably
# it is supplied on the command line (e.g. -v poses="NOUN, VERB") and holds
# the comma-separated spaCy POS symbols to import -- confirm against caller.
BEGIN {
    # Input and output both use tab as the field separator.
    FS = OFS = "\t"

    print "# coding: utf8"
    print "from __future__ import unicode_literals"
    # The explicit "\n" leaves one blank line between imports and TAG_MAP.
    print "from spacy.symbols import POS, " poses "\n"
    print "TAG_MAP = {"
}
# Per-record rule: emit one TAG_MAP entry for every data row.
#
# Records whose first field is empty or starts with "#" are skipped.
# Fields read from each remaining row:
#   $4  value written after "POS:" in the entry
#   $5  tag name, used as the dict key
#   $6  features as "Key=Value" pairs separated by "|"; "_" means none
#
# NOTE(review): `lines` is not assigned anywhere in this script; presumably
# it is passed with -v and holds the record number of the last input row so
# that the final entry is printed without a trailing comma -- confirm
# against caller.
substr($1, 1, 1) != "#" && length($1) > 0 {
    # Split the feature column on "|". split() returns the element count;
    # use that instead of length(array), which not every awk accepts.
    nfeats = split($6, feats, "|")

    # Build the `, "Key":"Value"` suffix for the entry.
    joined = ""
    for (i = 1; i <= nfeats; i++) {
        split(feats[i], kv, "=")
        # "_" is the empty placeholder (as in CCONJ or SCONJ): nothing to add.
        if (kv[1] != "_") {
            joined = joined ", \"" kv[1] "\":\"" kv[2] "\""
        }
    }

    # `joined` is deliberately NOT emitted for the time being: spaCy v2
    # can't deal with arbitrary feature values and some of ours are not
    # "acceptable" (https://github.com/explosion/spaCy/issues/6019).
    # To re-enable the features, print `$4 joined "}"` instead of `$4 "}"`.

    # Every entry except the one on record `lines` gets a trailing comma.
    sep = (NR != lines) ? "," : ""
    print " \"" $5 "\":{POS:" $4 "}" sep
}
# After the last record: close the TAG_MAP dict literal.
END { print "}" }
# Example of the kind of Python module this script is modelled on
# (a spaCy tag map):
#
# from ..symbols import POS, NOUN, VERB, DET
#
# TAG_MAP = {
#     "NNS": {POS: NOUN, "Number": "plur"},
#     "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
#     "DT": {POS: DET}
# }