Skip to content

Commit

Permalink
changes mainly in the lexicon, for months of the year, to invoke time-related lexical entries. A bit better than what was there before, though still lacking a CARG value.
Browse files Browse the repository at this point in the history
  • Loading branch information
olzama committed May 29, 2024
1 parent c5b83ec commit 4fe97f8
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 55 deletions.
1 change: 0 additions & 1 deletion irtypes.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,6 @@ fem_pl_ilr := fem_ilr & pl_ilr.
neut_sg_ilr := neut_ilr & sg_ilr.
neut_pl_ilr := neut_ilr & sg_ilr.


; -- determiners

d_ilr := infl-ltow-rule &
Expand Down
2 changes: 1 addition & 1 deletion letypes.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -1229,7 +1229,6 @@ nom-subj-synsem := lex-synsem &

""".


arg1_lt := nom-subj-synsem &
[ LOCAL.CAT.VAL.SUBJ < [ LOCAL.CONT.HOOK.INDEX #ind ] >,
LKEYS.KEYREL arg1-ev-relation &
Expand Down Expand Up @@ -4612,6 +4611,7 @@ v_ap_ser_synsem := v_copula_synsem &
"""
e.g. Juan es feo
removed LOCAL [ CAT.VAL COMPS < [ LOCAL [ CAT.HEAD [ KEYS.KEY basic_adj_rel ], to cover "será complicado"
OZ 2024-01-31: Added AGR #sind in MOD to rule out "Ella es bueno"
""".


Expand Down
62 changes: 25 additions & 37 deletions lexicon.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -1094,10 +1094,9 @@ abrillantar_v-np := v_np_native_le &
[ STEM < "abrillantar" >,
SYNSEM.LKEYS.KEYREL.PRED "_abrillantar_v_rel" ].

abril_n := n_-_c_native_le &
abril_n := n_-_c-tmp_native_le &
[ STEM < "abril" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_abril_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

abrir_v-np-pp_a-sbj_cp_p := v_np-ppa*_sbj-cp-p-sub_native_le &
[ STEM < "abrir" >,
Expand Down Expand Up @@ -8548,10 +8547,9 @@ agostar_v-np_rfx := v_np_rfx_native_le &
[ STEM < "agostar" >,
SYNSEM.LKEYS.KEYREL.PRED "_agostar_v_rel" ].

agosto_n := n_-_c_native_le &
agosto_n := n_-_c-tmp_native_le &
[ STEM < "agosto" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_agosto_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

agotable_aj-nspd := aj_-_i-nspd_native_le &
[ STEM < "agotable" >,
Expand Down Expand Up @@ -77901,10 +77899,9 @@ diciclopentadieno_n := n_-_nc_native_le &
SYNSEM.LKEYS.KEYREL [ ARG0.SORT abs,
PRED "_diciclopentadieno_n_rel" ] ].

diciembre_n := n_-_c_native_le &
diciembre_n := n_-_c-tmp_native_le &
[ STEM < "diciembre" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_diciembre_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

dicloroacetato_n := n_-_c_native_le &
[ STEM < "dicloroacetato" >,
Expand Down Expand Up @@ -88926,10 +88923,9 @@ energúmeno_aj-i := aj_-_i_native_le &
[ STEM < "energúmeno" >,
SYNSEM.LKEYS.KEYREL.PRED "_energúmeno_a_rel" ].

enero_n := n_-_c_native_le &
enero_n := n_-_c-tmp_native_le &
[ STEM < "enero" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_enero_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

enervar_v-np_rfx := v_np_rfx_native_le &
[ STEM < "enervar" >,
Expand Down Expand Up @@ -104796,10 +104792,9 @@ fealdad_n-pp := n_pp_mc_native_le &
SYNSEM.LKEYS.KEYREL [ ARG0.SORT abs,
PRED "_fealdad_n_rel" ] ].

febrero_n := n_-_c_native_le &
febrero_n := n_-_c-tmp_native_le &
[ STEM < "febrero" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_febrero_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

febrífugo_aj-i-nprd := aj_-_i-nprd_native_le &
[ STEM < "febrífugo" >,
Expand Down Expand Up @@ -139880,10 +139875,9 @@ juicioso_aj-i := aj_-_i_native_le &
[ STEM < "juicioso" >,
SYNSEM.LKEYS.KEYREL.PRED "_juicioso_a_rel" ].

julio_n := n_-_c_native_le &
julio_n := n_-_c-tmp_native_le &
[ STEM < "julio" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp_unit,
PRED "_julio_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

julio_n-part := n_pp_psd-part_native_le &
[ STEM < "julio" >,
Expand All @@ -139904,10 +139898,9 @@ jungla_n := n_-_c_native_le &
SYNSEM.LKEYS.KEYREL [ ARG0.SORT loc,
PRED "_jungla_n_rel" ] ].

junio_n := n_-_c_native_le &
junio_n := n_-_c-tmp_native_le &
[ STEM < "junio" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_junio_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

junior_n := n_-_c_native_le &
[ STEM < "junior" >,
Expand Down Expand Up @@ -150917,10 +150910,9 @@ marxista_n := n_-_c_native_le &
SYNSEM.LKEYS.KEYREL [ ARG0.SORT hum,
PRED "_marxista_n_rel" ] ].

marzo_n := n_-_c_native_le &
marzo_n := n_-_c-tmp_native_le &
[ STEM < "marzo" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_marzo_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

masacrar_v-np := v_np_native_le &
[ STEM < "masacrar" >,
Expand Down Expand Up @@ -151823,10 +151815,9 @@ mayonesa_n := n_-_c_native_le &
SYNSEM.LKEYS.KEYREL [ ARG0.SORT cnc,
PRED "_mayonesa_n_rel" ] ].

mayo_n := n_-_c_native_le &
mayo_n := n_-_c-tmp_native_le &
[ STEM < "mayo" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_mayo_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

mayor_aj-pp-cmp := aj_pp_i-cmp_native_le &
[ STEM < "mayor" >,
Expand Down Expand Up @@ -166210,10 +166201,9 @@ novicio_n := n_-_c_native_le &
SYNSEM.LKEYS.KEYREL [ ARG0.SORT hum,
PRED "_novicio_n_rel" ] ].

noviemtre_n := n_-_c_native_le &
[ STEM < "noviemtre" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_noviemtre_n_rel" ] ].
noviembre_n := n_-_c-tmp_native_le &
[ STEM < "noviembre" >,
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

novillo_n := n_-_c_native_le &
[ STEM < "novillo" >,
Expand Down Expand Up @@ -167726,10 +167716,9 @@ octovalente_aj-i-nprd := aj_-_i-nprd_native_le &
[ STEM < "octovalente" >,
SYNSEM.LKEYS.KEYREL.PRED "_octovalente_a_rel" ].

octubre_n := n_-_c_native_le &
octubre_n := n_-_c-tmp_native_le &
[ STEM < "octubre" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_octubre_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

óctuple_aj-i-nprd := aj_-_i-nprd_native_le &
[ STEM < "óctuple" >,
Expand Down Expand Up @@ -214632,10 +214621,9 @@ séptico_aj-i-nprd := aj_-_i-nprd_native_le &
[ STEM < "séptico" >,
SYNSEM.LKEYS.KEYREL.PRED "_séptico_a_rel" ].

septiembre_n := n_-_c_native_le &
septiembre_n := n_-_c-tmp_native_le &
[ STEM < "septiembre" >,
SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp,
PRED "_septiembre_n_rel" ] ].
SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ].

septisómico_aj-i-nprd := aj_-_i-nprd_native_le &
[ STEM < "septisómico" >,
Expand Down
2 changes: 1 addition & 1 deletion tsdb/skeletons/mrs/item
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
311@unknown@formal@none@1@S@El perro que perseguía a Núria lladró.@@@@1@7@The dog that Browne chased barked.@oe@21-9-2008
321@unknown@formal@none@1@S@El perro a perseguir está ladrando@@@@1@6@The dog to chase is barking.@oe@21-9-2008
331@unknown@formal@none@1@S@El perro fue perseguido por Núria@@@@1@6@The dog was chased by Browne.@oe@21-9-2008
341@unknown@formal@none@1@S@El perro perseguido por Núria ladró@@@@1@6@The dog chased by Browne barked.@oe@21-9-2008
341@unknown@formal@none@1@S@El perro perseguido por el gato ladró@@@@1@6@The dog chased by the cat barked.@oe@21-9-2008
351@unknown@formal@none@1@S@El perro está ladrando.@@@@1@4@The dog is barking.@oe@21-9-2007
361@unknown@formal@none@1@S@El perro ha ladrado.@@@@1@4@The dog has barked.@oe@21-9-2007
371@unknown@formal@none@1@S@El perro ha estado ladrando.@@@@1@5@The dog has been barking.@oe@21-9-2007
Expand Down
17 changes: 15 additions & 2 deletions util/freeling_api/srg-freeling.dat
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,8 @@ alguna @any
algunos @any
algunas @any
rueda @any
peor @any
mejor @any
divertido @any
pasado @any
</NoDisambiguate>

## List of words for which the list of output analysis given
Expand Down Expand Up @@ -106,6 +105,7 @@ escasa escaso AQ0FS00 escaso DI0FS0
escasas escaso AQ0FP00 escaso DI0FP0
escaso escaso AQ0MS00 escaso DI0MS0
escasos escaso AQ0MP00 escaso DI0MP0
helado helado NCMS000 helado VMP00SM
numerosa numeroso AQ0FS00 numeroso DI0FS0
numerosas numeroso AQ0FP00 numeroso DI0FP0
numeroso numeroso AQ0MS00 numeroso DI0MS0
Expand All @@ -117,11 +117,24 @@ raros raro AQ0MP00 raro DI0MP0
cientos ciento Zd
millares millar Zd
miles mil Zd
mejor mejor AQ0CS00 mejor RG
mejores mejor AQ0CP00
peor peor AQ0CS00 peor RG
peores peor AQ0CP00
mayor mayor AQ0CS00
mayores mayor AQ0CP00
menor menor AQ0CS00
menores menor AQ0CP00
off-line off-line AQ0CN00
on-line on-line AQ0CN00
no_sólo no_sólo CC no_sólo RG
hace hace SP hacer VMIP3S0
hacía hacía SP hacer VMII3S0
cerca cerca RG cercar VMIP3S0
mar. marzo NCMS000 mar NCMS000
favorito favorito AQ0MS00 favorito NCMS000
favoritos favorito AQ0MP00 favorito NCMP000

</ReplaceAll>

## List of tag fusions to perform.
Expand Down
8 changes: 4 additions & 4 deletions util/srg_freeling2yy.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ def override_tag(selected, word, lemma, tag, override_dicts):
return {'tags': [tags[0]+'+'+tags[1][:-3]], 'prob': -1 }
else:
print("More than four tags in Freeling output: {}".format(selected['tag']))
if lemma in override_dicts['replace'] and len(override_dicts['replace'][lemma]['lemma']) == 1:
return {'tags': override_dicts['replace'][lemma]['tag'], 'prob': -1 }
if word in override_dicts['replace'] and len(override_dicts['replace'][word]['lemma']) == 1:
return {'tags': override_dicts['replace'][word]['tag'], 'prob': -1 }
return {'tags': [selected['tag']], 'prob': selected['prob']}
#raise Exception("selected tag not in tag list")

Expand Down Expand Up @@ -77,8 +77,8 @@ def convert_sentences(sentences, override_dicts):
for j,tok in enumerate(sent['tokens']):
is_additional = tok['additional']
surface = tok['form']
#if surface == 'primer':
# print('debug')
if surface == 'mar.':
print('debug')
tag_prob = {'tag': tok['tag'], 'prob':tok['prob']}
pos_conf = override_tag(tag_prob, surface.lower(), tok['lemma'], tok['tag'], override_dicts)
if len(pos_conf['tags']) > 1:
Expand Down
21 changes: 14 additions & 7 deletions util/tokenize_and_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ def tokenize_and_tag(self, sentence_list, override_dicts):
tag = '" "+'.join([tp['tag'] for tp in tags_probs])
prob = tags_probs[-1]['prob']
#print("lemma: {}, form: {}, start: {}, end: {}, tag: {}".format(w.get_lemma(), w.get_form(), w.get_span_start(), w.get_span_finish(), w.get_tag()))
output[i]['tokens'].append({'lemma':w.get_lemma(), 'form': w.get_form(),
lemma = w.get_lemma() if not 'lemma' in tags_probs[-1] else tags_probs[-1]['lemma']
output[i]['tokens'].append({'lemma': lemma, 'form': w.get_form(),
'start':w.get_span_start(), 'end': w.get_span_finish(),
'tag': tag, 'prob': prob, 'additional': additional})
for k,arc in enumerate(additional_arcs):
Expand Down Expand Up @@ -129,24 +130,30 @@ def freeling_analyze(self, lin, sid):
def get_selected_tags(self, w, override_dicts):
tags = []
additional_arcs = []
#if w.get_form().lower() == "primer":
# print("debug")
seen = set()
if w.get_form().lower() == "mar.":
print("debug")
for a in w:
if a.is_selected():
if a.is_retokenizable():
tks = a.get_retokenizable()
for tk in tks:
tags.append(({'tag': tk.get_tag(), 'prob': a.get_prob()}))
else:
if not w.get_form().lower() in override_dicts['replace']:
needs_replacement = w.get_form().lower() in override_dicts['replace']
if not needs_replacement:
tags.append(({'additional':False, 'tag': a.get_tag(), 'prob': a.get_prob()}))
else:
for i, additional_tag in enumerate(override_dicts['replace'][w.get_form().lower()]['tag']):
additional_lemma = override_dicts['replace'][w.get_form().lower()]['lemma'][i]
if i == 0:
tags.append(({'additional':True, 'tag': additional_tag, 'prob': -1, 'lemma': additional_lemma}))
if (additional_tag, additional_lemma) not in seen:
tags.append(({'additional':True, 'tag': additional_tag, 'prob': -1, 'lemma': additional_lemma}))
seen.add((additional_tag, additional_lemma))
else:
additional_arcs.append(({'additional':True, 'tag': additional_tag, 'prob': -1, 'lemma': additional_lemma}))
if (additional_tag, additional_lemma) not in seen:
additional_arcs.append(({'additional':True, 'tag': additional_tag, 'prob': -1, 'lemma': additional_lemma}))
seen.add((additional_tag, additional_lemma))
else:
# There are words for which Freeling selected analysis should be ignored (no analysis discarded).
# In principle, there is also one tag for which it should be done if the word is in the first position:
Expand All @@ -160,5 +167,5 @@ def get_selected_tags(self, w, override_dicts):
add_tags = ADD_TAGS[a.get_tag()].split(',')
for mt in add_tags:
additional_arcs.append(({'additional': True, 'tag': mt.strip(), 'prob': -1, 'lemma': a.get_lemma()}))

return tags, additional_arcs

2 changes: 1 addition & 1 deletion util/treebanking-scripts/ace-process-all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ directory="$1"

for profile in "$directory"/*; do
echo $profile
delphin process --options="-y --yy-rules --max-chart-megabytes=24000 --max-unpack-megabytes=24000" -g ~/delphin/SRG/grammar/srg/ace/srg.dat --full-forest --select i-tokens "$profile"
delphin process --options="-1 -p -y --yy-rules --max-chart-megabytes=48000 --max-unpack-megabytes=56000" -g ~/delphin/SRG/grammar/srg/ace/srg.dat --full-forest --select i-tokens "$profile"
done

2 changes: 1 addition & 1 deletion util/treebanking-scripts/ace-process.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@

profile="$1"

delphin process --options="-y --yy-rules -1" -g ~/delphin/SRG/grammar/srg/ace/srg.dat --full-forest --select i-tokens ~/delphin/SRG/treebanks/dev/all/$profile
delphin process --options="-1 -p -y --yy-rules --max-chart-megabytes=48000 --max-unpack-megabytes=56000" -g ~/delphin/SRG/grammar/srg/ace/srg.dat --full-forest --select i-tokens $profile

0 comments on commit 4fe97f8

Please sign in to comment.