diff --git a/irtypes.tdl b/irtypes.tdl index e933f9c..2dda822 100644 --- a/irtypes.tdl +++ b/irtypes.tdl @@ -232,7 +232,6 @@ fem_pl_ilr := fem_ilr & pl_ilr. neut_sg_ilr := neut_ilr & sg_ilr. neut_pl_ilr := neut_ilr & sg_ilr. - ; -- determiners d_ilr := infl-ltow-rule & diff --git a/letypes.tdl b/letypes.tdl index dd6abe3..357e138 100644 --- a/letypes.tdl +++ b/letypes.tdl @@ -1229,7 +1229,6 @@ nom-subj-synsem := lex-synsem & """. - arg1_lt := nom-subj-synsem & [ LOCAL.CAT.VAL.SUBJ < [ LOCAL.CONT.HOOK.INDEX #ind ] >, LKEYS.KEYREL arg1-ev-relation & @@ -4612,6 +4611,7 @@ v_ap_ser_synsem := v_copula_synsem & """ e.g. Juan es feo removed LOCAL [ CAT.VAL COMPS < [ LOCAL [ CAT.HEAD [ KEYS.KEY basic_adj_rel ], to cover "será complicado" + OZ 2024-01-31: Added AGR #sind in MOD to rule out "Ella es bueno" """. diff --git a/lexicon.tdl b/lexicon.tdl index b591675..9580bd4 100644 --- a/lexicon.tdl +++ b/lexicon.tdl @@ -1094,10 +1094,9 @@ abrillantar_v-np := v_np_native_le & [ STEM < "abrillantar" >, SYNSEM.LKEYS.KEYREL.PRED "_abrillantar_v_rel" ]. -abril_n := n_-_c_native_le & +abril_n := n_-_c-tmp_native_le & [ STEM < "abril" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_abril_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. abrir_v-np-pp_a-sbj_cp_p := v_np-ppa*_sbj-cp-p-sub_native_le & [ STEM < "abrir" >, @@ -8548,10 +8547,9 @@ agostar_v-np_rfx := v_np_rfx_native_le & [ STEM < "agostar" >, SYNSEM.LKEYS.KEYREL.PRED "_agostar_v_rel" ]. -agosto_n := n_-_c_native_le & +agosto_n := n_-_c-tmp_native_le & [ STEM < "agosto" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_agosto_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. agotable_aj-nspd := aj_-_i-nspd_native_le & [ STEM < "agotable" >, @@ -77901,10 +77899,9 @@ diciclopentadieno_n := n_-_nc_native_le & SYNSEM.LKEYS.KEYREL [ ARG0.SORT abs, PRED "_diciclopentadieno_n_rel" ] ]. 
-diciembre_n := n_-_c_native_le & +diciembre_n := n_-_c-tmp_native_le & [ STEM < "diciembre" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_diciembre_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. dicloroacetato_n := n_-_c_native_le & [ STEM < "dicloroacetato" >, @@ -88926,10 +88923,9 @@ energúmeno_aj-i := aj_-_i_native_le & [ STEM < "energúmeno" >, SYNSEM.LKEYS.KEYREL.PRED "_energúmeno_a_rel" ]. -enero_n := n_-_c_native_le & +enero_n := n_-_c-tmp_native_le & [ STEM < "enero" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_enero_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. enervar_v-np_rfx := v_np_rfx_native_le & [ STEM < "enervar" >, @@ -104796,10 +104792,9 @@ fealdad_n-pp := n_pp_mc_native_le & SYNSEM.LKEYS.KEYREL [ ARG0.SORT abs, PRED "_fealdad_n_rel" ] ]. -febrero_n := n_-_c_native_le & +febrero_n := n_-_c-tmp_native_le & [ STEM < "febrero" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_febrero_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. febrífugo_aj-i-nprd := aj_-_i-nprd_native_le & [ STEM < "febrífugo" >, @@ -139880,10 +139875,9 @@ juicioso_aj-i := aj_-_i_native_le & [ STEM < "juicioso" >, SYNSEM.LKEYS.KEYREL.PRED "_juicioso_a_rel" ]. -julio_n := n_-_c_native_le & +julio_n := n_-_c-tmp_native_le & [ STEM < "julio" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp_unit, - PRED "_julio_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. julio_n-part := n_pp_psd-part_native_le & [ STEM < "julio" >, @@ -139904,10 +139898,9 @@ jungla_n := n_-_c_native_le & SYNSEM.LKEYS.KEYREL [ ARG0.SORT loc, PRED "_jungla_n_rel" ] ]. -junio_n := n_-_c_native_le & +junio_n := n_-_c-tmp_native_le & [ STEM < "junio" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_junio_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. junior_n := n_-_c_native_le & [ STEM < "junior" >, @@ -150917,10 +150910,9 @@ marxista_n := n_-_c_native_le & SYNSEM.LKEYS.KEYREL [ ARG0.SORT hum, PRED "_marxista_n_rel" ] ]. 
-marzo_n := n_-_c_native_le & +marzo_n := n_-_c-tmp_native_le & [ STEM < "marzo" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_marzo_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. masacrar_v-np := v_np_native_le & [ STEM < "masacrar" >, @@ -151823,10 +151815,9 @@ mayonesa_n := n_-_c_native_le & SYNSEM.LKEYS.KEYREL [ ARG0.SORT cnc, PRED "_mayonesa_n_rel" ] ]. -mayo_n := n_-_c_native_le & +mayo_n := n_-_c-tmp_native_le & [ STEM < "mayo" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_mayo_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. mayor_aj-pp-cmp := aj_pp_i-cmp_native_le & [ STEM < "mayor" >, @@ -166210,10 +166201,9 @@ novicio_n := n_-_c_native_le & SYNSEM.LKEYS.KEYREL [ ARG0.SORT hum, PRED "_novicio_n_rel" ] ]. -noviemtre_n := n_-_c_native_le & - [ STEM < "noviemtre" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_noviemtre_n_rel" ] ]. +noviembre_n := n_-_c-tmp_native_le & + [ STEM < "noviembre" >, + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. novillo_n := n_-_c_native_le & [ STEM < "novillo" >, @@ -167726,10 +167716,9 @@ octovalente_aj-i-nprd := aj_-_i-nprd_native_le & [ STEM < "octovalente" >, SYNSEM.LKEYS.KEYREL.PRED "_octovalente_a_rel" ]. -octubre_n := n_-_c_native_le & +octubre_n := n_-_c-tmp_native_le & [ STEM < "octubre" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_octubre_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. óctuple_aj-i-nprd := aj_-_i-nprd_native_le & [ STEM < "óctuple" >, @@ -214632,10 +214621,9 @@ séptico_aj-i-nprd := aj_-_i-nprd_native_le & [ STEM < "séptico" >, SYNSEM.LKEYS.KEYREL.PRED "_séptico_a_rel" ]. -septiembre_n := n_-_c_native_le & +septiembre_n := n_-_c-tmp_native_le & [ STEM < "septiembre" >, - SYNSEM.LKEYS.KEYREL [ ARG0.SORT tmp, - PRED "_septiembre_n_rel" ] ]. + SYNSEM.LKEYS.KEYREL.ARG0.SORT tmp ]. 
septisómico_aj-i-nprd := aj_-_i-nprd_native_le & [ STEM < "septisómico" >, diff --git a/tsdb/skeletons/mrs/item b/tsdb/skeletons/mrs/item index e6e6134..2a46b4d 100755 --- a/tsdb/skeletons/mrs/item +++ b/tsdb/skeletons/mrs/item @@ -31,7 +31,7 @@ 311@unknown@formal@none@1@S@El perro que perseguía a Núria lladró.@@@@1@7@The dog that Browne chased barked.@oe@21-9-2008 321@unknown@formal@none@1@S@El perro a perseguir está ladrando@@@@1@6@The dog to chase is barking.@oe@21-9-2008 331@unknown@formal@none@1@S@El perro fue perseguido por Núria@@@@1@6@The dog was chased by Browne.@oe@21-9-2008 -341@unknown@formal@none@1@S@El perro perseguido por Núria ladró@@@@1@6@The dog chased by Browne barked.@oe@21-9-2008 +341@unknown@formal@none@1@S@El perro perseguido por el gato ladró@@@@1@7@The dog chased by the cat barked.@oe@21-9-2008 351@unknown@formal@none@1@S@El perro está ladrando.@@@@1@4@The dog is barking.@oe@21-9-2007 361@unknown@formal@none@1@S@El perro ha ladrado.@@@@1@4@The dog has barked.@oe@21-9-2007 371@unknown@formal@none@1@S@El perro ha estado ladrando.@@@@1@5@The dog has been barking.@oe@21-9-2007 diff --git a/util/freeling_api/srg-freeling.dat b/util/freeling_api/srg-freeling.dat index 19c9c64..2db8dc3 100644 --- a/util/freeling_api/srg-freeling.dat +++ b/util/freeling_api/srg-freeling.dat @@ -37,9 +37,8 @@ alguna @any algunos @any algunas @any rueda @any -peor @any -mejor @any divertido @any +pasado @any ## List of words for which the list of output analysis given @@ -106,6 +105,7 @@ escasa escaso AQ0FS00 escaso DI0FS0 escasas escaso AQ0FP00 escaso DI0FP0 escaso escaso AQ0MS00 escaso DI0MS0 escasos escaso AQ0MP00 escaso DI0MP0 +helado helado NCMS000 helado VMP00SM numerosa numeroso AQ0FS00 numeroso DI0FS0 numerosas numeroso AQ0FP00 numeroso DI0FP0 numeroso numeroso AQ0MS00 numeroso DI0MS0 @@ -117,11 +117,24 @@ raros raro AQ0MP00 raro DI0MP0 cientos ciento Zd millares millar Zd miles mil Zd +mejor mejor AQ0CS00 mejor RG +mejores mejor AQ0CP00 +peor peor AQ0CS00 
peor RG +peores peor AQ0CP00 +mayor mayor AQ0CS00 +mayores mayor AQ0CP00 +menor menor AQ0CS00 +menores menor AQ0CP00 off-line off-line AQ0CN00 on-line on-line AQ0CN00 no_sólo no_sólo CC no_sólo RG hace hace SP hacer VMIP3S0 hacía hacía SP hacer VMII3S0 +cerca cerca RG cercar VMIP3S0 +mar. marzo NCMS000 mar NCMS000 +favorito favorito AQ0MS00 favorito NCMS000 +favoritos favorito AQ0MP00 favorito NCMP000 + ## List of tag fusions to perform. diff --git a/util/srg_freeling2yy.py b/util/srg_freeling2yy.py index 67eb001..1befa8a 100755 --- a/util/srg_freeling2yy.py +++ b/util/srg_freeling2yy.py @@ -40,8 +40,8 @@ def override_tag(selected, word, lemma, tag, override_dicts): return {'tags': [tags[0]+'+'+tags[1][:-3]], 'prob': -1 } else: print("More than four tags in Freeling output: {}".format(selected['tag'])) - if lemma in override_dicts['replace'] and len(override_dicts['replace'][lemma]['lemma']) == 1: - return {'tags': override_dicts['replace'][lemma]['tag'], 'prob': -1 } + if word in override_dicts['replace'] and len(override_dicts['replace'][word]['lemma']) == 1: + return {'tags': override_dicts['replace'][word]['tag'], 'prob': -1 } return {'tags': [selected['tag']], 'prob': selected['prob']} #raise Exception("selected tag not in tag list") @@ -77,8 +77,8 @@ def convert_sentences(sentences, override_dicts): for j,tok in enumerate(sent['tokens']): is_additional = tok['additional'] surface = tok['form'] - #if surface == 'primer': - # print('debug') + if surface == 'mar.': + print('debug') tag_prob = {'tag': tok['tag'], 'prob':tok['prob']} pos_conf = override_tag(tag_prob, surface.lower(), tok['lemma'], tok['tag'], override_dicts) if len(pos_conf['tags']) > 1: diff --git a/util/tokenize_and_tag.py b/util/tokenize_and_tag.py index 0e3a297..6200232 100755 --- a/util/tokenize_and_tag.py +++ b/util/tokenize_and_tag.py @@ -99,7 +99,8 @@ def tokenize_and_tag(self, sentence_list, override_dicts): tag = '" "+'.join([tp['tag'] for tp in tags_probs]) prob = tags_probs[-1]['prob'] 
#print("lemma: {}, form: {}, start: {}, end: {}, tag: {}".format(w.get_lemma(), w.get_form(), w.get_span_start(), w.get_span_finish(), w.get_tag())) - output[i]['tokens'].append({'lemma':w.get_lemma(), 'form': w.get_form(), + lemma = w.get_lemma() if not 'lemma' in tags_probs[-1] else tags_probs[-1]['lemma'] + output[i]['tokens'].append({'lemma': lemma, 'form': w.get_form(), 'start':w.get_span_start(), 'end': w.get_span_finish(), 'tag': tag, 'prob': prob, 'additional': additional}) for k,arc in enumerate(additional_arcs): @@ -129,8 +130,9 @@ def freeling_analyze(self, lin, sid): def get_selected_tags(self, w, override_dicts): tags = [] additional_arcs = [] - #if w.get_form().lower() == "primer": - # print("debug") + seen = set() + if w.get_form().lower() == "mar.": + print("debug") for a in w: if a.is_selected(): if a.is_retokenizable(): @@ -138,15 +140,20 @@ def get_selected_tags(self, w, override_dicts): for tk in tks: tags.append(({'tag': tk.get_tag(), 'prob': a.get_prob()})) else: - if not w.get_form().lower() in override_dicts['replace']: + needs_replacement = w.get_form().lower() in override_dicts['replace'] + if not needs_replacement: tags.append(({'additional':False, 'tag': a.get_tag(), 'prob': a.get_prob()})) else: for i, additional_tag in enumerate(override_dicts['replace'][w.get_form().lower()]['tag']): additional_lemma = override_dicts['replace'][w.get_form().lower()]['lemma'][i] if i == 0: - tags.append(({'additional':True, 'tag': additional_tag, 'prob': -1, 'lemma': additional_lemma})) + if (additional_tag, additional_lemma) not in seen: + tags.append(({'additional':True, 'tag': additional_tag, 'prob': -1, 'lemma': additional_lemma})) + seen.add((additional_tag, additional_lemma)) else: - additional_arcs.append(({'additional':True, 'tag': additional_tag, 'prob': -1, 'lemma': additional_lemma})) + if (additional_tag, additional_lemma) not in seen: + additional_arcs.append(({'additional':True, 'tag': additional_tag, 'prob': -1, 'lemma': 
additional_lemma})) + seen.add((additional_tag, additional_lemma)) else: # There are words for which Freeling selected analysis should be ignored (no analysis discarded). # In principle, there is also one tag for which it should be done if the word is in the first position: @@ -160,5 +167,5 @@ def get_selected_tags(self, w, override_dicts): add_tags = ADD_TAGS[a.get_tag()].split(',') for mt in add_tags: additional_arcs.append(({'additional': True, 'tag': mt.strip(), 'prob': -1, 'lemma': a.get_lemma()})) - return tags, additional_arcs + diff --git a/util/treebanking-scripts/ace-process-all.sh b/util/treebanking-scripts/ace-process-all.sh index ed0995b..616c6a6 100755 --- a/util/treebanking-scripts/ace-process-all.sh +++ b/util/treebanking-scripts/ace-process-all.sh @@ -7,6 +7,6 @@ directory="$1" for profile in "$directory"/*; do echo $profile - delphin process --options="-y --yy-rules --max-chart-megabytes=24000 --max-unpack-megabytes=24000" -g ~/delphin/SRG/grammar/srg/ace/srg.dat --full-forest --select i-tokens "$profile" + delphin process --options="-1 -p -y --yy-rules --max-chart-megabytes=48000 --max-unpack-megabytes=56000" -g ~/delphin/SRG/grammar/srg/ace/srg.dat --full-forest --select i-tokens "$profile" done diff --git a/util/treebanking-scripts/ace-process.sh b/util/treebanking-scripts/ace-process.sh index 4caaf35..8158007 100755 --- a/util/treebanking-scripts/ace-process.sh +++ b/util/treebanking-scripts/ace-process.sh @@ -5,4 +5,4 @@ profile="$1" -delphin process --options="-y --yy-rules -1" -g ~/delphin/SRG/grammar/srg/ace/srg.dat --full-forest --select i-tokens ~/delphin/SRG/treebanks/dev/all/$profile +delphin process --options="-1 -p -y --yy-rules --max-chart-megabytes=48000 --max-unpack-megabytes=56000" -g ~/delphin/SRG/grammar/srg/ace/srg.dat --full-forest --select i-tokens $profile