From 624f3801f865435996fd0396712d758dbee1dd68 Mon Sep 17 00:00:00 2001
From: olzama
Date: Mon, 20 Mar 2023 15:02:40 +0100
Subject: [PATCH 1/3] A script to create a Spanish MRS test suite with correct
 ids

---
 util/tsdb-updates/create_Spa_MRS_testsuite.py | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 util/tsdb-updates/create_Spa_MRS_testsuite.py

diff --git a/util/tsdb-updates/create_Spa_MRS_testsuite.py b/util/tsdb-updates/create_Spa_MRS_testsuite.py
new file mode 100644
index 0000000..4927af5
--- /dev/null
+++ b/util/tsdb-updates/create_Spa_MRS_testsuite.py
@@ -0,0 +1,27 @@
+'''
+Assuming a [incr tsdb()] profile created with pydelphin from a text file,
+and a second text file mapping the same sentences to ID numbers,
+update the test suite to use those IDs instead of the automatically created ones.
+'''
+import sys
+from delphin import itsdb
+
+def get_id_mappings(file_ids):
+    sentence2id = {}
+    with open(file_ids, 'r') as f:
+        sentences_ids = [ln for ln in f.read().splitlines() if ln]
+        for s_id in sentences_ids:
+            i_id, s = s_id.split('\t')  # tab-separated "id<TAB>sentence" lines
+            sentence2id[s] = int(i_id)
+    return sentence2id
+def update_ids(ts_path, id_mappings):
+    ts = itsdb.TestSuite(ts_path)
+    for i, row in enumerate(ts['item']):
+        new_id = id_mappings[row['i-input']]
+        ts['item'].update(i, {'i-id': new_id})
+    ts.commit()  # the test suite is a database; without a commit, the updates will not persist on disk
+
+
+if __name__ == "__main__":
+    id_mappings = get_id_mappings(sys.argv[2])
+    update_ids(sys.argv[1], id_mappings)
\ No newline at end of file

From fd95306d50024406bf78d37b7fad64eefff42aa1 Mon Sep 17 00:00:00 2001
From: olzama
Date: Mon, 20 Mar 2023 15:43:22 +0100
Subject: [PATCH 2/3] re-added an inflectional rule for Freeling tag NP00000;
 corrected the stem in the rule to have 5 zeroes (instead of 4 zeroes and one
 o in the middle). This is a named-entity tag, so a generic lexical entry is
 needed for it as well. The tag NP00000 no longer needs to be overridden.

---
 generics.tdl              | 8 ++++++++
 inflr.tdl                 | 6 +++---
 util/override_freeling.py | 2 +-
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/generics.tdl b/generics.tdl
index c333682..434c5da 100644
--- a/generics.tdl
+++ b/generics.tdl
@@ -41,6 +41,14 @@ np00g00_ge := n_-_pn_le &
   Generic lexical entry that will be triggered by tag np00g00.
   """.
 
+np00000_ge := n_-_pn_le &
+  [ STEM < "n_-_pn_le" >,
+    TOKENS.+LIST generic_token_list & < [ +POS.+TAGS < "np00000" >, +FORM #form ] >,
+    SYNSEM.LKEYS.KEYREL [ PRED named_rel, CARG #form ] ]
+  """
+  Generic lexical entry that will be triggered by tag np00000.
+  """.
+
 ; ncms000, ncmp000, ncfs000, ncfp000, ncms00a, ncmp00a, ncfs00a, ncfp00a,
 ; ncms00d, ncmp00d, ncfs00d, ncfp00d, ncms00x, ncmp00x, ncfs00x, ncfp00x,
 ; nccs000, nccp000, ncmn000, ncfn000, nccn000

diff --git a/inflr.tdl b/inflr.tdl
index ec81914..daf79a0 100644
--- a/inflr.tdl
+++ b/inflr.tdl
@@ -56,9 +56,9 @@ np00g00 := np00sp0_ilr.
 
 ; OZ: Not sure what this rule is for. Right now it occurs where NCFS000 should.
 
-;np00o00 :=
-;%suffix (np00o00 np00o00)
-;np00sp0_ilr.
+np00000 :=
+%suffix (np00000 np00000)
+np00sp0_ilr.
 
 np00v00 :=
 %suffix (np00v00 np00v00)

diff --git a/util/override_freeling.py b/util/override_freeling.py
index af3e417..e72f83b 100644
--- a/util/override_freeling.py
+++ b/util/override_freeling.py
@@ -1,6 +1,6 @@
 # Freeling tags to override and replace by other tags
 
-TAGS = {'I': 'AQ0MS00', 'NP00O00':'NCFS000'}
+TAGS = {'I': 'AQ0MS00'}
 
 # Sometimes Freeling gives a wrong tag in very simple cases, such as for the 3SG verb 'ladrar' it returns a noun tag
 # with the probability 78%.

From 6daebc5e43effb5401972380bb5572471dce590d Mon Sep 17 00:00:00 2001
From: olzama
Date: Wed, 5 Apr 2023 13:38:55 +0200
Subject: [PATCH 3/3] changes for release 0.3.0

---
 freeling/freeling_API/tokenize_and_tag.py | 17 ++++++++++-------
 util/override_freeling.py                 |  4 ----
 util/populate_tokens.py                   |  6 ------
 util/srg_freeling2yy.py                   | 23 +++++++++++++----------
 4 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/freeling/freeling_API/tokenize_and_tag.py b/freeling/freeling_API/tokenize_and_tag.py
index fee5128..4fc279a 100755
--- a/freeling/freeling_API/tokenize_and_tag.py
+++ b/freeling/freeling_API/tokenize_and_tag.py
@@ -38,17 +38,17 @@ def __init__(self):
         self.mf=pyfreeling.maco(self.op)
 
         # activate morpho modules to be used in the next call
-        self.mf.set_active_options(False, True, True, True,    # select which among created
-                                   True, True, False, True,    # submodules are to be used.
-                                   True, True, True, True )    # default: all created submodules are used
+        self.mf.set_active_options(umap=False, num=True, pun=True, dat=False,  # select which among created
+                                   dic=True, aff=True, comp=False, rtk=True,   # submodules are to be used.
+                                   mw=False, ner=True, qt=False, prb=True )    # default: all created submodules are used
 
-        self.tg=pyfreeling.hmm_tagger(self.DATA+self.LANG+"/tagger.dat",True,2)
+        self.tg=pyfreeling.hmm_tagger(self.DATA+self.LANG+"/tagger.dat",True,1)
+        #self.tg = pyfreeling.relax_tagger(self.DATA+self.LANG+"/constr_gram-B.dat",500,670.0,0.001,True,1)
 
     def tokenize_and_tag(self, sentence_list):
         output = []
         sid=self.sp.open_session()
         # process input text
-        #lin = "El perro del pueblo duerme."
         for i,lin in enumerate(sentence_list):
             if not lin[-1] in string.punctuation:
                 # assume a dot at the end
@@ -63,11 +63,14 @@ def tokenize_and_tag(self, sentence_list):
             ws = s.get_words()
             for j,w in enumerate(ws) :
                 output[i]['tokens'].append({'lemma':w.get_lemma(), 'form': w.get_form(),
-                                            'start':w.get_span_start(), 'end': w.get_span_finish(), 'tags': []})
+                                            'start':w.get_span_start(), 'end': w.get_span_finish(),
+                                            'selected-tag': w.get_tag(), 'all-tags': []})
                 analyses = list(w.get_analysis())
                 for a in analyses:
                     #print("\ttag: {}, prob: {}".format(a.get_tag(), a.get_prob()))
-                    output[i]['tokens'][j]['tags'].append({'tag': a.get_tag(), 'prob': a.get_prob()})
+                    output[i]['tokens'][j]['all-tags'].append({'tag': a.get_tag(), 'prob': a.get_prob()})
+                    if a.get_tag() == output[i]['tokens'][j]['selected-tag']:
+                        output[i]['tokens'][j]['selected-prob'] = a.get_prob()
         # clean up
         self.sp.close_session(sid)
         return output
diff --git a/util/override_freeling.py b/util/override_freeling.py
index e72f83b..3d5e659 100644
--- a/util/override_freeling.py
+++ b/util/override_freeling.py
@@ -2,10 +2,6 @@
 # Freeling tags to override and replace by other tags
 TAGS = {'I': 'AQ0MS00'}
 
-# Sometimes Freeling gives a wrong tag in very simple cases, such as for the 3SG verb 'ladrar' it returns a noun tag
-# with the probability 78%.
-#LEMMA_TAG_PAIRS = {'NCFS000' : {'ladra': {'prob': 0.80, 'replace': 'VMIP3S0'}}}
-
 REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}, 'dió': {'lemma': 'dar', 'tag': 'VMIS3S0'}}
 DO_NOT_OVERRIDE = {'uf', 'je', 'ja'}
 
diff --git a/util/populate_tokens.py b/util/populate_tokens.py
index 9800c46..80c7915 100644
--- a/util/populate_tokens.py
+++ b/util/populate_tokens.py
@@ -9,10 +9,6 @@
 # I cannot figure out how to use the pyfreeling library:
 from freeling.freeling_API.tokenize_and_tag import Freeling_tok_tagger
 
-# REMOVE = {'The tobacco garden dog barked.', 'Abrams wiped the table clean.',
-#           'Abrams left it to Browne to bark.', 'How happy was Abrams?'}
-
-
 def read_testsuite(ts):
     items = ts['item']
     # Strip the trailing hyphens to match old LKB output, although may want to put them back in later.
@@ -79,8 +75,6 @@ def update_testsuite(ts):
     print('{} items in the corpus'.format(len(yy)))
     for i, row in enumerate(ts['item']):
         ts['item'].update(i, {'i-tokens': yy[i]})
-        if ts['item'][i]['i-id'] == 10:
-            ts['item'].update(i, {'i-id':101})
     ts.commit()
 
 def freeling2json(s):
diff --git a/util/srg_freeling2yy.py b/util/srg_freeling2yy.py
index 4c23e90..053c9ab 100755
--- a/util/srg_freeling2yy.py
+++ b/util/srg_freeling2yy.py
@@ -13,13 +13,17 @@
 For compatibility, we will do the same for now.
 i -> AQ0MS0 (interjection to a default adjective form; will then undergo an adjective-to-interjection rule...)
 '''
-def override_tag(tag, word):
-    if tag in TAGS:
+def override_tag(selected, all_tags, word):
+    if selected in TAGS:
         if word not in DO_NOT_OVERRIDE:
-            return TAGS[tag]
+            return {'tag': TAGS[selected], 'prob': -1}
     elif word in REPLACE_LEMMA_AND_TAG:
-        return REPLACE_LEMMA_AND_TAG[word]['tag']
-    return tag
+        return {'tag': REPLACE_LEMMA_AND_TAG[word]['tag'], 'prob': -1}
+    # Fall through (including for DO_NOT_OVERRIDE words): keep Freeling's own choice.
+    for t in all_tags:
+        if t['tag'] == selected:
+            return t
+    raise Exception("selected tag not in tag list")
 
 def override_lemma(lemma, tag):
     if tag in STEM_EQUALS_TAG:
@@ -94,12 +98,11 @@ def convert_sentences(sentences):
         _num = 0 # lattice ID
         _from = 0 # lattice from
         for j,tok in enumerate(sent['tokens']):
-            best_tag = max(tok['tags'], key=lambda x: x['prob']) # get the highest prob
-            tag = best_tag['tag']
-            conf = best_tag['prob']
             surface = tok['form']
-            lemma = override_lemma(tok['lemma'], tag)
-            pos = override_tag(tag, tok['lemma'])
+            best = override_tag(tok['selected-tag'], tok['all-tags'], tok['lemma'])
+            pos = best['tag']
+            conf = best['prob']
+            lemma = override_lemma(tok['lemma'], pos)
             _num += 1
             output += '('
             output += str(_num)
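
---

Editor's notes on the patches above (not part of the diffs):

The script added in PATCH 1/3 takes the [incr tsdb()] profile path as its
first argument and the ID-mapping file as its second; the mapping file is
read as one tab-separated "id<TAB>sentence" pair per line. A minimal usage
sketch in Python, with invented file and profile names for illustration only:

    # Invented paths; substitute a real profile and mapping file.
    from create_Spa_MRS_testsuite import get_id_mappings, update_ids

    id_mappings = get_id_mappings('sentence-ids.txt')  # lines like "101\tEl perro del pueblo duerme."
    update_ids('tsdb/mrs-testsuite', id_mappings)      # rewrites i-id in the profile's item table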
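PATCH 3/3 changes the token JSON produced by tokenize_and_tag.py: the flat
'tags' list becomes 'all-tags', and the tagger's own choice is recorded as
'selected-tag' (with its probability in 'selected-prob'), which
convert_sentences in srg_freeling2yy.py now routes through override_tag
instead of picking the highest-probability analysis itself. A minimal sketch
of the new token shape and the lookup, with invented values:

    # One token as produced by Freeling_tok_tagger.tokenize_and_tag (invented values):
    tok = {'lemma': 'dormir', 'form': 'duerme', 'start': 20, 'end': 26,
           'selected-tag': 'VMIP3S0', 'selected-prob': 0.97,
           'all-tags': [{'tag': 'VMIP3S0', 'prob': 0.97},
                        {'tag': 'NCMS000', 'prob': 0.03}]}
    best = override_tag(tok['selected-tag'], tok['all-tags'], tok['lemma'])
    # -> {'tag': 'VMIP3S0', 'prob': 0.97}; tags overridden via TAGS or
    #    REPLACE_LEMMA_AND_TAG come back with 'prob': -1 as a sentinel.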