Skip to content

Commit

Permalink
Merge pull request #33 from delph-in/olzama-dev
Browse files Browse the repository at this point in the history
Changes for release 0.2.1
  • Loading branch information
olzama authored Apr 5, 2023
2 parents 3547db0 + 6daebc5 commit f8258f3
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 31 deletions.
17 changes: 10 additions & 7 deletions freeling/freeling_API/tokenize_and_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,17 @@ def __init__(self):
self.mf=pyfreeling.maco(self.op)

        # activate morpho modules to be used in the next call
self.mf.set_active_options(False, True, True, True, # select which among created
True, True, False, True, # submodules are to be used.
True, True, True, True ) # default: all created submodules are used
self.mf.set_active_options(umap=False, num=True, pun=True, dat=False, # select which among created
dic=True, aff=True, comp=False, rtk=True, # submodules are to be used.
mw=False, ner=True, qt=False, prb=True ) # default: all created submodules are used

self.tg=pyfreeling.hmm_tagger(self.DATA+self.LANG+"/tagger.dat",True,2)
self.tg=pyfreeling.hmm_tagger(self.DATA+self.LANG+"/tagger.dat",True,1)
#self.tg = pyfreeling.relax_tagger(self.DATA+self.LANG+"/constr_gram-B.dat",500,670.0,0.001,True,1)

def tokenize_and_tag(self, sentence_list):
output = []
sid=self.sp.open_session()
# process input text
#lin = "El perro del pueblo duerme."
for i,lin in enumerate(sentence_list):
if not lin[-1] in string.punctuation:
# assume a dot at the end
Expand All @@ -63,11 +63,14 @@ def tokenize_and_tag(self, sentence_list):
ws = s.get_words()
for j,w in enumerate(ws) :
output[i]['tokens'].append({'lemma':w.get_lemma(), 'form': w.get_form(),
'start':w.get_span_start(), 'end': w.get_span_finish(), 'tags': []})
'start':w.get_span_start(), 'end': w.get_span_finish(),
'selected-tag': w.get_tag(), 'all-tags': []})
analyses = list(w.get_analysis())
for a in analyses:
#print("\ttag: {}, prob: {}".format(a_i.get_tag(), a_i.get_prob()))
output[i]['tokens'][j]['tags'].append({'tag': a.get_tag(), 'prob': a.get_prob()})
output[i]['tokens'][j]['all-tags'].append({'tag': a.get_tag(), 'prob': a.get_prob()})
if a.get_tag() == output[i]['tokens'][j]['selected-tag']:
output[i]['tokens'][j]['selected-prob'] = a.get_prob()
# clean up
self.sp.close_session(sid)
return output
Expand Down
8 changes: 8 additions & 0 deletions generics.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ np00g00_ge := n_-_pn_le &
Generic lexical entry that will be triggered by tag np00g00.
""".

np00000_ge := n_-_pn_le &
[ STEM < "n_-_pn_le" >,
TOKENS.+LIST generic_token_list & < [ +POS.+TAGS < "np00000" >, +FORM #form ] >,
SYNSEM.LKEYS.KEYREL [ PRED named_rel, CARG #form ] ]
"""
Generic lexical entry that will be triggered by tag np00000.
""".


; ncms000, ncmp000, ncfs000, ncfp000, ncms00a, ncmp00a, ncfs00a, ncfp00a, ncms00d, ncmp00d, ncfs00d, ncfp00d, ncms00x, ncmp00x, ncfs00x, ncfp00x, nccs000, nccp000, ncmn000, ncfn000, nccn000

Expand Down
6 changes: 3 additions & 3 deletions inflr.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ np00g00 :=
np00sp0_ilr.

; OZ: Not sure what this rule is for. Right now it occurs where NCFS000 should.
;np00o00 :=
;%suffix (np00o00 np00o00)
;np00sp0_ilr.
np00000 :=
%suffix (np00000 np00000)
np00sp0_ilr.

np00v00 :=
%suffix (np00v00 np00v00)
Expand Down
6 changes: 1 addition & 5 deletions util/override_freeling.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@

# Freeling tags to override and replace by other tags
TAGS = {'I': 'AQ0MS00', 'NP00O00':'NCFS000'}

# Sometimes Freeling gives a wrong tag in very simple cases, such as for the 3SG verb 'ladrar' it returns a noun tag
# with the probability 78%.
#LEMMA_TAG_PAIRS = {'NCFS000' : {'ladra': {'prob': 0.80, 'replace': 'VMIP3S0'}}}
TAGS = {'I': 'AQ0MS00'}

REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}, 'dió': {'lemma': 'dar', 'tag': 'VMIS3S0'}}

Expand Down
6 changes: 0 additions & 6 deletions util/populate_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@
# I cannot figure out how to use the pyfreeling library:
from freeling.freeling_API.tokenize_and_tag import Freeling_tok_tagger

# REMOVE = {'The tobacco garden dog barked.', 'Abrams wiped the table clean.',
# 'Abrams left it to Browne to bark.', 'How happy was Abrams?'}


def read_testsuite(ts):
items = ts['item']
# Strip the trailing hyphens to match old LKB output, although may want to put them back in later.
Expand Down Expand Up @@ -79,8 +75,6 @@ def update_testsuite(ts):
print('{} items in the corpus'.format(len(yy)))
for i, row in enumerate(ts['item']):
ts['item'].update(i, {'i-tokens': yy[i]})
if ts['item'][i]['i-id'] == 10:
ts['item'].update(i, {'i-id':101})
ts.commit()

def freeling2json(s):
Expand Down
23 changes: 13 additions & 10 deletions util/srg_freeling2yy.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,17 @@
For compatibility, we will do the same for now.
i -> AQ0MS0 (interjection to a default adjective form; will then undergo an adjective-to-interjection rule...)
'''
def override_tag(selected, all, word):
    """Resolve the final tag/probability dict for a token.

    selected: the tag the tagger chose for the token.
    all: list of {'tag': ..., 'prob': ...} dicts, one per analysis of the
         token.  (Parameter name shadows the builtin ``all``; kept as-is
         because callers pass it positionally and renaming would break any
         keyword call sites.)
    word: the token's lemma, used to key the override tables.

    Returns a {'tag': ..., 'prob': ...} dict.  Overridden tags carry
    prob -1, since no real probability is available for them.
    Raises LookupError if the selected tag is not among the analyses.
    """
    if selected in TAGS and word not in DO_NOT_OVERRIDE:
        # Hard override of a known-bad Freeling tag (e.g. I -> AQ0MS00).
        return {'tag': TAGS[selected], 'prob': -1}
    if selected not in TAGS and word in REPLACE_LEMMA_AND_TAG:
        # Word-specific replacement (e.g. 'ladra' -> VMIP3S0).
        return {'tag': REPLACE_LEMMA_AND_TAG[word]['tag'], 'prob': -1}
    # Default: return the analysis matching the selected tag.  This branch
    # now also covers the DO_NOT_OVERRIDE case, which previously fell
    # through the nested `if` and implicitly returned None, crashing the
    # caller when it indexed the result.
    for analysis in all:
        if analysis['tag'] == selected:
            return analysis
    raise LookupError("selected tag {!r} not in tag list".format(selected))

def override_lemma(lemma, tag):
if tag in STEM_EQUALS_TAG:
Expand Down Expand Up @@ -94,12 +98,11 @@ def convert_sentences(sentences):
_num = 0 # lattice ID
_from = 0 # lattice from
for j,tok in enumerate(sent['tokens']):
best_tag = max(tok['tags'], key=lambda x: x['prob']) # get the highest prob
tag = best_tag['tag']
conf = best_tag['prob']
surface = tok['form']
lemma = override_lemma(tok['lemma'], tag)
pos = override_tag(tag, tok['lemma'])
best = override_tag(tok['selected-tag'],tok['all-tags'], tok['lemma'])
pos = best['tag']
conf = best['prob']
lemma = override_lemma(tok['lemma'], pos)
_num += 1
output += '('
output += str(_num)
Expand Down
27 changes: 27 additions & 0 deletions util/tsdb-updates/create_Spa_MRS_testsuite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
'''
Assuming a [incr tsdb()] profile created with pydelphin from a text file
and a textfile mapping the same sentences from the text file to ID numbers,
update the test suite to use those id numbers instead of the automatically created ones.
'''
import sys
from delphin import itsdb

def get_id_mappings(file_ids):
    """Read a tab-separated ``id<TAB>sentence`` file and return a dict
    mapping each sentence to its integer id.

    file_ids: path to a text file with one ``id<TAB>sentence`` pair per line.

    Blank lines are skipped.  The line is split on the *first* tab only, so
    a sentence that itself contains a tab character is preserved intact.
    Raises ValueError if an id field is not an integer.
    """
    sentence2id = {}
    with open(file_ids, 'r') as f:
        for line in f.read().splitlines():
            if not line:
                continue  # skip blank lines
            # maxsplit=1: only the first tab separates id from sentence
            sent_id, sentence = line.split('\t', 1)
            sentence2id[sentence] = int(sent_id)
    return sentence2id
def update_ids(ts_path, id_mappings):
    """Rewrite each item's i-id in a [incr tsdb()] profile.

    ts_path: path to the [incr tsdb()] profile directory.
    id_mappings: dict mapping an item's i-input sentence to the desired
        integer i-id (as produced by get_id_mappings).

    Raises KeyError if a sentence in the profile is absent from the mapping.
    """
    ts = itsdb.TestSuite(ts_path)
    for i, row in enumerate(ts['item']):
        # `row` is already the item record; no need to re-index ts['item'][i].
        ts['item'].update(i, {'i-id': id_mappings[row['i-input']]})
    # ts is a database which needs to be committed to disk,
    # otherwise the updates will not persist.
    ts.commit()


if __name__ == "__main__":
    # argv[1]: path to the [incr tsdb()] profile whose i-ids will be rewritten
    # argv[2]: path to the tab-separated id<TAB>sentence mapping file
    id_mappings = get_id_mappings(sys.argv[2])
    update_ids(sys.argv[1], id_mappings)

0 comments on commit f8258f3

Please sign in to comment.