Merge branch 'olzama-dev' into main

delph-in · Mar 16, 2023 · 3f11426 · 3f11426
2 parents b72cd0c + 3d5dfb1
commit 3f11426
Show file tree

Hide file tree

Showing 13 changed files with 5,990 additions and 132 deletions.
diff --git a/freeling/freeling_API/__init__.py b/freeling/freeling_API/__init__.py
diff --git a/freeling/freeling_API/_pyfreeling.so b/freeling/freeling_API/_pyfreeling.so
diff --git a/freeling/freeling_API/pyfreeling.py b/freeling/freeling_API/pyfreeling.py
diff --git a/freeling/freeling_API/sample.py b/freeling/freeling_API/sample.py
@@ -0,0 +1,79 @@
+#! /usr/bin/python3
+
+import pyfreeling
+import sys, os
+
+## ----------------------------------------------
+## -------------    MAIN PROGRAM  ---------------
+## ----------------------------------------------
+
+# Keep a FREELINGDIR that is already set in the environment; default to /usr otherwise.
+os.environ.setdefault("FREELINGDIR", '/usr')
+
+if not os.path.exists(os.environ["FREELINGDIR"]+"/share/freeling") :
+    print("Folder",os.environ["FREELINGDIR"]+"/share/freeling",
+          "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
+          file=sys.stderr)
+    sys.exit(1)
+
+# Location of FreeLing configuration files.
+DATA = os.environ["FREELINGDIR"]+"/share/freeling/"
+# Init locales
+pyfreeling.util_init_locale("default")
+# create language identifier. It is only instantiated here: nothing below
+# queries it, and the input language is assumed to be LANG.
+la=pyfreeling.lang_ident(DATA+"common/lang_ident/ident-few.dat")
+# create options set for maco analyzer. Default values are Ok, except for data files.
+LANG="es"
+op= pyfreeling.maco_options(LANG)
+op.set_data_files( "",
+                   DATA + "common/punct.dat",
+                   DATA + LANG + "/dicc.src",
+                   DATA + LANG + "/afixos.dat",
+                   "",
+                   DATA + LANG + "/locucions.dat",
+                   DATA + LANG + "/np.dat",
+                   DATA + LANG + "/quantities.dat",
+                   DATA + LANG + "/probabilitats.dat")
+
+# create analyzers
+tk=pyfreeling.tokenizer(DATA+LANG+"/tokenizer.dat")
+sp=pyfreeling.splitter(DATA+LANG+"/splitter.dat")
+mf=pyfreeling.maco(op)
+
+# activate morpho modules to be used in next call
+mf.set_active_options(False, True, True, True,  # select which among created
+                      True, True, False, True,  # submodules are to be used.
+                      True, True, True, True )  # default: all created submodules are used
+
+# create tagger (the sense annotator and parsers below stay commented out)
+tg=pyfreeling.hmm_tagger(DATA+LANG+"/tagger.dat",True,2)
+#sen=pyfreeling.senses(DATA+LANG+"/senses.dat");
+#parser= pyfreeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat");
+#dep=pyfreeling.dep_txala(DATA+LANG+"/dep_txala/dependences.dat", parser.get_start_symbol());
+
+sid=sp.open_session()
+try:
+    # process input text
+    #lin=sys.stdin.readline();
+    lin = "El perro del pueblo duerme."
+
+    tokens = tk.tokenize(lin)
+    ls = sp.split(sid,tokens,False)
+    ls = mf.analyze(ls)
+    ls = tg.analyze(ls)
+
+    ## output results
+    for s in ls :
+        print(s)
+        ws = s.get_words()
+        for w in ws :
+            print("FORM: {} LEMMA: {} START: {} END: {}".format(w.get_form(), w.get_lemma(),
+                                                                w.get_span_start(), w.get_span_finish()))
+            analyses = list(w.get_analysis())
+            for a_i in analyses:
+                print("\ttag: {}, prob: {}".format(a_i.get_tag(), a_i.get_prob()))
+finally:
+    # clean up: close the splitter session even if the analysis above fails
+    sp.close_session(sid)
+
diff --git a/freeling/freeling_API/tokenize_and_tag.py b/freeling/freeling_API/tokenize_and_tag.py
@@ -0,0 +1,98 @@
+#! /usr/bin/python3
+
+from freeling.freeling_API import pyfreeling
+import sys, os, string
+
+class Freeling_tok_tagger:
+    """FreeLing tokenizer, sentence splitter, morphological analyzer and HMM tagger.
+
+    Every module is created once in the constructor, from the data files of
+    language code "es", and reused by each tokenize_and_tag() call.
+    """
+
+    def __init__(self):
+        """Load the FreeLing data files and create the analyzers.
+
+        FREELINGDIR is taken from the environment when it is already set and
+        defaults to '/usr' otherwise. If <FREELINGDIR>/share/freeling does not
+        exist, a message is printed to stderr and the process exits with status 1.
+        """
+        os.environ.setdefault("FREELINGDIR", '/usr')
+        if not os.path.exists(os.environ["FREELINGDIR"]+"/share/freeling") :
+            print("Folder",os.environ["FREELINGDIR"]+"/share/freeling",
+                  "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
+                  file=sys.stderr)
+            sys.exit(1)
+
+        # Location of FreeLing configuration files.
+        self.DATA = os.environ["FREELINGDIR"]+"/share/freeling/"
+        # Init locales
+        pyfreeling.util_init_locale("default")
+        # create language identifier. It is only instantiated here: no method of
+        # this class queries it, and the input language is assumed to be LANG.
+        self.la=pyfreeling.lang_ident(self.DATA+"common/lang_ident/ident-few.dat")
+        # create options set for maco analyzer. Default values are Ok, except for data files.
+        self.LANG="es"
+        self.op= pyfreeling.maco_options(self.LANG)
+        self.op.set_data_files( "",
+                           self.DATA + "common/punct.dat",
+                           self.DATA + self.LANG + "/dicc.src",
+                           self.DATA + self.LANG + "/afixos.dat",
+                           "",
+                           self.DATA + self.LANG + "/locucions.dat",
+                           self.DATA + self.LANG + "/np.dat",
+                           self.DATA + self.LANG + "/quantities.dat",
+                           self.DATA + self.LANG + "/probabilitats.dat")
+
+        # create analyzers
+        self.tk=pyfreeling.tokenizer(self.DATA+self.LANG+"/tokenizer.dat")
+        self.sp=pyfreeling.splitter(self.DATA+self.LANG+"/splitter.dat")
+        self.mf=pyfreeling.maco(self.op)
+
+        # activate morpho modules to be used in next call
+        self.mf.set_active_options(False, True, True, True,  # select which among created
+                              True, True, False, True,  # submodules are to be used.
+                              True, True, True, True )  # default: all created submodules are used
+
+        self.tg=pyfreeling.hmm_tagger(self.DATA+self.LANG+"/tagger.dat",True,2)
+
+    def tokenize_and_tag(self, sentence_list):
+        """Tokenize, analyze and tag each string of sentence_list.
+
+        Each string must be split by FreeLing into exactly one sentence. A '.' is
+        appended first when the string is empty or its last character is not in
+        string.punctuation.
+
+        Returns a list with one dict per input string, in input order:
+        {'sentence': <string as analyzed>, 'tokens': [<token>, ...]}. Each token is
+        {'lemma', 'form', 'start', 'end', 'tags'}, where 'tags' lists one
+        {'tag', 'prob'} dict per analysis of the word.
+
+        Raises AssertionError when an input yields zero or several sentences.
+        """
+        output = []
+        sid=self.sp.open_session()
+        try:
+            for lin in sentence_list:
+                # '' has no last character to test, so it gets the dot as well
+                if not lin or lin[-1] not in string.punctuation:
+                    # assume a dot at the end
+                    lin = lin + '.'
+                tokens = []
+                output.append({'sentence': lin, 'tokens': tokens})
+                s = self.tk.tokenize(lin)
+                s = self.sp.split(sid,s,False)
+                s = self.mf.analyze(s)
+                s = self.tg.analyze(s)
+                assert len(s) == 1, "expected exactly 1 sentence, got {}: {!r}".format(len(s), lin)
+                sent = s[0]
+                ws = sent.get_words()
+                for w in ws :
+                    tokens.append({'lemma':w.get_lemma(), 'form': w.get_form(),
+                                   'start':w.get_span_start(), 'end': w.get_span_finish(),
+                                   'tags': [{'tag': a.get_tag(), 'prob': a.get_prob()} for a in list(w.get_analysis())]})
+        finally:
+            # clean up: always release the splitter session, even on error
+            self.sp.close_session(sid)
+        return output
+
diff --git a/letypes.tdl b/letypes.tdl
@@ -12370,6 +12370,11 @@ n_-_pn_native_le := n_-_pn_lex & native_le
   This is a native lexical entry type, for words that are in the lexicon.
   """.
 
+foreign_le := n_-_pn_lex
+"""
+Assume for now that it is useful to treat foreign words/fragments as named entities.
+""".
+
 n_-_pn_le := n_-_pn_lex.
 
 ;  <type val="n_-_pr-pers-n_le">

diff --git a/srtypes.tdl b/srtypes.tdl
@@ -881,7 +881,7 @@ basic-head-adj-phrase := basic-head-mod-phrase-simple & phrasal &
 head-adj-phrase := basic-head-adj-phrase & head-initial.
 non_str-head-adj-phrase := basic-head-adj-phrase & non_str-head-initial.
 
-; We split head-adj-phrase and adj-head-phrase into two each, bone for intersective 
+; We split head-adj-phrase and adj-head-phrase into two each, one for intersective 
 ; modifiers and one for scopal modifiers, in order to get desired results for recursive 
 ; modification as in "apparently difficult problem" (cf. Kasper '98). This split is also 
 ; used in generation, where we delay construction of intersective modification, but not scopal.
@@ -5117,4 +5117,4 @@ r_p_crd-mono-mid_constr := r_p_crd-mono-mid_phrase & binary-rule-right-to-left.
 p_r_crd-mono-top_constr := p_r_crd-mono-top_phrase & binary-rule-right-to-left.
 p_r_crd-mono-mid_constr := p_r_crd-mono-mid_phrase & binary-rule-right-to-left.
 a_r_crd-mono-top_constr := a_r_crd-mono-top_phrase & binary-rule-right-to-left.
-a_r_crd-mono-mid_constr := a_r_crd-mono-mid_phrase & binary-rule-right-to-left.
+a_r_crd-mono-mid_constr := a_r_crd-mono-mid_phrase & binary-rule-right-to-left.
diff --git a/tsdb/mrs.txt b/tsdb/mrs.txt
diff --git a/util/override_freeling.py b/util/override_freeling.py
@@ -6,7 +6,7 @@
 # with the probability 78%.
 #LEMMA_TAG_PAIRS = {'NCFS000' : {'ladra': {'prob': 0.80, 'replace': 'VMIP3S0'}}}
 
-REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}}
+REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}, 'dió': {'lemma': 'dar', 'tag': 'VMIS3S0'}}
 
 DO_NOT_OVERRIDE = {'uf', 'je', 'ja'}