Skip to content

Commit

Permalink
Merge branch 'olzama-dev' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
olzama committed Mar 16, 2023
2 parents b72cd0c + 3d5dfb1 commit 3f11426
Show file tree
Hide file tree
Showing 13 changed files with 5,990 additions and 132 deletions.
Empty file.
Binary file added freeling/freeling_API/_pyfreeling.so
Binary file not shown.
5,706 changes: 5,706 additions & 0 deletions freeling/freeling_API/pyfreeling.py

Large diffs are not rendered by default.

79 changes: 79 additions & 0 deletions freeling/freeling_API/sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#! /usr/bin/python3

import pyfreeling
import sys, os

## ----------------------------------------------
## ------------- MAIN PROGRAM ---------------
## ----------------------------------------------

# Sample driver for the FreeLing pipeline: tokenize, sentence-split,
# morphologically analyze, and PoS-tag one hard-coded Spanish sentence,
# then print the resulting analyses.

os.environ["FREELINGDIR"] = '/usr'

if not os.path.exists(os.environ["FREELINGDIR"] + "/share/freeling"):
    print("Folder", os.environ["FREELINGDIR"]+"/share/freeling",
          "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
          file=sys.stderr)
    sys.exit(1)


# Location of the FreeLing configuration files.
DATA = os.environ["FREELINGDIR"] + "/share/freeling/"

# Initialize locales.
pyfreeling.util_init_locale("default")

# Language detector. Created for demonstration only; the pipeline below
# always assumes the language is LANG regardless of what it would report.
la = pyfreeling.lang_ident(DATA + "common/lang_ident/ident-few.dat")

# Options for the maco morphological analyzer; the defaults are fine
# except for the data-file locations.
LANG = "es"
op = pyfreeling.maco_options(LANG)
op.set_data_files(
    "",
    DATA + "common/punct.dat",
    DATA + LANG + "/dicc.src",
    DATA + LANG + "/afixos.dat",
    "",
    DATA + LANG + "/locucions.dat",
    DATA + LANG + "/np.dat",
    DATA + LANG + "/quantities.dat",
    DATA + LANG + "/probabilitats.dat",
)

# Create the analyzers: tokenizer, sentence splitter, morphological analyzer.
tk = pyfreeling.tokenizer(DATA + LANG + "/tokenizer.dat")
sp = pyfreeling.splitter(DATA + LANG + "/splitter.dat")
mf = pyfreeling.maco(op)

# Select which morphological submodules the analyze() call below will use
# (by default every created submodule would be used).
mf.set_active_options(
    False, True, True, True,
    True, True, False, True,
    True, True, True, True,
)

# HMM-based PoS tagger.
tg = pyfreeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)

# Process a hard-coded input sentence within a splitter session.
sid = sp.open_session()
text = "El perro del pueblo duerme."

tokens = tk.tokenize(text)
sentences = sp.split(sid, tokens, False)
sentences = mf.analyze(sentences)
sentences = tg.analyze(sentences)

# For every word: print its form, lemma, character span, and each
# candidate tag with its probability.
for sentence in sentences:
    print(sentence)
    for word in sentence.get_words():
        print("FORM: {} LEMMA: {} START: {} END: {}".format(
            word.get_form(), word.get_lemma(),
            word.get_span_start(), word.get_span_finish()))
        for analysis in list(word.get_analysis()):
            print("\ttag: {}, prob: {}".format(analysis.get_tag(), analysis.get_prob()))

# Release the splitter session.
sp.close_session(sid)

74 changes: 74 additions & 0 deletions freeling/freeling_API/tokenize_and_tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#! /usr/bin/python3

from freeling.freeling_API import pyfreeling
import sys, os, string

class Freeling_tok_tagger:
    """Thin wrapper around the FreeLing Spanish pipeline.

    Builds the FreeLing tokenizer, sentence splitter, morphological
    analyzer, and HMM PoS tagger once at construction time, and exposes
    tokenize_and_tag() to run raw sentences through the full pipeline.
    Exits the process if the FreeLing data directory is not found.
    """

    def __init__(self):
        # FreeLing installation root; data files are expected under
        # $FREELINGDIR/share/freeling.
        os.environ["FREELINGDIR"] = '/usr'
        if not os.path.exists(os.environ["FREELINGDIR"]+"/share/freeling") :
            print("Folder",os.environ["FREELINGDIR"]+"/share/freeling",
                  "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
                  file=sys.stderr)
            sys.exit(1)

        # Location of FreeLing configuration files.
        self.DATA = os.environ["FREELINGDIR"]+"/share/freeling/"
        # Init locales.
        pyfreeling.util_init_locale("default")
        # Language detector. Created for completeness only; the pipeline
        # below always assumes the language is self.LANG.
        self.la = pyfreeling.lang_ident(self.DATA+"common/lang_ident/ident-few.dat")
        # Options for the maco morphological analyzer. Defaults are fine
        # except for the data-file locations.
        self.LANG = "es"
        self.op = pyfreeling.maco_options(self.LANG)
        self.op.set_data_files("",
                               self.DATA + "common/punct.dat",
                               self.DATA + self.LANG + "/dicc.src",
                               self.DATA + self.LANG + "/afixos.dat",
                               "",
                               self.DATA + self.LANG + "/locucions.dat",
                               self.DATA + self.LANG + "/np.dat",
                               self.DATA + self.LANG + "/quantities.dat",
                               self.DATA + self.LANG + "/probabilitats.dat")

        # Create the analyzers.
        self.tk = pyfreeling.tokenizer(self.DATA+self.LANG+"/tokenizer.dat")
        self.sp = pyfreeling.splitter(self.DATA+self.LANG+"/splitter.dat")
        self.mf = pyfreeling.maco(self.op)

        # Select which morphological submodules analyze() will use
        # (by default every created submodule would be used).
        self.mf.set_active_options(False, True, True, True,
                                   True, True, False, True,
                                   True, True, True, True)

        self.tg = pyfreeling.hmm_tagger(self.DATA+self.LANG+"/tagger.dat", True, 2)

    def tokenize_and_tag(self, sentence_list):
        """Tokenize and PoS-tag each sentence in *sentence_list*.

        Parameters
        ----------
        sentence_list : iterable of str
            Raw sentences, one per element. A sentence missing final
            punctuation gets a dot appended before analysis.

        Returns
        -------
        list of dict
            One dict per input sentence, of the form
            ``{'sentence': <text>, 'tokens': [{'lemma', 'form', 'start',
            'end', 'tags': [{'tag', 'prob'}, ...]}, ...]}``.

        Raises
        ------
        ValueError
            If the splitter does not yield exactly one sentence for an
            input element.
        """
        output = []
        sid = self.sp.open_session()
        try:
            for i, lin in enumerate(sentence_list):
                # The splitter needs sentence-final punctuation; assume a
                # dot when it is missing. The `not lin` guard fixes an
                # IndexError on empty input strings.
                if not lin or lin[-1] not in string.punctuation:
                    lin = lin + '.'
                output.append({'sentence': lin, 'tokens': []})
                s = self.tk.tokenize(lin)
                s = self.sp.split(sid, s, False)
                s = self.mf.analyze(s)
                s = self.tg.analyze(s)
                # Each input element must be a single sentence. An explicit
                # raise (not assert) so the check survives `python -O`.
                if len(s) != 1:
                    raise ValueError(
                        'expected exactly 1 sentence, got {}'.format(len(s)))
                s = s[0]
                ws = s.get_words()
                for j, w in enumerate(ws):
                    output[i]['tokens'].append(
                        {'lemma': w.get_lemma(), 'form': w.get_form(),
                         'start': w.get_span_start(), 'end': w.get_span_finish(),
                         'tags': []})
                    for a in list(w.get_analysis()):
                        output[i]['tokens'][j]['tags'].append(
                            {'tag': a.get_tag(), 'prob': a.get_prob()})
        finally:
            # Always release the splitter session, even if an analyzer
            # raises mid-loop (fixes a session leak).
            self.sp.close_session(sid)
        return output

5 changes: 5 additions & 0 deletions letypes.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -12370,6 +12370,11 @@ n_-_pn_native_le := n_-_pn_lex & native_le
This is a native lexical entry type, for words that are in the lexicon.
""".

foreign_le := n_-_pn_lex
"""
Assume for now that it is useful to treat foreign words/fragments as named entities.
""".

n_-_pn_le := n_-_pn_lex.

; <type val="n_-_pr-pers-n_le">
Expand Down
4 changes: 2 additions & 2 deletions srtypes.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,7 @@ basic-head-adj-phrase := basic-head-mod-phrase-simple & phrasal &
head-adj-phrase := basic-head-adj-phrase & head-initial.
non_str-head-adj-phrase := basic-head-adj-phrase & non_str-head-initial.

; We split head-adj-phrase and adj-head-phrase into two each, bone for intersective
; We split head-adj-phrase and adj-head-phrase into two each, one for intersective
; modifiers and one for scopal modifiers, in order to get desired results for recursive
; modification as in "apparently difficult problem" (cf. Kasper '98). This split is also
; used in generation, where we delay construction of intersective modification, but not scopal.
Expand Down Expand Up @@ -5117,4 +5117,4 @@ r_p_crd-mono-mid_constr := r_p_crd-mono-mid_phrase & binary-rule-right-to-left.
p_r_crd-mono-top_constr := p_r_crd-mono-top_phrase & binary-rule-right-to-left.
p_r_crd-mono-mid_constr := p_r_crd-mono-mid_phrase & binary-rule-right-to-left.
a_r_crd-mono-top_constr := a_r_crd-mono-top_phrase & binary-rule-right-to-left.
a_r_crd-mono-mid_constr := a_r_crd-mono-mid_phrase & binary-rule-right-to-left.
a_r_crd-mono-mid_constr := a_r_crd-mono-mid_phrase & binary-rule-right-to-left.
107 changes: 0 additions & 107 deletions tsdb/mrs.txt

This file was deleted.

2 changes: 1 addition & 1 deletion util/override_freeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# with the probability 78%.
#LEMMA_TAG_PAIRS = {'NCFS000' : {'ladra': {'prob': 0.80, 'replace': 'VMIP3S0'}}}

REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}}
REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}, 'dió': {'lemma': 'dar', 'tag': 'VMIS3S0'}}

DO_NOT_OVERRIDE = {'uf', 'je', 'ja'}

Expand Down
Loading

0 comments on commit 3f11426

Please sign in to comment.