-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
5,990 additions
and
132 deletions.
There are no files selected for viewing
Empty file.
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#! /usr/bin/python3 | ||
|
||
import pyfreeling | ||
import sys, os | ||
|
||
## ---------------------------------------------- | ||
## ------------- MAIN PROGRAM --------------- | ||
## ---------------------------------------------- | ||
|
||
# Demo script: tokenize, split, morphologically analyze and PoS-tag one
# hard-coded Spanish sentence with FreeLing, then print every word together
# with its candidate analyses.

# Honour an already-exported FREELINGDIR and fall back to /usr only when it is
# unset; overwriting it unconditionally would make the advice printed below
# ("Please set FREELINGDIR ...") impossible to follow.
os.environ.setdefault("FREELINGDIR", "/usr")

if not os.path.exists(os.environ["FREELINGDIR"]+"/share/freeling") :
    print("Folder",os.environ["FREELINGDIR"]+"/share/freeling",
          "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
          file=sys.stderr)
    sys.exit(1)

# Location of FreeLing configuration files.
DATA = os.environ["FREELINGDIR"]+"/share/freeling/"

# Init locales
pyfreeling.util_init_locale("default")

# Create a language detector. It is only instantiated for demonstration
# purposes; nothing below calls it, and the language is assumed to be LANG.
la = pyfreeling.lang_ident(DATA+"common/lang_ident/ident-few.dat")

# Create the options set for the maco analyzer. Default values are OK, except
# for the data files.
LANG = "es"
op = pyfreeling.maco_options(LANG)
# NOTE(review): the positional order below follows the FreeLing sample script;
# the two empty strings leave those data-file slots unset. Confirm the slot
# meanings against the installed FreeLing version.
op.set_data_files("",
                  DATA + "common/punct.dat",
                  DATA + LANG + "/dicc.src",
                  DATA + LANG + "/afixos.dat",
                  "",
                  DATA + LANG + "/locucions.dat",
                  DATA + LANG + "/np.dat",
                  DATA + LANG + "/quantities.dat",
                  DATA + LANG + "/probabilitats.dat")

# Create the analyzers.
tk = pyfreeling.tokenizer(DATA+LANG+"/tokenizer.dat")
sp = pyfreeling.splitter(DATA+LANG+"/splitter.dat")
mf = pyfreeling.maco(op)

# Activate the morphological submodules to be used in the next call.
mf.set_active_options(False, True, True, True,   # select which among created
                      True, True, False, True,   # submodules are to be used.
                      True, True, True, True)    # default: all created submodules are used

# Create the tagger. Sense annotation, chunk parsing and dependency parsing
# are not used by this script.
tg = pyfreeling.hmm_tagger(DATA+LANG+"/tagger.dat", True, 2)

sid = sp.open_session()
try:
    # Input text to process (hard-coded sample sentence).
    lin = "El perro del pueblo duerme."

    l = tk.tokenize(lin)
    ls = sp.split(sid, l, False)

    ls = mf.analyze(ls)
    ls = tg.analyze(ls)

    ## output results
    for s in ls:
        print(s)
        for w in s.get_words():
            print("FORM: {} LEMMA: {} START: {} END: {}".format(w.get_form(), w.get_lemma(),
                                                                  w.get_span_start(), w.get_span_finish()))
            for a_i in w.get_analysis():
                print("\ttag: {}, prob: {}".format(a_i.get_tag(), a_i.get_prob()))
finally:
    # clean up: release the splitter session even if the analysis failed
    sp.close_session(sid)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#! /usr/bin/python3 | ||
|
||
from freeling.freeling_API import pyfreeling | ||
import sys, os, string | ||
|
||
class Freeling_tok_tagger:
    """Tokenize, morphologically analyze and PoS-tag Spanish text with FreeLing.

    The constructor loads every FreeLing resource once; ``tokenize_and_tag``
    can then be called repeatedly on lists of sentence strings.
    """

    def __init__(self):
        """Locate the FreeLing data directory and build the analysis pipeline.

        Uses the ``FREELINGDIR`` environment variable when it is set and falls
        back to ``/usr`` otherwise. If ``$FREELINGDIR/share/freeling`` does not
        exist, a message is printed to stderr and the process exits with
        status 1 (``sys.exit``).
        """
        # Do not clobber a value exported by the user: the error message below
        # tells them to set FREELINGDIR, so that setting has to be honoured.
        os.environ.setdefault("FREELINGDIR", "/usr")
        if not os.path.exists(os.environ["FREELINGDIR"]+"/share/freeling") :
            print("Folder",os.environ["FREELINGDIR"]+"/share/freeling",
                  "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
                  file=sys.stderr)
            sys.exit(1)

        # Location of FreeLing configuration files.
        self.DATA = os.environ["FREELINGDIR"]+"/share/freeling/"
        # Init locales
        pyfreeling.util_init_locale("default")
        # Language detector. Nothing in this class calls it (the language is
        # fixed to self.LANG); it is kept because it is a public attribute.
        self.la = pyfreeling.lang_ident(self.DATA+"common/lang_ident/ident-few.dat")
        # Options set for the maco analyzer. Default values are OK, except
        # for the data files.
        self.LANG = "es"
        self.op = pyfreeling.maco_options(self.LANG)
        # NOTE(review): positional order follows the FreeLing sample script;
        # the two empty strings leave those data-file slots unset. Confirm
        # the slot meanings against the installed FreeLing version.
        self.op.set_data_files("",
                               self.DATA + "common/punct.dat",
                               self.DATA + self.LANG + "/dicc.src",
                               self.DATA + self.LANG + "/afixos.dat",
                               "",
                               self.DATA + self.LANG + "/locucions.dat",
                               self.DATA + self.LANG + "/np.dat",
                               self.DATA + self.LANG + "/quantities.dat",
                               self.DATA + self.LANG + "/probabilitats.dat")

        # Create the analyzers.
        self.tk = pyfreeling.tokenizer(self.DATA+self.LANG+"/tokenizer.dat")
        self.sp = pyfreeling.splitter(self.DATA+self.LANG+"/splitter.dat")
        self.mf = pyfreeling.maco(self.op)

        # Activate the morphological submodules to be used in the next call.
        self.mf.set_active_options(False, True, True, True,   # select which among created
                                   True, True, False, True,   # submodules are to be used.
                                   True, True, True, True)    # default: all created submodules are used

        self.tg = pyfreeling.hmm_tagger(self.DATA+self.LANG+"/tagger.dat", True, 2)

    def tokenize_and_tag(self, sentence_list):
        """Run the FreeLing pipeline over each string in ``sentence_list``.

        A ``'.'`` is appended to any non-empty string whose last character is
        not in ``string.punctuation``. An empty string is not analyzed; it
        yields an entry with no tokens so that ``output[i]`` always
        corresponds to ``sentence_list[i]``.

        Returns a list with one dict per input string::

            {'sentence': <text actually analyzed>,
             'tokens': [{'lemma': ..., 'form': ..., 'start': ..., 'end': ...,
                         'tags': [{'tag': ..., 'prob': ...}, ...]}, ...]}

        ``start``/``end`` are the values of the word's ``get_span_start()`` /
        ``get_span_finish()``.

        Raises ``AssertionError`` when the splitter does not return exactly
        one sentence for an input string. The splitter session is closed even
        when an error occurs.
        """
        output = []
        sid = self.sp.open_session()
        try:
            for lin in sentence_list:
                if not lin:
                    # Nothing to analyze (and lin[-1] would fail); keep a
                    # placeholder to preserve input/output alignment.
                    output.append({'sentence': lin, 'tokens': []})
                    continue
                if lin[-1] not in string.punctuation:
                    # assume a dot at the end
                    lin = lin + '.'

                s = self.tk.tokenize(lin)
                s = self.sp.split(sid, s, False)
                s = self.mf.analyze(s)
                s = self.tg.analyze(s)
                assert len(s) == 1, \
                    "expected exactly one sentence from the splitter, got {}".format(len(s))

                tokens = []
                for w in s[0].get_words():
                    tokens.append({'lemma': w.get_lemma(), 'form': w.get_form(),
                                   'start': w.get_span_start(), 'end': w.get_span_finish(),
                                   'tags': [{'tag': a.get_tag(), 'prob': a.get_prob()}
                                            for a in w.get_analysis()]})
                output.append({'sentence': lin, 'tokens': tokens})
        finally:
            # clean up: always release the splitter session
            self.sp.close_session(sid)
        return output
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.