Skip to content

Commit

Permalink
Merge pull request #33 from delph-in/olzama-dev
Browse files Browse the repository at this point in the history
Changes for release 0.2.1
  • Loading branch information
olzama authored Apr 5, 2023
2 parents 3547db0 + 6daebc5 commit f8258f3
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 31 deletions.
17 changes: 10 additions & 7 deletions freeling/freeling_API/tokenize_and_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,17 @@ def __init__(self):
self.mf=pyfreeling.maco(self.op)

        # activate morpho modules to be used in the next call
self.mf.set_active_options(False, True, True, True, # select which among created
True, True, False, True, # submodules are to be used.
True, True, True, True ) # default: all created submodules are used
self.mf.set_active_options(umap=False, num=True, pun=True, dat=False, # select which among created
dic=True, aff=True, comp=False, rtk=True, # submodules are to be used.
mw=False, ner=True, qt=False, prb=True ) # default: all created submodules are used

self.tg=pyfreeling.hmm_tagger(self.DATA+self.LANG+"/tagger.dat",True,2)
self.tg=pyfreeling.hmm_tagger(self.DATA+self.LANG+"/tagger.dat",True,1)
#self.tg = pyfreeling.relax_tagger(self.DATA+self.LANG+"/constr_gram-B.dat",500,670.0,0.001,True,1)

def tokenize_and_tag(self, sentence_list):
output = []
sid=self.sp.open_session()
# process input text
#lin = "El perro del pueblo duerme."
for i,lin in enumerate(sentence_list):
if not lin[-1] in string.punctuation:
# assume a dot at the end
Expand All @@ -63,11 +63,14 @@ def tokenize_and_tag(self, sentence_list):
ws = s.get_words()
for j,w in enumerate(ws) :
output[i]['tokens'].append({'lemma':w.get_lemma(), 'form': w.get_form(),
'start':w.get_span_start(), 'end': w.get_span_finish(), 'tags': []})
'start':w.get_span_start(), 'end': w.get_span_finish(),
'selected-tag': w.get_tag(), 'all-tags': []})
analyses = list(w.get_analysis())
for a in analyses:
#print("\ttag: {}, prob: {}".format(a_i.get_tag(), a_i.get_prob()))
output[i]['tokens'][j]['tags'].append({'tag': a.get_tag(), 'prob': a.get_prob()})
output[i]['tokens'][j]['all-tags'].append({'tag': a.get_tag(), 'prob': a.get_prob()})
if a.get_tag() == output[i]['tokens'][j]['selected-tag']:
output[i]['tokens'][j]['selected-prob'] = a.get_prob()
# clean up
self.sp.close_session(sid)
return output
Expand Down
8 changes: 8 additions & 0 deletions generics.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ np00g00_ge := n_-_pn_le &
Generic lexical entry that will be triggered by tag np00g00.
""".

np00000_ge := n_-_pn_le &
[ STEM < "n_-_pn_le" >,
TOKENS.+LIST generic_token_list & < [ +POS.+TAGS < "np00000" >, +FORM #form ] >,
SYNSEM.LKEYS.KEYREL [ PRED named_rel, CARG #form ] ]
"""
Generic lexical entry that will be triggered by tag np00000.
""".


; ncms000, ncmp000, ncfs000, ncfp000, ncms00a, ncmp00a, ncfs00a, ncfp00a, ncms00d, ncmp00d, ncfs00d, ncfp00d, ncms00x, ncmp00x, ncfs00x, ncfp00x, nccs000, nccp000, ncmn000, ncfn000, nccn000

Expand Down
6 changes: 3 additions & 3 deletions inflr.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ np00g00 :=
np00sp0_ilr.

; OZ: Not sure what this rule is for. Right now it occurs where NCFS000 should.
;np00o00 :=
;%suffix (np00o00 np00o00)
;np00sp0_ilr.
np00000 :=
%suffix (np00000 np00000)
np00sp0_ilr.

np00v00 :=
%suffix (np00v00 np00v00)
Expand Down
6 changes: 1 addition & 5 deletions util/override_freeling.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@

# Freeling tags to override and replace by other tags
TAGS = {'I': 'AQ0MS00', 'NP00O00':'NCFS000'}

# Sometimes Freeling gives a wrong tag in very simple cases, such as for the 3SG verb 'ladrar' it returns a noun tag
# with the probability 78%.
#LEMMA_TAG_PAIRS = {'NCFS000' : {'ladra': {'prob': 0.80, 'replace': 'VMIP3S0'}}}
TAGS = {'I': 'AQ0MS00'}

REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}, 'dió': {'lemma': 'dar', 'tag': 'VMIS3S0'}}

Expand Down
6 changes: 0 additions & 6 deletions util/populate_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@
# I cannot figure out how to use the pyfreeling library:
from freeling.freeling_API.tokenize_and_tag import Freeling_tok_tagger

# REMOVE = {'The tobacco garden dog barked.', 'Abrams wiped the table clean.',
# 'Abrams left it to Browne to bark.', 'How happy was Abrams?'}


def read_testsuite(ts):
items = ts['item']
# Strip the trailing hyphens to match old LKB output, although may want to put them back in later.
Expand Down Expand Up @@ -79,8 +75,6 @@ def update_testsuite(ts):
print('{} items in the corpus'.format(len(yy)))
for i, row in enumerate(ts['item']):
ts['item'].update(i, {'i-tokens': yy[i]})
if ts['item'][i]['i-id'] == 10:
ts['item'].update(i, {'i-id':101})
ts.commit()

def freeling2json(s):
Expand Down
23 changes: 13 additions & 10 deletions util/srg_freeling2yy.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,17 @@
For compatibility, we will do the same for now.
i -> AQ0MS0 (interjection to a default adjective form; will then undergo an adjective-to-interjection rule...)
'''
def override_tag(selected, all, word):
    """Resolve the final tag/probability dict for a token.

    selected: the tag the tagger chose for the token.
    all: list of {'tag': ..., 'prob': ...} dicts, one per analysis of the
         token.  (Parameter name shadows the builtin ``all``; kept as-is
         because callers pass it positionally and renaming would break any
         keyword call sites.)
    word: the token's lemma, used to key the override tables.

    Returns a {'tag': ..., 'prob': ...} dict.  Overridden tags carry
    prob -1, since no real probability is available for them.
    Raises LookupError if the selected tag is not among the analyses.
    """
    if selected in TAGS and word not in DO_NOT_OVERRIDE:
        # Hard override of a known-bad Freeling tag (e.g. I -> AQ0MS00).
        return {'tag': TAGS[selected], 'prob': -1}
    if selected not in TAGS and word in REPLACE_LEMMA_AND_TAG:
        # Word-specific replacement (e.g. 'ladra' -> VMIP3S0).
        return {'tag': REPLACE_LEMMA_AND_TAG[word]['tag'], 'prob': -1}
    # Default: return the analysis matching the selected tag.  This branch
    # now also covers the DO_NOT_OVERRIDE case, which previously fell
    # through the nested `if` and implicitly returned None, crashing the
    # caller when it indexed the result.
    for analysis in all:
        if analysis['tag'] == selected:
            return analysis
    raise LookupError("selected tag {!r} not in tag list".format(selected))

def override_lemma(lemma, tag):
if tag in STEM_EQUALS_TAG:
Expand Down Expand Up @@ -94,12 +98,11 @@ def convert_sentences(sentences):
_num = 0 # lattice ID
_from = 0 # lattice from
for j,tok in enumerate(sent['tokens']):
best_tag = max(tok['tags'], key=lambda x: x['prob']) # get the highest prob
tag = best_tag['tag']
conf = best_tag['prob']
surface = tok['form']
lemma = override_lemma(tok['lemma'], tag)
pos = override_tag(tag, tok['lemma'])
best = override_tag(tok['selected-tag'],tok['all-tags'], tok['lemma'])
pos = best['tag']
conf = best['prob']
lemma = override_lemma(tok['lemma'], pos)
_num += 1
output += '('
output += str(_num)
Expand Down
27 changes: 27 additions & 0 deletions util/tsdb-updates/create_Spa_MRS_testsuite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
'''
Assuming a [incr tsdb()] profile created with pydelphin from a text file
and a textfile mapping the same sentences from the text file to ID numbers,
update the test suite to use those id numbers instead of the automatically created ones.
'''
import sys
from delphin import itsdb

def get_id_mappings(file_ids):
    """Read a tab-separated ``id<TAB>sentence`` file and return a dict
    mapping each sentence to its integer id.

    file_ids: path to a text file with one ``id<TAB>sentence`` pair per line.

    Blank lines are skipped.  The line is split on the *first* tab only, so
    a sentence that itself contains a tab character is preserved intact.
    Raises ValueError if an id field is not an integer.
    """
    sentence2id = {}
    with open(file_ids, 'r') as f:
        for line in f.read().splitlines():
            if not line:
                continue  # skip blank lines
            # maxsplit=1: only the first tab separates id from sentence
            sent_id, sentence = line.split('\t', 1)
            sentence2id[sentence] = int(sent_id)
    return sentence2id
def update_ids(ts_path, id_mappings):
    """Rewrite each item's i-id in a [incr tsdb()] profile.

    ts_path: path to the [incr tsdb()] profile directory.
    id_mappings: dict mapping an item's i-input sentence to the desired
        integer i-id (as produced by get_id_mappings).

    Raises KeyError if a sentence in the profile is absent from the mapping.
    """
    ts = itsdb.TestSuite(ts_path)
    for i, row in enumerate(ts['item']):
        # `row` is already the item record; no need to re-index ts['item'][i].
        ts['item'].update(i, {'i-id': id_mappings[row['i-input']]})
    # ts is a database which needs to be committed to disk,
    # otherwise the updates will not persist.
    ts.commit()


if __name__ == "__main__":
    # argv[1]: path to the [incr tsdb()] profile whose i-ids will be rewritten
    # argv[2]: path to the tab-separated id<TAB>sentence mapping file
    id_mappings = get_id_mappings(sys.argv[2])
    update_ids(sys.argv[1], id_mappings)

0 comments on commit f8258f3

Please sign in to comment.