Skip to content

Commit

Permalink
Merge pull request #52 from delph-in/olzama-dev
Browse files Browse the repository at this point in the history
1. Freeling is now used so as to provide tag sequences in some cases (no retokenization). Some of these tag sequences are used directly by the grammar (such as in the case with clitics). Other sequences are fused by the srg-freeling interface, into a tag which can then be used by the grammar.
2. Added lexical entries for some missing punctuation marks. It is not clear that this is what needs to be done though, because before, I think the grammar somehow handled them without those lexical entries. Perhaps Freeling can analyze them into something that the grammar expects, but I cannot yet figure out what that is. At this point, the lexical entries were added somewhat ad hoc so there is no expectation that they will lead to a parse, only to the absence of lexical analysis error.
  • Loading branch information
olzama authored May 30, 2023
2 parents fcd98f3 + 3ccb939 commit 18d0d96
Show file tree
Hide file tree
Showing 10 changed files with 460 additions and 165 deletions.
44 changes: 26 additions & 18 deletions inflr.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,10 @@ ncfp000 :=
%suffix (ncfp000 ncfp000)
n_fem-pl_ilr.

ncfp00v :=
%suffix (ncfp00v ncfp00v)
n_fem-pl_ilr.

ncms00a :=
%suffix (ncms00a ncms00a)
n_masc-sg_ilr.
Expand All @@ -855,6 +859,10 @@ ncfs00a :=
%suffix (ncfs00a ncfs00a)
n_fem-sg_ilr.

ncfs00v :=
%suffix (ncfs00v ncfs00v)
n_fem-sg_ilr.

ncfp00a :=
%suffix (ncfp00a ncfp00a)
n_fem-pl_ilr.
Expand Down Expand Up @@ -1110,13 +1118,13 @@ a_ilr.

; -- comparatives
; e.g. mejor
aqccs0 :=
%suffix (aqccs0 aqccs0)
aqccs00 :=
%suffix (aqccs00 aqccs00)
a_sg_ilr.

; e.g. mejores
aqccp0 :=
%suffix (aqccp0 aqccp0)
aqccp00 :=
%suffix (aqccp00 aqccp00)
a_pl_ilr.

; -- superlatives
Expand All @@ -1136,33 +1144,33 @@ aqsfs00 :=
a_fem-sg_ilr.

; e.g. dificilísimas
aqsfp0 :=
%suffix (aqsfp0 aqsfp0)
aqsfp00 :=
%suffix (aqsfp00 aqsfp00)
a_fem-pl_ilr.

; -- diminutives
; e.g. pequeñísimo
aqdms0 :=
%suffix (aqdms0 aqdms0)
aqdms00 :=
%suffix (aqdms00 aqdms00)
a_masc-sg_ilr.

; e.g. pequeñísimos
aqdmp0 :=
%suffix (aqdmp0 aqdmp0)
aqdmp00 :=
%suffix (aqdmp00 aqdmp00)
a_masc-pl_ilr.

; e.g. pequeñísima
aqdfs0 :=
%suffix (aqdfs0 aqdfs0)
aqdfs00 :=
%suffix (aqdfs00 aqdfs00)
a_fem-sg_ilr.

; e.g. pequeñísimas
aqdfp0 :=
%suffix (aqdfp0 aqdfp0)
aqdfp00 :=
%suffix (aqdfp00 aqdfp00)
a_fem-pl_ilr.

aq0000 :=
%suffix (aq0000 aq0000)
aq00000 :=
%suffix (aq00000 aq00000)
no_ilr.


Expand Down Expand Up @@ -2217,11 +2225,11 @@ vpart_ilr.

; -- enclitics

+PP3MSA00 :=
+PP3MSA0 :=
%suffix (vmlo vmlo)
pp3msa_ilr.

+PP3CNA00 :=
+PP3CNA0 :=
%suffix (vmlo2 vmlo2)
pp3cna_ilr.

Expand Down
2 changes: 0 additions & 2 deletions letypes.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -8556,7 +8556,6 @@ pt_-_lhyphn_lex := basic-punct-lex &
[ SYNSEM punct_synsem &
[ LOCAL.CAT.HEAD.PUNCT-MK lhyphen_punct ] ].


pt_-_quest-op_lex := basic-punct-lex &
[ SYNSEM punct_synsem &
[ LOCAL.CAT.HEAD.PUNCT-MK ques_op_punct ] ].
Expand Down Expand Up @@ -8590,7 +8589,6 @@ pt_-_fr-op_lex := basic-punct-lex &
[ LOCAL.CAT.HEAD.PUNCT-MK fra_punct ] ].



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; --- Lexical types
Expand Down
30 changes: 29 additions & 1 deletion lexicon.tdl
Original file line number Diff line number Diff line change
Expand Up @@ -125286,10 +125286,29 @@ hutu_n := n_-_c_native_le &

hyphen_pt := pt_-_hyphn_native_le &
[ STEM < "-" > ].

;[olzama-dev 56a4658] Adding a lexical entry for the slash, but for now adding it as a hyphen,
; assuming it sometimes means things like OR. will probably need to be revisited.
slash_pt := pt_-_hyphn_native_le &
[ STEM < "/" > ].

; OZ 2023-05-29: Adding a double hyphen as a simple hyphen for now;
; may need to be revisited.
doublehyphen_pt := pt_-_hyphn_native_le &
[ STEM < "--" > ].

triplehyphen_pt := pt_-_hyphn_native_le &
[ STEM < "---" > ].

lhyphen_pt := pt_-_lhyphn_native_le &
[ STEM < "-" > ].

; OZ 2023-05-29: Adding a double hyphen as a left hyphen for now;
; may need to be revisited.
; In reality, this occurs as the opening punctuation in dialog:
; "-- No."
; NOTE(review): the identifier below looks like a typo for "doublelhyphen_pt";
; confirm nothing references it before renaming.
doubnlelhyphen_pt := pt_-_lhyphn_native_le &
[ STEM < "--" > ].

hypocalcificar_v-np_rfx := v_np_rfx_native_le &
[ STEM < "hypocalcificar" >,
Expand Down Expand Up @@ -186066,6 +186085,15 @@ por_ciento_av-i-vm := av_-_i-vm_native_le &
[ STEM < "por_ciento" >,
SYNSEM.LKEYS.KEYREL.PRED "_por_ciento_x_rel" ].

; OZ 2023-05-29
; Adding the percentage %, which was previously handled by Freeling somehow
; (possibly, not compositionally?)
; This may need to be revisited.
por_ciento_sign_av-i-vm := av_-_i-vm_native_le &
[ STEM < "%" >,
SYNSEM.LKEYS.KEYREL.PRED "_por_ciento_x_rel" ].


porcino_aj-i-nprd := aj_-_i-nprd_native_le &
[ STEM < "porcino" >,
SYNSEM.LKEYS.KEYREL.PRED "_porcino_a_rel" ].
Expand Down Expand Up @@ -196627,7 +196655,7 @@ quotes_pt := pt_-_quots_le &
[ STEM < "\"" > ].

quotes-sngl_pt := pt_-_quots_le &
[ STEM < "\'" > ].
[ STEM < "'" > ].

rabadilla_n := n_-_c_native_le &
[ STEM < "rabadilla" >,
Expand Down
196 changes: 98 additions & 98 deletions tibidabo.mem

Large diffs are not rendered by default.

174 changes: 174 additions & 0 deletions util/freeling_api/srg-freeling.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
## List of forms (or tags, if uppercased) for which PoS tagger output will
## be ignored (no analysis discarded) when found at the specified @position
## One entry per line, format:
## form-or-TAG @position
## Positions used below: @begin, @any.
<NoDisambiguate>
NP00000 @begin
que @any
hasta @any
tanto @any
como @any
fui @any
fuiste @any
fue @any
fuimos @any
fuisteis @any
fueron @any
</NoDisambiguate>

## List of words for which the list of output analysis given
## by FreeLing must be ignored and replaced by the specified list.
## One entry per line, format:
## form lemma1 tag1 lemma2 tag2 ...
## NOTE: list each form only once, paired with its own lemma
## (e.g. distinta -> distinto, not diferente).
<ReplaceAll>
quería querer VMII4S0
un un Z
uno uno Z
una una Z
acá acá NC00000
acullá acullá NC00000
ahí ahí NC00000
ahora ahora NC00000
allá allá NC00000
allende allende NC00000
allí allí NC00000
anoche anoche NC00000
antaño antaño NC00000
anteanoche anteanoche NC00000
anteanteayer anteanteayer NC00000
anteayer anteayer NC00000
antes_de_anoche antes_de_anoche NC00000
antes_de_ayer antes_de_ayer NC00000
aquende aquende NC00000
aquí aquí NC00000
así así NC00000 así SP
ayer ayer NC00000
ayer_noche ayer_noche NC00000
entonces entonces NC00000
hogaño hogaño NC00000
hoy hoy NC00000
ibídem ibídem NC00000
mañana mañana NC00000
pasado_mañana pasado_mañana NC00000
ni ni CC ni RG
demás demás PI0CC000
vez vez NC00000
veces vez NC00000
antes antes SP antes RG
después después SP después RG
más más AQ0CS00 más SP más RG
menos menos AQ0CS00 menos SP menos RG
múltiples múltiple DI0CP0
cierta cierto AQ0FS00 cierto DI0FS0
ciertas cierto AQ0FP00 cierto DI0FP0
cierto cierto AQ0MS00 cierto DI0MS0
ciertos cierto AQ0MP00 cierto DI0MP0
determinada determinar VMP00SF determinado DI0FS0
determinadas determinar VMP00PF determinado DI0FP0
determinado determinar VMP00SM determinado DI0MS0
determinados determinar VMP00PM determinado DI0MP0
diferente diferente AQ0CS00 diferente DI0CS0
diferentes diferente AQ0CP00 diferente DI0CP0
distinta distinto AQ0FS00 distinto DI0FS0
distintas distinto AQ0FP00 distinto DI0FP0
distinto distinto AQ0MS00 distinto DI0MS0
distintos distinto AQ0MP00 distinto DI0MP0
diversa diverso AQ0FS00 diverso DI0FS0
diversas diverso AQ0FP00 diverso DI0FP0
diverso diverso AQ0MS00 diverso DI0MS0
diversos diverso AQ0MP00 diverso DI0MP0
escasa escaso AQ0FS00 escaso DI0FS0
escasas escaso AQ0FP00 escaso DI0FP0
escaso escaso AQ0MS00 escaso DI0MS0
escasos escaso AQ0MP00 escaso DI0MP0
numerosa numeroso AQ0FS00 numeroso DI0FS0
numerosas numeroso AQ0FP00 numeroso DI0FP0
numeroso numeroso AQ0MS00 numeroso DI0MS0
numerosos numeroso AQ0MP00 numeroso DI0MP0
rara raro AQ0FS00 raro DI0FS0
raras raro AQ0FP00 raro DI0FP0
raro raro AQ0MS00 raro DI0MS0
raros raro AQ0MP00 raro DI0MP0
cientos ciento Zd
millares millar Zd
miles mil Zd
mejor mejor AQ0CS00
off-line off-line AQ0CN00
on-line on-line AQ0CN00
peor peor AQ0CS00

</ReplaceAll>

## List of tag fusions to perform.
## When a word has all tags at the left hand side (with the same lemma),
## they are replaced by the tag at the right hand side (keeping the same lemma).
## Format:
## tag1 tag2 ... tagn => tag
## Most rules below are listed in both tag orders.
## NOTE(review): the rule "NCFS000 NCFS000 => NCCS000" repeats the same tag on
## its left hand side, unlike the NCMS000/NCFS000 pair before it, and the plural
## rules have no such line -- possibly a copy-paste slip; confirm it is intended.
<Fusion>
VMII1S0 VMII3S0 => VMII4S0
VMII3S0 VMII1S0 => VMII4S0
VMIC1S0 VMIC3S0 => VMIC4S0
VMIC3S0 VMIC1S0 => VMIC4S0
VMSP1S0 VMSP3S0 => VMSP4S0
VMSP3S0 VMSP1S0 => VMSP4S0
VMSI1S0 VMSI3S0 => VMSI4S0
VMSI3S0 VMSI1S0 => VMSI4S0
VMSF1S0 VMSF3S0 => VMSF4S0
VMSF3S0 VMSF1S0 => VMSF4S0
VAII1S0 VAII3S0 => VAII4S0
VAII3S0 VAII1S0 => VAII4S0
VAIC1S0 VAIC3S0 => VAIC4S0
VAIC3S0 VAIC1S0 => VAIC4S0
VASP1S0 VASP3S0 => VASP4S0
VASP3S0 VASP1S0 => VASP4S0
VASI1S0 VASI3S0 => VASI4S0
VASI3S0 VASI1S0 => VASI4S0
VASF1S0 VASF3S0 => VASF4S0
VASF3S0 VASF1S0 => VASF4S0
VSII1S0 VSII3S0 => VSII4S0
VSII3S0 VSII1S0 => VSII4S0
VSIC1S0 VSIC3S0 => VSIC4S0
VSIC3S0 VSIC1S0 => VSIC4S0
VSSP1S0 VSSP3S0 => VSSP4S0
VSSP3S0 VSSP1S0 => VSSP4S0
VSSI1S0 VSSI3S0 => VSSI4S0
VSSI3S0 VSSI1S0 => VSSI4S0
VSSF1S0 VSSF3S0 => VSSF4S0
VSSF3S0 VSSF1S0 => VSSF4S0
VMIP1P0 VMIS1P0 => VMIB1P0
VMIS1P0 VMIP1P0 => VMIB1P0
PP3CNA0 PP3MSA0 => PP3MSA0
PP3MSA0 PP3CNA0 => PP3MSA0
NCMS000 NCFS000 => NCCS000
NCFS000 NCMS000 => NCCS000
NCFS000 NCFS000 => NCCS000
NCMP000 NCFP000 => NCCP000
NCFP000 NCMP000 => NCCP000
P00CN00 P03CN00 => P03CN00
P03CN00 P00CN00 => P03CN00
</Fusion>

## Rearrangements to SPPP output fields
## Rule form is:
## form lemma tag => stem rule_id form
##
## On the left hand side:
## "form", "lemma", and "tag" are regular expressions.
## "*" may be used to mean "anything".
## For "form" and "lemma" complete match will be checked.
## For "tag" prefix match will be used.
## Symbol "!" preceding the regexp negates it.
##
## On the right hand side:
## "stem" may be "F" (form), "L" (lemma), "T" (tag), or any lowercase literal.
## "rule_id" may be "F" (form), "L" (lemma), or "T" (tag).
## "form" may be any combination of "F", "L", and "T". form/lemma/tag will be
## concatenated in the given order, separated by "#".
##
## Rules are applied in order, until a match is found, thus, a last default
## rule "* * *" is needed.
<Output>
* * !(Z|W|NP|AO) => L T F ## stem=lemma for everything except numbers, dates, NPs and AOs.
(un|una|uno) * Z => F T FL ## lemma="un/o/a" for "un/o/a" with tag Z (they had lemma="1")
* * * => T T FL ## stem=tag for the rest (numbers!="un/o/a", dates, NPs, AOs)
</Output>
8 changes: 5 additions & 3 deletions util/override_freeling.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@

# Freeling tags to override and replace by other tags
TAGS = {'I': 'AQ0MS00', 'DP1MPP': 'AP0MP1P', 'AQVMP00':'AQ0MP00',
'DP1MSP': 'AP0MS1P', 'DP1FPP': 'AP0FP1P', 'DP1FSP': 'AP0FS1P', 'AQ00000':'AQ0000'}
'DP1MSP': 'AP0MS1P', 'DP1FPP': 'AP0FP1P', 'DP1FSP': 'AP0FS1P', 'AQVFS00': 'AQ0FS00'}

REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}, 'dió': {'lemma': 'dar', 'tag': 'VMIS3S0'},
'dios': {'lemma': 'dios', 'tag': 'NCMS000'},
'adiós': {'lemma': 'adiós', 'tag': 'NCMS000'},
'señor': {'lemma': 'señor', 'tag': 'NCMS000'}}
'señor': {'lemma': 'señor', 'tag': 'NCMS000'},
}


DO_NOT_OVERRIDE = {'uf', 'je', 'ja', 'oh', 'todo_lo_contrario', 'ojalá'}

STEM_EQUALS_TAG = {'Z', 'W'}
STEM_EQUALS_TAG = {'Z', 'W'}

Loading

0 comments on commit 18d0d96

Please sign in to comment.