Merge pull request #52 from delph-in/olzama-dev

1. Freeling is now used to provide tag sequences in some cases (no retokenization). Some of these tag sequences are used directly by the grammar (as in the case of clitics). Other sequences are fused by the srg-freeling interface into a single tag which can then be used by the grammar. 2. Added lexical entries for some missing punctuation marks. It is not clear that this is the right approach, though, because previously, I think, the grammar somehow handled them without those lexical entries. Perhaps Freeling can analyze them into something that the grammar expects, but I cannot yet figure out what that is. At this point, the lexical entries were added somewhat ad hoc, so there is no expectation that they will lead to a parse, only to the absence of lexical analysis errors.
delph-in · May 30, 2023 · 18d0d96 · 18d0d96
2 parents fcd98f3 + 3ccb939
commit 18d0d96
Show file tree

Hide file tree

Showing 10 changed files with 460 additions and 165 deletions.
diff --git a/inflr.tdl b/inflr.tdl
@@ -834,6 +834,10 @@ ncfp000 :=
 %suffix (ncfp000 ncfp000)
 n_fem-pl_ilr.
 
+ncfp00v := 
+%suffix (ncfp00v ncfp00v)
+n_fem-pl_ilr.
+
 ncms00a := 
 %suffix (ncms00a ncms00a)
 n_masc-sg_ilr. 
@@ -855,6 +859,10 @@ ncfs00a :=
 %suffix (ncfs00a ncfs00a)
 n_fem-sg_ilr.
 
+ncfs00v := 
+%suffix (ncfs00v ncfs00v)
+n_fem-sg_ilr.
+
 ncfp00a := 
 %suffix (ncfp00a ncfp00a)
 n_fem-pl_ilr.
@@ -1110,13 +1118,13 @@ a_ilr.
 
 ; -- comparatives
 ; e.g. mejor
-aqccs0 := 
-%suffix (aqccs0 aqccs0)
+aqccs00 := 
+%suffix (aqccs00 aqccs00)
 a_sg_ilr.
 
 ; e.g. mejores
-aqccp0 := 
-%suffix (aqccp0 aqccp0) 
+aqccp00 := 
+%suffix (aqccp00 aqccp00) 
 a_pl_ilr.
 
 ; -- superlatives
@@ -1136,33 +1144,33 @@ aqsfs00 :=
 a_fem-sg_ilr.
 
 ; e.g. dificilísimas
-aqsfp0 := 
-%suffix (aqsfp0 aqsfp0)
+aqsfp00 := 
+%suffix (aqsfp00 aqsfp00)
 a_fem-pl_ilr.
 
 ; -- diminutives
 ; e.g. pequeñísimo
-aqdms0 := 
-%suffix (aqdms0 aqdms0)
+aqdms00 := 
+%suffix (aqdms00 aqdms00)
 a_masc-sg_ilr.
 
 ; e.g. pequeñísimos
-aqdmp0 := 
-%suffix (aqdmp0 aqdmp0)
+aqdmp00 := 
+%suffix (aqdmp00 aqdmp00)
 a_masc-pl_ilr.
 
 ; e.g. pequeñísima
-aqdfs0 := 
-%suffix (aqdfs0 aqdfs0)
+aqdfs00 := 
+%suffix (aqdfs00 aqdfs00)
 a_fem-sg_ilr.
 
 ; e.g. pequeñísimas
-aqdfp0 := 
-%suffix (aqdfp0 aqdfp0)
+aqdfp00 := 
+%suffix (aqdfp00 aqdfp00)
 a_fem-pl_ilr.
 
-aq0000 := 
-%suffix (aq0000 aq0000)
+aq00000 := 
+%suffix (aq00000 aq00000)
 no_ilr.
 
 
@@ -2217,11 +2225,11 @@ vpart_ilr.
 
 ; -- enclitics
 
-+PP3MSA00 := 
++PP3MSA0 := 
 %suffix (vmlo vmlo)
 pp3msa_ilr.
 
-+PP3CNA00 := 
++PP3CNA0 := 
 %suffix (vmlo2 vmlo2) 
 pp3cna_ilr.
 

diff --git a/letypes.tdl b/letypes.tdl
@@ -8556,7 +8556,6 @@ pt_-_lhyphn_lex := basic-punct-lex &
   [ SYNSEM punct_synsem & 
            [ LOCAL.CAT.HEAD.PUNCT-MK lhyphen_punct ] ].
 
-
 pt_-_quest-op_lex := basic-punct-lex & 
   [ SYNSEM punct_synsem & 
            [ LOCAL.CAT.HEAD.PUNCT-MK ques_op_punct ] ].
@@ -8590,7 +8589,6 @@ pt_-_fr-op_lex := basic-punct-lex &
            [ LOCAL.CAT.HEAD.PUNCT-MK fra_punct ] ].
 
 
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; --- Lexical types

diff --git a/lexicon.tdl b/lexicon.tdl
@@ -125286,10 +125286,29 @@ hutu_n := n_-_c_native_le &
 
 hyphen_pt := pt_-_hyphn_native_le &
   [ STEM < "-" > ].
+
+;[olzama-dev 56a4658] Adding a lexical entry for the slash, but for now adding it as a hyphen,
+; assuming it sometimes means things like OR. Will probably need to be revisited.
+slash_pt := pt_-_hyphn_native_le &
+  [ STEM < "/" > ].
+
+; OZ 2023-05-29: Adding a double hyphen as a simple hyphen for now;
+; may need to be revisited.
+doublehyphen_pt := pt_-_hyphn_native_le &
+  [ STEM < "--" > ].  
+
+triplehyphen_pt := pt_-_hyphn_native_le &
+  [ STEM < "---" > ].  
 
 lhyphen_pt := pt_-_lhyphn_native_le &
   [ STEM < "-" > ].
 
+; OZ 2023-05-29: Adding a double hyphen as a simple left hyphen for now;
+; may need to be revisited.
+; In reality, this occurs as the opening punctuation in dialog:
+; "-- No."
+doubnlelhyphen_pt := pt_-_lhyphn_native_le &
+  [ STEM < "--" > ].
 
 hypocalcificar_v-np_rfx := v_np_rfx_native_le &
   [ STEM < "hypocalcificar" >,
@@ -186066,6 +186085,15 @@ por_ciento_av-i-vm := av_-_i-vm_native_le &
   [ STEM < "por_ciento" >,
     SYNSEM.LKEYS.KEYREL.PRED "_por_ciento_x_rel" ].
 
+; OZ 2023-05-29
+; Adding the percentage sign (%), which was previously handled by Freeling somehow
+; (possibly not compositionally?).
+; This may need to be revisited.
+por_ciento_sign_av-i-vm := av_-_i-vm_native_le &
+  [ STEM < "%" >,
+    SYNSEM.LKEYS.KEYREL.PRED "_por_ciento_x_rel" ].
+
+
 porcino_aj-i-nprd := aj_-_i-nprd_native_le &
   [ STEM < "porcino" >,
     SYNSEM.LKEYS.KEYREL.PRED "_porcino_a_rel" ].
@@ -196627,7 +196655,7 @@ quotes_pt := pt_-_quots_le &
    [ STEM < "\"" > ].
 
 quotes-sngl_pt := pt_-_quots_le & 
-   [ STEM < "\'" > ].
+   [ STEM < "'" > ].
 
 rabadilla_n := n_-_c_native_le &
   [ STEM < "rabadilla" >,

diff --git a/tibidabo.mem b/tibidabo.mem
diff --git a/util/freeling_api/srg-freeling.dat b/util/freeling_api/srg-freeling.dat
@@ -0,0 +1,174 @@
+## List of forms (or tags, if uppercased) for which PoS tagger output will 
+## be ignored (no analysis discarded) when found at the specified @position
+<NoDisambiguate>
+NP00000 @begin
+que @any
+hasta @any
+tanto @any
+como @any
+fui @any
+fuiste @any
+fue @any
+fuimos @any
+fuisteis @any
+fueron @any
+</NoDisambiguate>
+
+## List of words for which the list of output analyses given
+## by FreeLing must be ignored and replaced by the specified list.
+## One entry per line, format:
+##      form lemma1 tag1 lemma2 tag2 ...
+<ReplaceAll>
+quería querer VMII4S0
+un un Z
+uno uno Z
+una una Z
+acá acá NC00000
+acullá acullá NC00000
+ahí ahí NC00000
+ahora ahora NC00000
+allá allá NC00000
+allende allende NC00000
+allí allí NC00000
+anoche anoche NC00000
+antaño antaño NC00000
+anteanoche anteanoche NC00000
+anteanteayer anteanteayer NC00000
+anteayer anteayer NC00000
+antes_de_anoche antes_de_anoche NC00000
+antes_de_ayer antes_de_ayer NC00000
+aquende aquende NC00000
+aquí aquí NC00000
+así así NC00000 así SP
+ayer ayer NC00000
+ayer_noche ayer_noche NC00000
+entonces entonces NC00000
+hogaño hogaño NC00000
+hoy hoy NC00000
+ibídem ibídem NC00000
+mañana mañana NC00000
+pasado_mañana pasado_mañana NC00000
+ni ni CC ni RG
+demás demás PI0CC000
+vez vez NC00000
+veces vez NC00000
+antes antes SP antes RG
+después después SP después RG
+más más AQ0CS00 más SP más RG
+menos menos AQ0CS00 menos SP menos RG
+múltiples múltiple DI0CP0
+cierta cierto AQ0FS00 cierto DI0FS0
+ciertas cierto AQ0FP00 cierto DI0FP0
+cierto cierto AQ0MS00 cierto DI0MS0
+ciertos cierto AQ0MP00 cierto DI0MP0
+determinada determinar VMP00SF determinado DI0FS0
+determinadas determinar VMP00PF determinado DI0FP0
+determinado determinar VMP00SM determinado DI0MS0
+determinados determinar VMP00PM determinado DI0MP0
+diferente diferente AQ0CS00 diferente DI0CS0
+diferentes diferente AQ0CP00 diferente DI0CP0
+distinta diferente AQ0FS00 diferente DI0FS0
+distintas distinto AQ0FP00 diferente DI0FP0
+distinta distinto AQ0FS00 distinto DI0FS0
+distintas distinto AQ0FP00 distinto DI0FP0
+distinto distinto AQ0MS00 distinto DI0MS0
+distintos distinto AQ0MP00 distinto DI0MP0
+diversa diverso AQ0FS00 diverso DI0FS0
+diversas diverso AQ0FP00 diverso DI0FP0
+diverso diverso AQ0MS00 diverso DI0MS0
+diversos diverso AQ0MP00 diverso DI0MP0
+escasa escaso AQ0FS00 escaso DI0FS0
+escasas escaso AQ0FP00 escaso DI0FP0
+escaso escaso AQ0MS00 escaso DI0MS0
+escasos escaso AQ0MP00 escaso DI0MP0
+numerosa numeroso AQ0FS00 numeroso DI0FS0
+numerosas numeroso AQ0FP00 numeroso DI0FP0
+numeroso numeroso AQ0MS00 numeroso DI0MS0
+numerosos numeroso AQ0MP00 numeroso DI0MP0
+rara raro AQ0FS00 raro DI0FS0
+raras raro AQ0FP00 raro DI0FP0
+raro raro AQ0MS00 raro DI0MS0
+raros raro AQ0MP00 raro DI0MP0
+cientos ciento Zd
+millares millar Zd
+miles mil Zd
+mejor mejor AQ0CS00
+off-line off-line AQ0CN00
+on-line on-line AQ0CN00
+peor peor AQ0CS00
+
+</ReplaceAll>
+
+## List of tag fusions to perform. 
+## When a word has all tags at the left hand side (with the same lemma),
+## they are replaced by the tag at the right hand side (keeping the same lemma).
+## Format:
+##    tag1 tag2 ... tagn => tag
+<Fusion>
+VMII1S0 VMII3S0 => VMII4S0
+VMII3S0 VMII1S0 => VMII4S0
+VMIC1S0 VMIC3S0 => VMIC4S0
+VMIC3S0 VMIC1S0 => VMIC4S0
+VMSP1S0 VMSP3S0 => VMSP4S0
+VMSP3S0 VMSP1S0 => VMSP4S0
+VMSI1S0 VMSI3S0 => VMSI4S0
+VMSI3S0 VMSI1S0 => VMSI4S0
+VMSF1S0 VMSF3S0 => VMSF4S0
+VMSF3S0 VMSF1S0 => VMSF4S0
+VAII1S0 VAII3S0 => VAII4S0
+VAII3S0 VAII1S0 => VAII4S0
+VAIC1S0 VAIC3S0 => VAIC4S0
+VAIC3S0 VAIC1S0 => VAIC4S0
+VASP1S0 VASP3S0 => VASP4S0
+VASP3S0 VASP1S0 => VASP4S0
+VASI1S0 VASI3S0 => VASI4S0
+VASI3S0 VASI1S0 => VASI4S0
+VASF1S0 VASF3S0 => VASF4S0
+VASF3S0 VASF1S0 => VASF4S0
+VSII1S0 VSII3S0 => VSII4S0
+VSII3S0 VSII1S0 => VSII4S0
+VSIC1S0 VSIC3S0 => VSIC4S0
+VSIC3S0 VSIC1S0 => VSIC4S0
+VSSP1S0 VSSP3S0 => VSSP4S0
+VSSP3S0 VSSP1S0 => VSSP4S0
+VSSI1S0 VSSI3S0 => VSSI4S0
+VSSI3S0 VSSI1S0 => VSSI4S0
+VSSF1S0 VSSF3S0 => VSSF4S0
+VSSF3S0 VSSF1S0 => VSSF4S0
+VMIP1P0 VMIS1P0 => VMIB1P0
+VMIS1P0 VMIP1P0 => VMIB1P0
+PP3CNA0 PP3MSA0 => PP3MSA0
+PP3MSA0 PP3CNA0 => PP3MSA0
+NCMS000 NCFS000 => NCCS000
+NCFS000 NCMS000 => NCCS000
+NCFS000 NCFS000 => NCCS000
+NCMP000 NCFP000 => NCCP000
+NCFP000 NCMP000 => NCCP000
+P00CN00 P03CN00 => P03CN00
+P03CN00 P00CN00 => P03CN00
+</Fusion>
+
+## Rearrangements to SPPP output fields
+## Rule form is:
+##     form lemma tag  =>  stem rule_id form
+##
+##  On the left hand side:
+##    "form", "lemma", and "tag" are regular expressions.
+##    "*" may be used to mean "anything".
+##    For "form" and "lemma" complete match will be checked.
+##    For "tag" prefix match will be used.
+##    Symbol "!" preceding the regexp negates it.
+##
+##  On the right hand side:
+##    "stem" may be "F" (form), "L" (lemma), "T" (tag), or any lowercase literal.
+##    "rule_id" may be "F" (form), "L" (lemma), or "T" (tag).
+##    "form" may be any combination of "F", "L", and "T". form/lemma/tag will be 
+##           concatenated in the given order, separated by "#".
+##
+##  Rules are applied in order, until a match is found, thus, a last default
+##  rule "* * *" is needed.
+<Output>
+*             *  !(Z|W|NP|AO)  =>  L  T  F   ## stem=lemma for everything except numbers, dates, NPs and AOs.
+(un|una|uno)  *  Z             =>  F  T  FL  ## lemma="un/o/a" for "un/o/a" with tag Z (they had lemma="1")
+*             *  *             =>  T  T  FL  ## stem=tag for the rest (numbers!="un/o/a", dates, NPs, AOs)
+</Output>
diff --git a/util/override_freeling.py b/util/override_freeling.py
@@ -1,14 +1,16 @@
 
 # Freeling tags to override and replace by other tags
 TAGS = {'I': 'AQ0MS00', 'DP1MPP': 'AP0MP1P', 'AQVMP00':'AQ0MP00',
-        'DP1MSP': 'AP0MS1P', 'DP1FPP': 'AP0FP1P', 'DP1FSP': 'AP0FS1P', 'AQ00000':'AQ0000'}
+        'DP1MSP': 'AP0MS1P', 'DP1FPP': 'AP0FP1P', 'DP1FSP': 'AP0FS1P', 'AQVFS00': 'AQ0FS00'}
 
 REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}, 'dió': {'lemma': 'dar', 'tag': 'VMIS3S0'},
                          'dios': {'lemma': 'dios', 'tag': 'NCMS000'},
                          'adiós': {'lemma': 'adiós', 'tag': 'NCMS000'},
-                         'señor': {'lemma': 'señor', 'tag': 'NCMS000'}}
+                         'señor': {'lemma': 'señor', 'tag': 'NCMS000'},
+                         }
 
 
 DO_NOT_OVERRIDE = {'uf', 'je', 'ja', 'oh', 'todo_lo_contrario', 'ojalá'}
 
-STEM_EQUALS_TAG = {'Z', 'W'}
+STEM_EQUALS_TAG = {'Z', 'W'}
+