Skip to content

Commit

Permalink
#11, #12: ECMAScript regex compiled from Python (with comments)
Browse files Browse the repository at this point in the history
  • Loading branch information
fititnt committed Dec 2, 2021
1 parent dd6fa63 commit 810044c
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ jobs:
# with:
# cmd: yq < ontologia/0.77.995.yml > ontologia/0.77.995.json

- run: yq < ontologia/0.77.995.yml > ontologia/0.77.995.json
- run: yq --output-format json < ontologia/0.77.995.yml > ontologia/0.77.995.json
continue-on-error: true

- run: cp -r bin/ docs/bin/
Expand Down
10 changes: 7 additions & 3 deletions docs/eng-Latn/dictionary.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ Definitionem::
Referens::
* https://en.wikipedia.org/wiki/Nomen_dubium
Usum::
* Consider use more specific <<#iz_bcp47e_x_periculo>> or <<#iz_bcp47e_x_ambigua>> when applicable.
* Consider using the more specific <<#iz_bcp47e_x_periculo>> or <<#iz_bcp47e_x_ambiguum>> when applicable.

[#iz_bcp47e_x_periculo]
===== `+iz_bcp47e_x_periculo`
Expand Down Expand Up @@ -524,10 +524,13 @@ Note: all language attributes start with `+i`
+i_pt
+i_por
+ig_port1283
+ir_br
+ir_076
+is_latn
+it_en_eng_latn
+ix_ambigua
+iu_1_translatorname
+iw_1_bing
+ix_ambiguum+ix_periculo
+iz_bcp47e_t (long form of +it_, but without break in parts)
+iz_bcp47e_x (long form of +ix_, but without break in parts)
----
Expand All @@ -537,14 +540,15 @@ Note: all language attributes start with `+i`
----
+i_pt
+i_por
+ii_de_linguam
+ii_de_linguam_fontem
+ii_est_linguam_fontem
+ir_076
+ir_br
----

// +izb47_t_en_por_latn
// +izb47_x_ambigua
// +izb47_x_ambiguum

=== Use with not typical linguistic content

Expand Down
62 changes: 41 additions & 21 deletions docs/ontologia-regulam.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<script>

/****** _[eng-Latn] Edit this section [eng-Latn]_ ******/
const debug = false
const debug = true
const hxltm_ontologia_archivum = "./ontologia/cor.hxltm.215.json"
const testum_regula_adhoc = ''
const testum_exemplum_adhoc = [
Expand Down Expand Up @@ -45,6 +45,7 @@
exemplum.forEach(element => {
debug && console.log('actionem_testum item', element)
result = regula_regex.exec(element.hxl)
debug && console.log('actionem_testum groups', result.groups)
Object.entries(element).forEach(([key, value]) => {
if (result.groups[key]) {
if (element[key] !== result.groups[key]) {
Expand All @@ -61,41 +62,60 @@
}
}

/**
 * Simplistic conversion from a Python `re` verbose-mode pattern to an
 * ECMAScript regex source string.
 *
 * Steps, in order:
 *  1. Drop everything from an unescaped `#` to the end of its line
 *     (an escaped `\#` is kept).
 *  2. Remove every whitespace character (verbose-mode layout).
 *  3. Rewrite Python named groups `(?P<name>...)` as `(?<name>...)`.
 *  4. Rewrite Python named backreferences `(?P=name)` as `\k<name>`.
 *
 * Limitations: the conversion is purely textual. An unescaped `#` inside a
 * character class is treated as a comment start, and whitespace is removed
 * even where it is escaped or inside a character class.
 *
 * @param {string} python_regex - Python regex source (verbose style).
 * @returns {string} Pattern source usable with `new RegExp(...)`.
 */
const javascript_regex_de_python_regex = (python_regex) => {
  return python_regex
    .replace(/((?<!\\)[#]).*/g, '') // Strip comments # (but not \#)
    .replace(/\s/g, '') // Strip spaces
    .replace(/\(\?P</g, '(?<') // Python (?P<group>...) -> (?<group>...)
    .replace(/\(\?P=(\w+)\)/g, '\\k<$1>') // Python (?P=group) -> \k<group>
}

/**
 * Entry point: picks the regex to test and the examples to test it against,
 * optionally logs debugging information, runs the test action and then runs
 * a small ad-hoc regex experiment that logs its matches.
 *
 * @param {object} ontologia - Parsed ontology; this function reads
 *   `ontologia.ontologia_regulam.structuram.basim.javascript` and, when
 *   `debug` is truthy, `...basim.python`.
 */
const main = (ontologia) => {
// The ad-hoc rule wins when it is a non-empty string; otherwise the
// JavaScript regex stored in the ontology is used.
const est_regula = testum_regula_adhoc || ontologia.ontologia_regulam.structuram.basim.javascript
// quod_exemplum is defined elsewhere; presumably it selects the example
// list from the ontology or the ad-hoc list — verify against its definition.
const est_exemplum = quod_exemplum(ontologia, testum_exemplum_adhoc)
if (debug) {
// Converted Python regex, logged next to the stored JavaScript one so the
// two can be compared by eye.
const de_python_regex = javascript_regex_de_python_regex(
ontologia.ontologia_regulam.structuram.basim.python)
console.log(`main
est_regula [${est_regula}]
testum_regula_adhoc [${testum_regula_adhoc}]
ontologia_regulam [${ontologia.ontologia_regulam.structuram.basim.javascript}]
... de_python [${de_python_regex}]
`)
console.log('main est_exemplum', est_exemplum)
}

if (debug) {
// Use this to generate the
// ontologia.ontologia_regulam.structuram.basim.javascript
console.log(javascript_regex_de_python_regex(
ontologia.ontologia_regulam.structuram.basim.python))
}
// Run the selected rule against the selected examples.
actionem_testum(est_exemplum, est_regula)

// Scratch area: earlier attempts at a sub-attribute regex, kept commented out.
// var regula2 = '(?<linguam>(i.?_).*(?!<etc>(\\+.?))).?'
// var regula2 = '(?<linguam>(i.?_).*(?<etc>(\\+.?))).?'
// var regula2 = '(?<linguam>(\\+i.?_).*(?<etc>(.?))).?'
// var regula2 = '(?<linguam>(\\+i.?_).*(?<etc>[^\\+i](.?))).?'
// var regula2 = '(?<linguam>(\\+i.?_).*(?<etc>[^\\+i.?](.?))).?'
// var regula2 = '(?<linguam>(\\+i.?_.*))?(?<etcetera>[\\+[^i]].*)?'
// var regula3 = '(?<etcetera>(\\+[^i\w?_].*))'
// NOTE(review): inside a single-quoted string `\w` is just `w`, so the
// character class below is effectively [^iw?_]; write `\\w` if a word-char
// class was intended — confirm.
var regula3 = '(?<linguam>(\\+i.?_.*))?(?<etcetera>(\\+[^i\w?_].*))'
var subspeciem = [
'+i_pt+i_por+ig_port1283+is_latn+rem',
'+i_pt+i_por+ig_port1283+is_latn+exemplo1+exemplo2+exemplo3',
'+ix_de_linguam',
'+ix_est_linguam',
'+ix_est_linguam_fontem',
]
// var regula_regex2 = new RegExp(regula2, "i")
var regula_regex2 = new RegExp(regula3, "i")
// NOTE(review): the statements from here to the second `var regula_regex2`
// repeat the preceding scratch block verbatim (possibly both sides of a
// diff rendered together) — confirm whether the duplicate is intended.
// var regula2 = '(?<linguam>(i.?_).*(?!<etc>(\\+.?))).?'
// var regula2 = '(?<linguam>(i.?_).*(?<etc>(\\+.?))).?'
// var regula2 = '(?<linguam>(\\+i.?_).*(?<etc>(.?))).?'
// var regula2 = '(?<linguam>(\\+i.?_).*(?<etc>[^\\+i](.?))).?'
// var regula2 = '(?<linguam>(\\+i.?_).*(?<etc>[^\\+i.?](.?))).?'
// var regula2 = '(?<linguam>(\\+i.?_.*))?(?<etcetera>[\\+[^i]].*)?'
// var regula3 = '(?<etcetera>(\\+[^i\w?_].*))'
var regula3 = '(?<linguam>(\\+i.?_.*))?(?<etcetera>(\\+[^i\w?_].*))'
var subspeciem = [
'+i_pt+i_por+ig_port1283+is_latn+rem',
'+i_pt+i_por+ig_port1283+is_latn+exemplo1+exemplo2+exemplo3',
'+ix_de_linguam',
'+ix_est_linguam',
'+ix_est_linguam_fontem',
]
// var regula_regex2 = new RegExp(regula2, "i")
var regula_regex2 = new RegExp(regula3, "i")

// Log the raw exec() result (match array or null) for every sample string.
subspeciem.forEach(element => {
console.log(regula_regex2.exec(element))
});
// Same loop again; see the duplication note above.
subspeciem.forEach(element => {
console.log(regula_regex2.exec(element))
});

}

Expand Down
40 changes: 33 additions & 7 deletions ontologia/cor.hxltm.215.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3025,16 +3025,22 @@ ontologia_regulam:
divisionem: '#item'
classem: '+linguam'
# speciem: +i_pt+i_por+ig_port1283+is_latn
- hxl: '#meta+linguam+i_en+i_eng+ig_stan1293+ir_076+is_latn+it_1_pt_por_latn+iu_1_emerrocha+iw_1_bing+ix_ambiguum+ix_periculo+v_linguam_maximum'
# BCP47 extended
# bcp47e: pt-Latn-g-port1283-t-en-latn
divisionem: '#meta'
classem: '+linguam'
# speciem: +i_pt+i_por+ig_port1283+is_latn
# /workspace/git/EticaAI/tico-19-hxltm/scripts/fn/linguacodex.py --de_bcp47_simplex --de_codex g-port1283-aaa-bbb | jq

# Trivia: strūctūram, https://en.wiktionary.org/wiki/structura#Latin
structuram:
# basim -> divisionem, classem, speciem
basim:
# https://regex101.com/r/XUOncM/5
# https://regex101.com/r/Ff27ID/3
# https://regex101.com/r/Ff27ID/4
javascript: >-
(?<divisionem>(#item|#meta))(?<classem>(\+conceptum|\+linguam|\+terminum))((?<linguam_de>(\+ix_de_[a-z_]*))|(?<linguam_est>(\+ix_est_[a-z_]*))|(?<linguam_i2a>(\+i_\w\w))?(?<linguam_i3a>(\+i_\w\w\w))(?<linguam_ig>(\+ig_\w\w\w\w\d\d\d\d))?((?<linguam_s4a>(\+is_\w{3,4})))(?<linguam_it>(\+it_[a-z0-9_]*))?)?(?<etcetera>(\+.*))?(?<datum_vocabularium>(\+v_[a-z_]*))?
(?<divisionem>(\#item|\#meta))(?<classem>(\+conceptum|\+linguam|\+terminum))((?<!conceptum)((?<linguam_implicitum_de>(\+ii_de_linguam[a-z_]*))|(?<linguam_implicitum_est>(\+ii_est_linguam[a-z_]*)))|((?<linguam_iso639_1_a>(\+i_\w\w))?(?<linguam_iso639_3_a>(\+i_\w\w\w))(?<linguam_glotto>(\+ig_[a-z]{4}\d{4}))?((?<linguam_iso3166_2_a>(\+ir_[a-z]{2}))|(?<linguam_iso3166_3_a>(\+ir_[a-z]{3}))|(?<linguam_unm49>(\+ir_[0-9]{3})))?(((?<linguam_iso15924_a>(\+is_[a-z]{4})))|((?<linguam_iso15924_n>(\+is_[0-9]{3}))))((?<linguam_translationem_de_linguam>((\+it_[1,9]{1}_[a-z0-9_]*){1,})))?(((?<linguam_translationem_humanum>((\+iu_[1,9]{1}_[a-z0-9_]*){1,}){1,})))?(((?<linguam_translationem_machinam>((\+iw_[1,9]{1}_[a-z0-9_]*){1,}){1,})))?(((?<linguam_privatum>((\+ix_[a-z0-9]{2,8})){1,})))?))?.*(?<datum_vocabularium>(\+v_[a-z_]*))?
# \#(?<divisionem>(item|meta)).+?(?<classem>(conceptum|linguam|terminum))(?<speciem>.*)
# https://learnbyexample.github.io/py_regular_expressions/groupings-and-backreferences.html
python: |
Expand All @@ -3060,7 +3066,7 @@ ontologia_regulam:
## explicitum est
(
(?P<linguam_iso639_1_a>(\+i_\w\w))?
(?P<linguam_iso639_3_a>(\+i_\w\w\w)) # requisitum!
(?P<linguam_iso639_3_a>(\+i_\w\w\w)) # ISO 639-3 requisitum!
(?P<linguam_glotto>(\+ig_[a-z]{4}\d{4}))?
( # Locum
(?P<linguam_iso3166_2_a>(\+ir_[a-z]{2}))
Expand All @@ -3069,14 +3075,33 @@ ontologia_regulam:
|
(?P<linguam_unm49>(\+ir_[0-9]{3}))
)?
( # scriptum codicem: requisitum!
( # Scriptum codicem: requisitum!
((?P<linguam_iso15924_a>(\+is_[a-z]{4})))
|
((?P<linguam_iso15924_n>(\+is_[0-9]{3})))
)
( # BCP 47 Extension T style, de linguam
(?P<linguam_translationem_de_linguam>((\+it_[1,9]{1}_[a-z0-9_]*){1,}))
)?
( # Human translator, list
((?P<linguam_translationem_humanum>((\+iu_[1,9]{1}_[a-z0-9_]*){1,}){1,}))
)?
# +iv_ not used
( # _machinam translator, list
((?P<linguam_translationem_machinam>((\+iw_[1,9]{1}_[a-z0-9_]*){1,}){1,}))
)?
( # BCP 47 Private attribute
((?P<linguam_privatum>((\+ix_[a-z0-9]{2,8})){1,}))
)?
)
)?
.*
#(?P<etcetera>
# (\+[0-9a-z_]*)
#)?
.* # TODO: remove this
(?P<datum_vocabularium>
(\+v_[a-z_]*)
)?
# subspeciem:
# javascript: >-
# \(?<divisionem>(#item|#meta)).+?(?<classem>(conceptum|linguam|terminum))(?<speciem>.*)
Expand Down Expand Up @@ -3158,12 +3183,13 @@ ontologia_regulam:
python: '(^#item|^#meta)\+terminum'

### Example test cases
#meta+linguam+i_en+i_eng+ig_stan1293+ir_076+is_latn+it_en_por_latn+ib_x_ambigua
#meta+linguam+i_en+i_eng+ig_stan1293+ir_076+is_latn+it_1_pt_por_latn+iu_1_emerrocha+iw_1_bing+ix_ambiguum+ix_periculo+v_linguam_maximum
#meta+linguam+i_en+i_eng+ig_stan1293+ir_076+is_latn+it_1_pt_por_latn+it_2_es_spa_latn+iu_1_emerrocha+ix_ambiguum+ix_periculo
#item+terminum+ii_de_linguam_fontem
#meta+conceptum+i_en+i_eng+is_latn
#item+conceptum+codicem
#meta+linguam+i_en+i_eng+is_latn
#meta+linguam+i_en+i_eng+ig_stan1293+is_latn+it_en_por_latn+ib_x_ambigua
#meta+linguam+i_en+i_eng+ig_stan1293+is_latn+it_1_en_por_latn+ix_ambigua
#meta+linguam+i_en+i_eng+is_215
#item+terminum+ii_est_linguam+v_linguam_maximum
#item+terminum+ii_est_linguam+v_linguam_a
Expand Down

0 comments on commit 810044c

Please sign in to comment.