Skip to content

Commit

Permalink
Add extractor configurations for Amharic
Browse files Browse the repository at this point in the history
  • Loading branch information
Meti-Adane committed Aug 21, 2024
1 parent 2074df4 commit fc4e376
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ object DisambiguationExtractorConfig
// For "ar" and "he" configurations, rendering right-to-left may seem like a bug, but it's not.
// Don't change this unless you know what you're doing.
val disambiguationTitlePartMap = Map(
"am" -> " (መንታ)",
"ar" -> " (توضيح)",
"bg" -> " (пояснение)",
"ca" -> " (desambiguació)",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@ object GenderExtractorConfig
val pronounsMap = Map(
"en" -> Map("she" -> "female", "her" -> "female", "he" -> "male", "his" -> "male", "him" -> "male", "herself" -> "female", "himself" -> "male",
"She" -> "female", "Her" -> "female", "He" -> "male", "His" -> "male", "Him" -> "male", "Herself" -> "female", "Himself" -> "male" //TODO why not just do case insensitive matches?
),
"am" -> Map(
"እሷ" -> "ሴት",
"እሷን" -> "ሴት",
"የሷ" -> "ሴት",
"እራሷን" -> "ሴት",
"እራሷ" -> "ሴት",
"እሱ" -> "ወንድ",
"እሱን" -> "ወንድ",
"የእሱ" -> "ወንድ",
"የራሱ" -> "ወንድ",
"እራሱ" -> "ወንድ",
"እራሱን" -> "ወንድ"
),
"pt" -> Map ("ela"-> "mulher", "dela" -> "mulher", "ele" -> "homem", "dele" -> "homem", "nela" -> "mulher", "nele" -> "homem",
"Ela"-> "mulher", "Dela" -> "mulher", "Ele" -> "homem", "Dele" -> "homem", "Nela" -> "mulher", "Nele" -> "homem"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,23 @@ object HomepageExtractorConfig
// Don't change this unless you know how it is done.

private val propertyNamesMap = Map(
"am" -> Set(
"ድህረገፅ",
"ድህረ_ገፅ",
"ገጽ",
"ድህረ ገጽ",
"ድህረ_ገጽ",
"ድረ_ገፅ",
"ድረገፅ",
"ድረገጽ",
"ድረ ገጽ",
"ድረ_ገጽ",
"ዋና_ገጽ",
"ዌብሳይት",
"website",
"web",
"site"
),
"ar" -> Set("الموقع", "الصفحة الرسمية", "موقع", "الصفحة الرئيسية", "صفحة ويب", "موقع ويب"),
"bg" -> Set("сайт", "уебсайт"),
"ca" -> Set("pàgina", "web", "lloc"),
Expand Down Expand Up @@ -38,6 +55,7 @@ object HomepageExtractorConfig
val supportedLanguages = propertyNamesMap.keySet

private val externalLinkSectionsMap = Map(
"am" -> "(?:የውጭ ንባብ|የውጭ ማያያዣ)",
"ar" -> "وصلات خارجية",
"bg" -> "Външни препратки",
"ca" -> "(?:Enllaços externs|Enllaço extern)",
Expand Down Expand Up @@ -65,6 +83,7 @@ object HomepageExtractorConfig
}

private val officialMap = Map(
"am" -> "ዋና",
"ar" -> "رسمي",
"bg" -> "официален",
"ca" -> "oficial",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ object ImageExtractorConfig
// Don't change this unless you know how it is done.
val NonFreeRegex = Map(
"ar" -> """(?i)\{\{\s?غير حر""".r,
"am" ->"""(?i)\{\{\s?(non-free|Logo|Screenshot|Noncommercial|ነፃ_ያልሆነ)""".r,
"bg" ->"""(?i)\{\{\s?non-free""".r,
"de" -> """(?iu)\{\{\s?(Dateiüberprüfung/benachrichtigt_\(Kategorie\)|Geschützt|Geschützt-Ungeklärt|Bild-LogoSH|Bild-PD-alt-100|Bild-PD-alt-1923|Bild-WikimediaCopyright)\s?\}\}""".r ,
"el" -> """(?iu)\{\{\s?(εύλογη χρήση|σήμα|σήμα αθλητικού σωματείου|αφίσα ταινίας|σκηνή από ταινία|γραφικά υπολογιστή|εξώφυλλο άλμπουμ|εξώφυλλο βιβλίου|μη ελεύθερο έργο τέχνης|σελίδα κόμικς|σελίδα εφημερίδας|εικόνα-βιντεοπαιχνίδι|ιδιοκτησία Wikimedia)\s?\}\}""".r ,
Expand All @@ -29,9 +30,9 @@ object ImageExtractorConfig
"ru" -> """(?iu)\{\{\s?(CopyrightByWikimedia|Fairuse|несвободный файл|несвободная лицензия|запрещенная лицензия)\s?\}\}""".r
)

val flagRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(flag|banner|pavillon|drapeau|bandera|pabellón|bandiera|флаг)([^\w]*|[_\s]+)""".r
val mapRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(map|karte|location|position|carte|carta|lage)([^\w]*|[_\s]+)""".r
val signatureRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(signature|unterschrift)""".r
val flagRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(flag|banner|pavillon|drapeau|bandera|pabellón|bandiera|флаг|ባንዲራ|ሰንደቅ_ዓላማ)([^\w]*|[_\s]+)""".r
val mapRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(map|karte|location|position|carte|carta|lage|ካርታ)([^\w]*|[_\s]+)""".r
val signatureRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(signature|unterschrift|ፊርማ)""".r
val cOARegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(coat_of_arms|emblem|crest|wappen|grandes_armes|blason|armoiries)([^\w]*|[_\s]+)""".r


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ object InfoboxExtractorConfig

val ignoreProperties = Map (
"en"-> Set("image", "image_photo", "map"),
"am"-> Set("ምስል", "ፎቶ", "ስዕል", "ካርታ", "አርማ"),
"ar"-> Set("صورة"),
"id"-> Set("foto", "gambar"),
"el"-> Set("εικόνα", "εικονα", "Εικόνα", "Εικονα", "χάρτης", "Χάρτης"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ object TopicalConceptsExtractorConfig
val catMainTemplates = Set(
"مزيد" ,// ar
"Infocat", "Infocatm", // ca
"Catmore", // el,ja
"Catmore", // el,ja,am
"Cat main", // en
"AP", // es
"Nagusia", // eu
Expand Down

0 comments on commit fc4e376

Please sign in to comment.