-
Notifications
You must be signed in to change notification settings - Fork 0
/
wilddiac_utils.py
226 lines (165 loc) · 6.63 KB
/
wilddiac_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
from collections import namedtuple
import re
from typing import Iterable, List
import rapidfuzz.distance.Levenshtein as editdistance
from camel_tools.utils.charmap import CharMapper
from camel_tools.utils.charsets import UNICODE_PUNCT_CHARSET
from camel_tools.utils.charsets import AR_LETTERS_CHARSET, AR_DIAC_CHARSET
from camel_tools.utils.charsets import AR_CHARSET
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.disambig.common import ScoredAnalysis
# Public API: names exported by ``from wilddiac_utils import *``.
__all__ = (
'remove_contextual_diac_flags',
'fix_contextual_diacs',
'fix_diac_order',
'normalize_diac',
'OpCount',
'word_to_word_opcount',
'normalize_consonants',
'word_is_oov',
)
# Any Alef Wasla (U+0671), anywhere in the word.
ALL_WASLA_RE = re.compile('\u0671')
# Alef Wasla at the very start of a word.
STARTING_WASLA_RE = re.compile(u'^\u0671')
# Word-initial Alef Wasla optionally followed by one of Fatha/Damma/Kasra/
# Sukun; used to strip that diacritic when the word is mid-context.
MIDDLE_WASLA_WITH_DIAC_RE = re.compile(u'^\u0671[\u064e\u064f\u0650\u0652]?')
# Sukun (U+0652) at the end of a word.
ENDING_SUKUN_RE = re.compile(u'\u0652$')
# Contextual flags produced by the extended analyzer: %m or %n.
CONTEXTUAL_DIACS_FLAG_RE = re.compile('%[mn]')
# %m: third/second masculine plural ending in Meem (see fix_contextual_diacs).
MEEM_FLAG_RE = re.compile('%m')
# %n: the word "min" (see fix_contextual_diacs).
MIN_FLAG_RE = re.compile('%n')
# An Arabic letter immediately followed by an Arabic diacritic at word start.
DIAC_AR_AT_START_RE = re.compile('^[' + ''.join(AR_LETTERS_CHARSET) + '][' +
''.join(AR_DIAC_CHARSET) + ']')
# Entirely Arabic characters, with an optional trailing %m/%n flag.
IS_AR_RE = re.compile('^[' + re.escape(''.join(AR_CHARSET)) + ']+(%[mn])?$')
# Tanween Fatha (U+064B), with optional Shadda on either side, occurring
# before an Alef (U+0627) or Alef Maksoura (U+0649).
FIX_TANWEEN_RE_01 = re.compile('(\u0651?)\u064b(\u0651?)(\u0627|\u0649)')
# Alef or Alef Maksoura followed by a Tanween Fatha.
FIX_TANWEEN_RE_02 = re.compile('(\u0627|\u0649)\u064b')
# A diacritic cluster: optional short vowel/tanween (U+064B-U+0650), Shadda,
# Sukun, dagger Alif — captured separately so they can be reordered.
FIX_DIAC_ORDER_RE = re.compile('([\u064b-\u0650]?)(\u0651?)(\u0652?)(\u0670?)')
def is_ar_word(word: str) -> bool:
    """Check whether a word consists entirely of Arabic characters
    (optionally carrying a trailing %m/%n contextual flag).

    Returns True on a full match, False otherwise.
    """
    return bool(IS_AR_RE.match(word))
def is_punct(token: str) -> bool:
    """Check whether every character of a token is a punctuation/symbolic
    character.

    Returns True when all characters qualify (trivially True for an empty
    token), False otherwise.
    """
    return all(ch in UNICODE_PUNCT_CHARSET for ch in token)
def remove_contextual_diac_flags(word: str) -> str:
    """Strip the %m/%n contextual flags generated by our extended
    Morphological Analyzer and map every Alef Wasla to a plain Alef.

    Non-Arabic words are returned unchanged.
    """
    if not is_ar_word(word):
        return word
    stripped = CONTEXTUAL_DIACS_FLAG_RE.sub('', word)
    return ALL_WASLA_RE.sub('\u0627', stripped)
def fix_contextual_diacs(diacs: List[str]) -> List[str]:
    """Perform contextual diacritization fixes on a given sentence.

    ``diacs`` is the sentence as a list of diacritized words, possibly
    carrying %m/%n flags from the extended analyzer. The list is mutated
    in place and also returned. Fixes applied per word: strip the
    diacritic from a non-context-initial Alif Wasla, resolve a
    word-final Sukun before a following Alif Wasla (Damma for %m, the
    following word's diacritic for %n, Kasra by default), and convert
    word-initial Alif Wasla to Alif.
    """
    n = len(diacs)
    for i in range(n):
        # If a word is not an Arabic word do nothing.
        if not is_ar_word(diacs[i]):
            continue
        # Determine if a word is the start of a context: first word of
        # the sentence, or preceded by punctuation.
        is_context_beginning = True
        if i != 0 and not is_punct(diacs[i - 1]):
            is_context_beginning = False
        # If a word starts with a diacritized Alif Wasla in the middle of a
        # context, remove the diacritic.
        if not is_context_beginning:
            diacs[i] = MIDDLE_WASLA_WITH_DIAC_RE.sub(u'\u0671', diacs[i])
        # Determine if the next word begins with an Alif Wasla.
        # NOTE(review): diacs[i + 1][0] raises IndexError if the next
        # entry is an empty string — presumably words are never empty;
        # confirm against callers.
        is_followed_by_wasla = False
        if i < (n - 1) and diacs[i + 1][0] == u'\u0671':
            is_followed_by_wasla = True
        # If the current word ends with a Sukun and is followed by an
        # Alif Wasla, determine the appropriate replacement for the Sukun.
        # Use Kasra by default.
        replacement = u'\u0650'  # Kasra
        # If the word contains a marker indicating third masculine plural
        # or second masculine plural ending with a Meem, use a Damma.
        # (The flag is stripped even when no Sukun replacement happens.)
        if MEEM_FLAG_RE.search(diacs[i]) is not None:
            diacs[i] = MEEM_FLAG_RE.sub('', diacs[i])
            replacement = u'\u064f'
        # If the word is "min" then we use the diacritic on the following
        # Alif Wasla.
        elif MIN_FLAG_RE.search(diacs[i]) is not None:
            diacs[i] = MIN_FLAG_RE.sub('', diacs[i])
            if (i < (n - 1) and
                    DIAC_AR_AT_START_RE.match(diacs[i+1]) is not None):
                replacement = diacs[i + 1][1]  # Take wasla diac
        # If the current word ends with a Sukun and is followed by an
        # Alif Wasla, replace Sukun with our chosen replacement.
        if is_followed_by_wasla:
            if diacs[i][-1] == u'\u0652':
                diacs[i] = ENDING_SUKUN_RE.sub(replacement, diacs[i])
        # Convert all starting Alif Waslas to an Alif.
        diacs[i] = STARTING_WASLA_RE.sub(u'\u0627', diacs[i])
    return diacs
def fix_tanween_alef(word: str) -> str:
    """Move a Tanween Fatha that precedes an Alef (or Alef Maksoura) so
    that it follows the letter instead, preserving any adjacent Shadda.
    """
    def _reorder(match):
        # Shadda pieces and the Alef keep their order; Tanween goes last.
        return (match.group(1) + match.group(2) + match.group(3) +
                u'\u064b')
    return FIX_TANWEEN_RE_01.sub(_reorder, word)
def fix_tanween_alef_before(word: str) -> str:
    """Move a Tanween Fatha following an Alef (or Alef Maksoura) to come
    before it.

    Inverse reordering of ``fix_tanween_alef``. The return annotation was
    added for consistency with the other word-level helpers.
    """
    # \g<1> is the Alef/Alef Maksoura; the Tanween (U+064B) is emitted first.
    return FIX_TANWEEN_RE_02.sub('\u064b\\g<1>', word)
def fix_diac_order(word: str) -> str:
    """Reorder the diacritics of a word into a more natural order
    (Shadda before the short vowel/tanween, then Sukun, then dagger Alif).
    """
    normalized = normalize_unicode(word)
    shifted = fix_tanween_alef_before(normalized)
    # Swap groups 1 and 2 of each diacritic cluster: Shadda first.
    return FIX_DIAC_ORDER_RE.sub('\\g<2>\\g<1>\\g<3>\\g<4>', shifted)
def normalize_diac(word: str) -> str:
    """Normalize the diacritics of a given word to a form suitable for
    comparison.

    Applies Unicode normalization, moves any pre-Alef Tanween Fatha to
    follow the Alef, then normalizes again. The parameter annotation was
    added for consistency with the other word-level helpers.
    """
    word_norm = normalize_unicode(word)
    word_norm = fix_tanween_alef(word_norm)
    # NOTE(review): the second normalization pass mirrors the original
    # code; presumably it canonicalizes sequences produced by the
    # Tanween move — confirm it is required.
    word_norm = normalize_unicode(word_norm)
    return word_norm
OpCount = namedtuple('OpCount', ['noop', 'insert', 'substitute', 'delete'])
def word_to_word_opcount(src: str, trg: str) -> OpCount:
    """Count the Levenshtein edit operations (no-ops, insertions,
    substitutions, deletions) between a source and a target word.

    Each opcode run reported by the aligner counts once, whatever its
    length.
    """
    tally = {'equal': 0, 'insert': 0, 'replace': 0, 'delete': 0}
    for opcode in editdistance.opcodes(src, trg):
        if opcode.tag in tally:
            tally[opcode.tag] += 1
    return OpCount(tally['equal'], tally['insert'],
                   tally['replace'], tally['delete'])
"""Normalizes all Alif, Yaa/Alif Maksoura, and Haa/Taa Marbouta forms.
"""
normalize_consonants = CharMapper({
u'\u0625': u'\u0627',
u'\u0623': u'\u0627',
u'\u0622': u'\u0627',
u'\u0671': u'\u0627',
u'\u0649': u'\u064a',
u'\u0629': u'\u0647',
u'\u0640': u''
})
def word_is_oov(analyses: Iterable[ScoredAnalysis]) -> bool:
    """Returns True if either there are no analyses or all analyses are
    backoff analyses, returns False otherwise.

    Fixes:
    - An empty ``analyses`` now yields True as documented (the previous
      implementation returned False, contradicting its own docstring —
      a word with no analyses at all is out of vocabulary).
    - No ``len()`` call, so any ``Iterable`` (including generators) is
      accepted, matching the declared parameter type.
    """
    for analysis in analyses:
        # A single analysis from a real (non-backoff) source means the
        # word is known.
        if analysis.analysis['source'] != 'backoff':
            return False
    return True