From 061567fca3ae4af7de9f9ea3a220da6318bdedcd Mon Sep 17 00:00:00 2001
From: CjangCjengh <101577701+CjangCjengh@users.noreply.github.com>
Date: Fri, 30 Sep 2022 23:31:13 +0800
Subject: [PATCH] support English

---
 text/cleaners.py |  72 ++++++++++++++++++++
 text/english.py  | 171 +++++++++++++++++++++++++++++++++++++++++++++++
 text/japanese.py |  25 +++++++
 text/korean.py   |   5 ++
 text/mandarin.py |  83 ++++++++++++++++++++++-
 5 files changed, 355 insertions(+), 1 deletion(-)
 create mode 100644 text/english.py

diff --git a/text/cleaners.py b/text/cleaners.py
index 15c5cc1..4dfd361 100644
--- a/text/cleaners.py
+++ b/text/cleaners.py
@@ -65,10 +65,12 @@ def cjks_cleaners(text):
     from text.japanese import japanese_to_ipa
     from text.korean import korean_to_lazy_ipa
     from text.sanskrit import devanagari_to_ipa
+    from text.english import english_to_lazy_ipa
     chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
     japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
     korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
     sanskrit_texts = re.findall(r'\[SA\].*?\[SA\]', text)
+    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
     for chinese_text in chinese_texts:
         cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
         text = text.replace(chinese_text, cleaned_text+' ', 1)
@@ -81,7 +83,77 @@ def cjks_cleaners(text):
     for sanskrit_text in sanskrit_texts:
         cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
         text = text.replace(sanskrit_text, cleaned_text+' ', 1)
+    for english_text in english_texts:
+        cleaned_text = english_to_lazy_ipa(english_text[4:-4])
+        text = text.replace(english_text, cleaned_text+' ', 1)
     text = text[:-1]
     if re.match(r'[^\.,!\?\-…~]', text[-1]):
         text += '.'
     return text
+
+
+def cjke_cleaners(text):
+    from text.mandarin import chinese_to_lazy_ipa
+    from text.japanese import japanese_to_ipa
+    from text.korean import korean_to_ipa
+    from text.english import english_to_ipa2
+    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
+    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
+    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
+    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
+    for chinese_text in chinese_texts:
+        cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
+        cleaned_text = cleaned_text.replace(
+            'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
+        text = text.replace(chinese_text, cleaned_text+' ', 1)
+    for japanese_text in japanese_texts:
+        cleaned_text = japanese_to_ipa(japanese_text[4:-4])
+        cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
+            'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
+        text = text.replace(japanese_text, cleaned_text+' ', 1)
+    for korean_text in korean_texts:
+        cleaned_text = korean_to_ipa(korean_text[4:-4])
+        text = text.replace(korean_text, cleaned_text+' ', 1)
+    for english_text in english_texts:
+        cleaned_text = english_to_ipa2(english_text[4:-4])
+        cleaned_text = cleaned_text.replace('ɑ', 'a').replace(
+            'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
+        text = text.replace(english_text, cleaned_text+' ', 1)
+    text = text[:-1]
+    if re.match(r'[^\.,!\?\-…~]', text[-1]):
+        text += '.'
+    return text
+
+
+def cjke_cleaners2(text):
+    from text.mandarin import chinese_to_ipa
+    from text.japanese import japanese_to_ipa2
+    from text.korean import korean_to_ipa
+    from text.english import english_to_ipa2
+    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
+    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
+    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
+    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
+    for chinese_text in chinese_texts:
+        cleaned_text = chinese_to_ipa(chinese_text[4:-4])
+        text = text.replace(chinese_text, cleaned_text+' ', 1)
+    for japanese_text in japanese_texts:
+        cleaned_text = japanese_to_ipa2(japanese_text[4:-4])
+        text = text.replace(japanese_text, cleaned_text+' ', 1)
+    for korean_text in korean_texts:
+        cleaned_text = korean_to_ipa(korean_text[4:-4])
+        text = text.replace(korean_text, cleaned_text+' ', 1)
+    for english_text in english_texts:
+        cleaned_text = english_to_ipa2(english_text[4:-4])
+        text = text.replace(english_text, cleaned_text+' ', 1)
+    text = text[:-1]
+    if re.match(r'[^\.,!\?\-…~]', text[-1]):
+        text += '.'
+    return text
+
+
+def thai_cleaners(text):
+    from text.thai import num_to_thai, latin_to_thai
+    text = num_to_thai(text)
+    text = latin_to_thai(text)
+    return text
diff --git a/text/english.py b/text/english.py
new file mode 100644
index 0000000..4de565e
--- /dev/null
+++ b/text/english.py
@@ -0,0 +1,171 @@
+""" from https://github.com/keithito/tacotron """
+
+'''
+Cleaners are transformations that run over the input text at both training and eval time.
+
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+  1. "english_cleaners" for English text
+  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+     the symbols in symbols.py to match your data).
+'''
+
+
+# Regular expression matching whitespace:
+
+
+import re
+import inflect
+from unidecode import unidecode
+import eng_to_ipa as ipa
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+
+
+# List of (ipa, lazy ipa) pairs:
+_lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('r', 'ɹ'),
+    ('æ', 'e'),
+    ('ɑ', 'a'),
+    ('ɔ', 'o'),
+    ('ð', 'z'),
+    ('θ', 's'),
+    ('ɛ', 'e'),
+    ('ɪ', 'i'),
+    ('ʊ', 'u'),
+    ('ʒ', 'ʥ'),
+    ('ʤ', 'ʥ'),
+    ('ˈ', '↓'),
+]]
+
+# List of (ipa, ipa2) pairs
+_ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('r', 'ɹ'),
+    ('ʤ', 'dʒ'),
+    ('ʧ', 'tʃ')
+]]
+
+
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def collapse_whitespace(text):
+    return re.sub(r'\s+', ' ', text)
+
+
+def _remove_commas(m):
+    return m.group(1).replace(',', '')
+
+
+def _expand_decimal_point(m):
+    return m.group(1).replace('.', ' point ')
+
+
+def _expand_dollars(m):
+    match = m.group(1)
+    parts = match.split('.')
+    if len(parts) > 2:
+        return match + ' dollars'  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+    elif dollars:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        return '%s %s' % (dollars, dollar_unit)
+    elif cents:
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s' % (cents, cent_unit)
+    else:
+        return 'zero dollars'
+
+
+def _expand_ordinal(m):
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return 'two thousand'
+        elif num > 2000 and num < 2010:
+            return 'two thousand ' + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + ' hundred'
+        else:
+            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+    else:
+        return _inflect.number_to_words(num, andword='')
+
+
+def normalize_numbers(text):
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r'\1 pounds', text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text
+
+
+def mark_dark_l(text):
+    return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)
+
+
+def english_to_ipa(text):
+    text = unidecode(text).lower()
+    text = expand_abbreviations(text)
+    text = normalize_numbers(text)
+    phonemes = ipa.convert(text)
+    phonemes = collapse_whitespace(phonemes)
+    return phonemes
+
+
+def english_to_lazy_ipa(text):
+    text = english_to_ipa(text)
+    for regex, replacement in _lazy_ipa:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def english_to_ipa2(text):
+    text = english_to_ipa(text)
+    text = mark_dark_l(text)
+    for regex, replacement in _ipa_to_ipa2:
+        text = re.sub(regex, replacement, text)
+    return text.replace('...', '…')
diff --git a/text/japanese.py b/text/japanese.py
index 50ebf71..30e7d11 100644
--- a/text/japanese.py
+++ b/text/japanese.py
@@ -32,6 +32,22 @@
     ('r', 'ɾ')
 ]]
 
+# List of (romaji, ipa2) pairs for marks:
+_romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('u', 'ɯ'),
+    ('ʧ', 'tʃ'),
+    ('j', 'dʑ'),
+    ('y', 'j'),
+    ('ni', 'n^i'),
+    ('nj', 'n^'),
+    ('hi', 'çi'),
+    ('hj', 'ç'),
+    ('f', 'ɸ'),
+    ('I', 'i*'),
+    ('U', 'ɯ*'),
+    ('r', 'ɾ')
+]]
+
 # Dictinary of (consonant, sokuon) pairs:
 _real_sokuon = {
     'k': 'k#',
@@ -129,3 +145,12 @@ def japanese_to_ipa(text):
     text = get_real_sokuon(text)
     text = get_real_hatsuon(text)
     return text
+
+
+def japanese_to_ipa2(text):
+    text=japanese_to_romaji_with_accent(text).replace('...', '…')
+    for regex, replacement in _romaji_to_ipa2:
+        text = re.sub(regex, replacement, text)
+    text = get_real_sokuon(text)
+    text = get_real_hatsuon(text)
+    return text
diff --git a/text/korean.py b/text/korean.py
index e964dd7..edee074 100644
--- a/text/korean.py
+++ b/text/korean.py
@@ -203,3 +203,8 @@ def korean_to_lazy_ipa(text):
     for regex, replacement in _ipa_to_lazy_ipa:
         text = re.sub(regex, replacement, text)
     return text
+
+
+def korean_to_ipa(text):
+    text = korean_to_lazy_ipa(text)
+    return text.replace('ʧ','tʃ').replace('ʥ','dʑ')
diff --git a/text/mandarin.py b/text/mandarin.py
index c1e062a..97a0187 100644
--- a/text/mandarin.py
+++ b/text/mandarin.py
@@ -101,7 +101,6 @@
     ('—', '-')
 ]]
 
-
 # List of (romaji, ipa) pairs:
 _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
     ('ʃy', 'ʃ'),
@@ -113,6 +112,68 @@
     ('h', 'x')
 ]]
 
+# List of (bopomofo, ipa) pairs:
+_bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('ㄅㄛ', 'p⁼wo'),
+    ('ㄆㄛ', 'pʰwo'),
+    ('ㄇㄛ', 'mwo'),
+    ('ㄈㄛ', 'fwo'),
+    ('ㄅ', 'p⁼'),
+    ('ㄆ', 'pʰ'),
+    ('ㄇ', 'm'),
+    ('ㄈ', 'f'),
+    ('ㄉ', 't⁼'),
+    ('ㄊ', 'tʰ'),
+    ('ㄋ', 'n'),
+    ('ㄌ', 'l'),
+    ('ㄍ', 'k⁼'),
+    ('ㄎ', 'kʰ'),
+    ('ㄏ', 'x'),
+    ('ㄐ', 'tʃ⁼'),
+    ('ㄑ', 'tʃʰ'),
+    ('ㄒ', 'ʃ'),
+    ('ㄓ', 'ts`⁼'),
+    ('ㄔ', 'ts`ʰ'),
+    ('ㄕ', 's`'),
+    ('ㄖ', 'ɹ`'),
+    ('ㄗ', 'ts⁼'),
+    ('ㄘ', 'tsʰ'),
+    ('ㄙ', 's'),
+    ('ㄚ', 'a'),
+    ('ㄛ', 'o'),
+    ('ㄜ', 'ə'),
+    ('ㄝ', 'ɛ'),
+    ('ㄞ', 'aɪ'),
+    ('ㄟ', 'eɪ'),
+    ('ㄠ', 'ɑʊ'),
+    ('ㄡ', 'oʊ'),
+    ('ㄧㄢ', 'jɛn'),
+    ('ㄩㄢ', 'ɥæn'),
+    ('ㄢ', 'an'),
+    ('ㄧㄣ', 'in'),
+    ('ㄩㄣ', 'ɥn'),
+    ('ㄣ', 'ən'),
+    ('ㄤ', 'ɑŋ'),
+    ('ㄧㄥ', 'iŋ'),
+    ('ㄨㄥ', 'ʊŋ'),
+    ('ㄩㄥ', 'jʊŋ'),
+    ('ㄥ', 'əŋ'),
+    ('ㄦ', 'əɻ'),
+    ('ㄧ', 'i'),
+    ('ㄨ', 'u'),
+    ('ㄩ', 'ɥ'),
+    ('ˉ', '→'),
+    ('ˊ', '↑'),
+    ('ˇ', '↓↑'),
+    ('ˋ', '↓'),
+    ('˙', ''),
+    (',', ','),
+    ('。', '.'),
+    ('!', '!'),
+    ('?', '?'),
+    ('—', '-')
+]]
+
 
 def number_to_chinese(text):
     numbers = re.findall(r'\d+(?:\.?\d+)?', text)
@@ -151,6 +212,12 @@ def bopomofo_to_romaji(text):
     return text
 
 
+def bopomofo_to_ipa(text):
+    for regex, replacement in _bopomofo_to_ipa:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
 def chinese_to_romaji(text):
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
@@ -170,3 +237,17 @@ def chinese_to_lazy_ipa(text):
     for regex, replacement in _romaji_to_ipa:
         text = re.sub(regex, replacement, text)
     return text
+
+
+def chinese_to_ipa(text):
+    text = number_to_chinese(text)
+    text = chinese_to_bopomofo(text)
+    text = latin_to_bopomofo(text)
+    text = bopomofo_to_ipa(text)
+    text = re.sub('i[aoe]', lambda x: 'y'+x.group(0)[1:], text)
+    text = re.sub('u[aoəe]', lambda x: 'w'+x.group(0)[1:], text)
+    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', lambda x: x.group(1) +
+                  'ɹ`'+x.group(2), text).replace('ɻ', 'ɹ`')
+    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)',
+                  lambda x: x.group(1)+'ɹ'+x.group(2), text)
+    return text
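
Usage note (illustrative only, not part of the patch): the sketch below shows how
the new [EN] support is expected to be called. It assumes the repository's text
package is importable and that the language-specific dependencies of all four
converters are installed, since cjke_cleaners2 imports the Chinese, Japanese,
Korean and English converters unconditionally; english_to_ipa2 on its own only
needs the modules imported by text/english.py (eng_to_ipa, inflect, unidecode).

    from text.cleaners import cjke_cleaners2
    from text.english import english_to_ipa2

    # Each language span is wrapped in paired tags; the cleaner strips the tags
    # (the [4:-4] slices) and replaces the span with its IPA transcription.
    print(cjke_cleaners2('[EN]Hello, Dr. Smith! It costs $2.50.[EN]'))

    # The English converter can also be called on its own:
    print(english_to_ipa2('Hello, Dr. Smith! It costs $2.50.'))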