diff --git a/text/cleaners.py b/text/cleaners.py
index 15c5cc1..4dfd361 100644
--- a/text/cleaners.py
+++ b/text/cleaners.py
@@ -65,10 +65,12 @@ def cjks_cleaners(text):
     from text.japanese import japanese_to_ipa
     from text.korean import korean_to_lazy_ipa
     from text.sanskrit import devanagari_to_ipa
+    from text.english import english_to_lazy_ipa
     chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
     japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
     korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
     sanskrit_texts = re.findall(r'\[SA\].*?\[SA\]', text)
+    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
     for chinese_text in chinese_texts:
         cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
         text = text.replace(chinese_text, cleaned_text+' ', 1)
@@ -81,7 +83,92 @@ def cjks_cleaners(text):
     for sanskrit_text in sanskrit_texts:
         cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
         text = text.replace(sanskrit_text, cleaned_text+' ', 1)
+    for english_text in english_texts:
+        cleaned_text = english_to_lazy_ipa(english_text[4:-4])
+        text = text.replace(english_text, cleaned_text+' ', 1)
     text = text[:-1]
     if re.match(r'[^\.,!\?\-…~]', text[-1]):
         text += '.'
     return text
+
+
+def cjke_cleaners(text):
+    """Convert [ZH]/[JA]/[KO]/[EN]-tagged spans of text to IPA.
+
+    Every ``[XX]...[XX]`` span is replaced by its phonemes plus a space.
+    Trailing whitespace is then removed and a '.' is appended unless the
+    result is empty or already ends in one of ``.,!?-…~``.
+    """
+    from text.mandarin import chinese_to_lazy_ipa
+    from text.japanese import japanese_to_ipa
+    from text.korean import korean_to_ipa
+    from text.english import english_to_ipa2
+    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
+    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
+    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
+    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
+    for chinese_text in chinese_texts:
+        cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
+        cleaned_text = cleaned_text.replace(
+            'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
+        text = text.replace(chinese_text, cleaned_text+' ', 1)
+    for japanese_text in japanese_texts:
+        cleaned_text = japanese_to_ipa(japanese_text[4:-4])
+        cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
+            'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
+        text = text.replace(japanese_text, cleaned_text+' ', 1)
+    for korean_text in korean_texts:
+        cleaned_text = korean_to_ipa(korean_text[4:-4])
+        text = text.replace(korean_text, cleaned_text+' ', 1)
+    for english_text in english_texts:
+        cleaned_text = english_to_ipa2(english_text[4:-4])
+        cleaned_text = cleaned_text.replace('ɑ', 'a').replace(
+            'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
+        text = text.replace(english_text, cleaned_text+' ', 1)
+    # Not text[:-1]: that drops a real final character when the input does
+    # not end in a tagged span, and text[-1] then raises on an empty string.
+    text = text.rstrip()
+    if text and text[-1] not in '.,!?-…~':
+        text += '.'
+    return text
+
+
+def cjke_cleaners2(text):
+    """Variant of cjke_cleaners built on chinese_to_ipa and japanese_to_ipa2.
+
+    No extra symbol substitutions are applied to the converter output;
+    tagged-span handling and final punctuation match cjke_cleaners.
+    """
+    from text.mandarin import chinese_to_ipa
+    from text.japanese import japanese_to_ipa2
+    from text.korean import korean_to_ipa
+    from text.english import english_to_ipa2
+    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
+    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
+    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
+    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
+    for chinese_text in chinese_texts:
+        cleaned_text = chinese_to_ipa(chinese_text[4:-4])
+        text = text.replace(chinese_text, cleaned_text+' ', 1)
+    for japanese_text in japanese_texts:
+        cleaned_text = japanese_to_ipa2(japanese_text[4:-4])
+        text = text.replace(japanese_text, cleaned_text+' ', 1)
+    for korean_text in korean_texts:
+        cleaned_text = korean_to_ipa(korean_text[4:-4])
+        text = text.replace(korean_text, cleaned_text+' ', 1)
+    for english_text in english_texts:
+        cleaned_text = english_to_ipa2(english_text[4:-4])
+        text = text.replace(english_text, cleaned_text+' ', 1)
+    # rstrip() instead of text[:-1]: safe on empty text, keeps a real last char.
+    text = text.rstrip()
+    if text and text[-1] not in '.,!?-…~':
+        text += '.'
+    return text
+
+
+def thai_cleaners(text):
+    """Apply num_to_thai, then latin_to_thai, to text."""
+    from text.thai import num_to_thai, latin_to_thai
+    text = num_to_thai(text)
+    text = latin_to_thai(text)
+    return text
diff --git a/text/english.py b/text/english.py
new file mode 100644
index 0000000..4de565e
--- /dev/null
+++ b/text/english.py
@@ -0,0 +1,172 @@
+"""English text normalization and IPA conversion.
+
+Parts of this module are from https://github.com/keithito/tacotron
+"""
+
+import re
+
+import inflect
+from unidecode import unidecode
+import eng_to_ipa as ipa
+
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+
+
+# List of (ipa, lazy ipa) pairs:
+_lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('r', 'ɹ'),
+    ('æ', 'e'),
+    ('ɑ', 'a'),
+    ('ɔ', 'o'),
+    ('ð', 'z'),
+    ('θ', 's'),
+    ('ɛ', 'e'),
+    ('ɪ', 'i'),
+    ('ʊ', 'u'),
+    ('ʒ', 'ʥ'),
+    ('ʤ', 'ʥ'),
+    ('ˈ', '↓'),
+]]
+
+# List of (ipa, ipa2) pairs
+_ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('r', 'ɹ'),
+    ('ʤ', 'dʒ'),
+    ('ʧ', 'tʃ')
+]]
+
+
+def expand_abbreviations(text):
+    """Replace each abbreviation in _abbreviations with its expansion."""
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def collapse_whitespace(text):
+    """Replace every run of whitespace in text with a single space."""
+    return re.sub(r'\s+', ' ', text)
+
+
+def _remove_commas(m):
+    """Return the first group of match m with its commas removed."""
+    return m.group(1).replace(',', '')
+
+
+def _expand_decimal_point(m):
+    """Return the first group of match m with '.' spelled as ' point '."""
+    return m.group(1).replace('.', ' point ')
+
+
+def _expand_dollars(m):
+    """Rewrite the dollar amount in match m with 'dollar(s)'/'cent(s)' units."""
+    match = m.group(1)
+    parts = match.split('.')
+    if len(parts) > 2:
+        return match + ' dollars'  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+    elif dollars:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        return '%s %s' % (dollars, dollar_unit)
+    elif cents:
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s' % (cents, cent_unit)
+    else:
+        return 'zero dollars'
+
+
+def _expand_ordinal(m):
+    """Return the ordinal in match m (e.g. '3rd') as words, via inflect."""
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+    """Return the integer in match m as words; 1001-2999 get special readings."""
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return 'two thousand'
+        elif num > 2000 and num < 2010:
+            return 'two thousand ' + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + ' hundred'
+        else:
+            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+    else:
+        return _inflect.number_to_words(num, andword='')
+
+
+def normalize_numbers(text):
+    """Rewrite numbers, currency amounts, decimals and ordinals in text as words."""
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r'\1 pounds', text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text
+
+
+def mark_dark_l(text):
+    """Turn 'l' into 'ɫ' when no vowel follows before the next space or the end."""
+    return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)
+
+
+def english_to_ipa(text):
+    """Lowercase, expand and convert English text to IPA with eng_to_ipa."""
+    text = unidecode(text).lower()
+    text = expand_abbreviations(text)
+    text = normalize_numbers(text)
+    phonemes = ipa.convert(text)
+    phonemes = collapse_whitespace(phonemes)
+    return phonemes
+
+
+def english_to_lazy_ipa(text):
+    """Convert English text to IPA, then apply the _lazy_ipa substitutions."""
+    text = english_to_ipa(text)
+    for regex, replacement in _lazy_ipa:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def english_to_ipa2(text):
+    """Convert English text to IPA, mark dark l and apply _ipa_to_ipa2."""
+    text = english_to_ipa(text)
+    text = mark_dark_l(text)
+    for regex, replacement in _ipa_to_ipa2:
+        text = re.sub(regex, replacement, text)
+    return text.replace('...', '…')
diff --git a/text/japanese.py b/text/japanese.py
index 50ebf71..30e7d11 100644
--- a/text/japanese.py
+++ b/text/japanese.py
@@ -32,6 +32,22 @@
     ('r', 'ɾ')
 ]]
 
+# List of (romaji, ipa2) pairs for marks:
+_romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('u', 'ɯ'),
+    ('ʧ', 'tʃ'),
+    ('j', 'dʑ'),
+    ('y', 'j'),
+    ('ni', 'n^i'),
+    ('nj', 'n^'),
+    ('hi', 'çi'),
+    ('hj', 'ç'),
+    ('f', 'ɸ'),
+    ('I', 'i*'),
+    ('U', 'ɯ*'),
+    ('r', 'ɾ')
+]]
+
 # Dictinary of (consonant, sokuon) pairs:
 _real_sokuon = {
     'k': 'k#',
@@ -129,3 +145,13 @@ def japanese_to_ipa(text):
     text = get_real_sokuon(text)
     text = get_real_hatsuon(text)
     return text
+
+
+def japanese_to_ipa2(text):
+    """Convert Japanese text to IPA using the _romaji_to_ipa2 table."""
+    text = japanese_to_romaji_with_accent(text).replace('...', '…')
+    for regex, replacement in _romaji_to_ipa2:
+        text = re.sub(regex, replacement, text)
+    text = get_real_sokuon(text)
+    text = get_real_hatsuon(text)
+    return text
diff --git a/text/korean.py b/text/korean.py
index e964dd7..edee074 100644
--- a/text/korean.py
+++ b/text/korean.py
@@ -203,3 +203,9 @@ def korean_to_lazy_ipa(text):
     for regex, replacement in _ipa_to_lazy_ipa:
         text = re.sub(regex, replacement, text)
     return text
+
+
+def korean_to_ipa(text):
+    """Convert Korean text to lazy IPA, then rewrite 'ʧ' as 'tʃ' and 'ʥ' as 'dʑ'."""
+    text = korean_to_lazy_ipa(text)
+    return text.replace('ʧ', 'tʃ').replace('ʥ', 'dʑ')
diff --git a/text/mandarin.py b/text/mandarin.py
index c1e062a..97a0187 100644
--- a/text/mandarin.py
+++ b/text/mandarin.py
@@ -101,7 +101,6 @@
     ('—', '-')
 ]]
 
-
 # List of (romaji, ipa) pairs:
 _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
     ('ʃy', 'ʃ'),
@@ -113,6 +112,69 @@
     ('h', 'x')
 ]]
 
+# List of (bopomofo, ipa) pairs:
+_bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('ㄅㄛ', 'p⁼wo'),
+    ('ㄆㄛ', 'pʰwo'),
+    ('ㄇㄛ', 'mwo'),
+    ('ㄈㄛ', 'fwo'),
+    ('ㄅ', 'p⁼'),
+    ('ㄆ', 'pʰ'),
+    ('ㄇ', 'm'),
+    ('ㄈ', 'f'),
+    ('ㄉ', 't⁼'),
+    ('ㄊ', 'tʰ'),
+    ('ㄋ', 'n'),
+    ('ㄌ', 'l'),
+    ('ㄍ', 'k⁼'),
+    ('ㄎ', 'kʰ'),
+    ('ㄏ', 'x'),
+    ('ㄐ', 'tʃ⁼'),
+    ('ㄑ', 'tʃʰ'),
+    ('ㄒ', 'ʃ'),
+    ('ㄓ', 'ts`⁼'),
+    ('ㄔ', 'ts`ʰ'),
+    ('ㄕ', 's`'),
+    ('ㄖ', 'ɹ`'),
+    ('ㄗ', 'ts⁼'),
+    ('ㄘ', 'tsʰ'),
+    ('ㄙ', 's'),
+    ('ㄚ', 'a'),
+    ('ㄛ', 'o'),
+    ('ㄜ', 'ə'),
+    ('ㄝ', 'ɛ'),
+    ('ㄞ', 'aɪ'),
+    ('ㄟ', 'eɪ'),
+    ('ㄠ', 'ɑʊ'),
+    ('ㄡ', 'oʊ'),
+    ('ㄧㄢ', 'jɛn'),
+    ('ㄩㄢ', 'ɥæn'),
+    ('ㄢ', 'an'),
+    ('ㄧㄣ', 'in'),
+    ('ㄩㄣ', 'ɥn'),
+    ('ㄣ', 'ən'),
+    ('ㄤ', 'ɑŋ'),
+    ('ㄧㄥ', 'iŋ'),
+    ('ㄨㄥ', 'ʊŋ'),
+    ('ㄩㄥ', 'jʊŋ'),
+    ('ㄥ', 'əŋ'),
+    ('ㄦ', 'əɻ'),
+    ('ㄧ', 'i'),
+    ('ㄨ', 'u'),
+    ('ㄩ', 'ɥ'),
+    ('ˉ', '→'),
+    ('ˊ', '↑'),
+    ('ˇ', '↓↑'),
+    ('ˋ', '↓'),
+    ('˙', ''),
+    # Keys below are full-width (U+FF0C, U+FF01, U+FF1F); ASCII '?' is not a valid regex.
+    (',', ','),
+    ('。', '.'),
+    ('!', '!'),
+    ('?', '?'),
+    ('—', '-')
+]]
+
 
 def number_to_chinese(text):
     numbers = re.findall(r'\d+(?:\.?\d+)?', text)
@@ -151,6 +213,13 @@ def bopomofo_to_romaji(text):
     return text
 
 
+def bopomofo_to_ipa(text):
+    """Apply every (pattern, replacement) pair of _bopomofo_to_ipa to text, in order."""
+    for regex, replacement in _bopomofo_to_ipa:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
 def chinese_to_romaji(text):
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
@@ -170,3 +239,18 @@ def chinese_to_lazy_ipa(text):
     for regex, replacement in _romaji_to_ipa:
         text = re.sub(regex, replacement, text)
     return text
+
+
+def chinese_to_ipa(text):
+    """Convert Chinese text to IPA by way of bopomofo."""
+    text = number_to_chinese(text)
+    text = chinese_to_bopomofo(text)
+    text = latin_to_bopomofo(text)
+    text = bopomofo_to_ipa(text)
+    text = re.sub('i[aoe]', lambda x: 'y'+x.group(0)[1:], text)
+    text = re.sub('u[aoəe]', lambda x: 'w'+x.group(0)[1:], text)
+    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', lambda x: x.group(1) +
+                  'ɹ`'+x.group(2), text).replace('ɻ', 'ɹ`')
+    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)',
+                  lambda x: x.group(1)+'ɹ'+x.group(2), text)
+    return text