| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- # Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean
- import re
- class KoreanCleaner:
- @classmethod
- def _normalize_numbers(cls, text):
- number_to_kor = {
- "0": "영",
- "1": "일",
- "2": "이",
- "3": "삼",
- "4": "사",
- "5": "오",
- "6": "육",
- "7": "칠",
- "8": "팔",
- "9": "구",
- }
- new_text = "".join(
- number_to_kor[char] if char in number_to_kor.keys() else char
- for char in text
- )
- return new_text
- @classmethod
- def _normalize_english_text(cls, text):
- upper_alphabet_to_kor = {
- "A": "에이",
- "B": "비",
- "C": "씨",
- "D": "디",
- "E": "이",
- "F": "에프",
- "G": "지",
- "H": "에이치",
- "I": "아이",
- "J": "제이",
- "K": "케이",
- "L": "엘",
- "M": "엠",
- "N": "엔",
- "O": "오",
- "P": "피",
- "Q": "큐",
- "R": "알",
- "S": "에스",
- "T": "티",
- "U": "유",
- "V": "브이",
- "W": "더블유",
- "X": "엑스",
- "Y": "와이",
- "Z": "지",
- }
- new_text = re.sub("[a-z]+", lambda x: str.upper(x.group()), text)
- new_text = "".join(
- upper_alphabet_to_kor[char]
- if char in upper_alphabet_to_kor.keys()
- else char
- for char in new_text
- )
- return new_text
- @classmethod
- def normalize_text(cls, text):
- # stage 0 : text strip
- text = text.strip()
- # stage 1 : normalize numbers
- text = cls._normalize_numbers(text)
- # stage 2 : normalize english text
- text = cls._normalize_english_text(text)
- return text
|