# korean_cleaner.py
# Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean
  2. import re
  3. class KoreanCleaner:
  4. @classmethod
  5. def _normalize_numbers(cls, text):
  6. number_to_kor = {
  7. "0": "영",
  8. "1": "일",
  9. "2": "이",
  10. "3": "삼",
  11. "4": "사",
  12. "5": "오",
  13. "6": "육",
  14. "7": "칠",
  15. "8": "팔",
  16. "9": "구",
  17. }
  18. new_text = "".join(
  19. number_to_kor[char] if char in number_to_kor.keys() else char
  20. for char in text
  21. )
  22. return new_text
  23. @classmethod
  24. def _normalize_english_text(cls, text):
  25. upper_alphabet_to_kor = {
  26. "A": "에이",
  27. "B": "비",
  28. "C": "씨",
  29. "D": "디",
  30. "E": "이",
  31. "F": "에프",
  32. "G": "지",
  33. "H": "에이치",
  34. "I": "아이",
  35. "J": "제이",
  36. "K": "케이",
  37. "L": "엘",
  38. "M": "엠",
  39. "N": "엔",
  40. "O": "오",
  41. "P": "피",
  42. "Q": "큐",
  43. "R": "알",
  44. "S": "에스",
  45. "T": "티",
  46. "U": "유",
  47. "V": "브이",
  48. "W": "더블유",
  49. "X": "엑스",
  50. "Y": "와이",
  51. "Z": "지",
  52. }
  53. new_text = re.sub("[a-z]+", lambda x: str.upper(x.group()), text)
  54. new_text = "".join(
  55. upper_alphabet_to_kor[char]
  56. if char in upper_alphabet_to_kor.keys()
  57. else char
  58. for char in new_text
  59. )
  60. return new_text
  61. @classmethod
  62. def normalize_text(cls, text):
  63. # stage 0 : text strip
  64. text = text.strip()
  65. # stage 1 : normalize numbers
  66. text = cls._normalize_numbers(text)
  67. # stage 2 : normalize english text
  68. text = cls._normalize_english_text(text)
  69. return text