# phoneme_tokenizer.py
  1. import logging
  2. from pathlib import Path
  3. import re
  4. from typing import Iterable
  5. from typing import List
  6. from typing import Optional
  7. from typing import Union
  8. import warnings
  9. # import g2p_en
  10. import jamo
  11. from typeguard import check_argument_types
  12. from funasr.text.abs_tokenizer import AbsTokenizer
# Names of the supported g2p (grapheme-to-phoneme) backends; used as the set
# of valid values for PhonemeTokenizer's ``g2p_type`` argument.  ``None``
# selects plain space-splitting instead of a g2p model.
g2p_choices = [
    None,
    "g2p_en",
    "g2p_en_no_space",
    "pyopenjtalk",
    "pyopenjtalk_kana",
    "pyopenjtalk_accent",
    "pyopenjtalk_accent_with_pause",
    "pyopenjtalk_prosody",
    "pypinyin_g2p",
    "pypinyin_g2p_phone",
    "espeak_ng_arabic",
    "espeak_ng_german",
    "espeak_ng_french",
    "espeak_ng_spanish",
    "espeak_ng_russian",
    "espeak_ng_greek",
    "espeak_ng_finnish",
    "espeak_ng_hungarian",
    "espeak_ng_dutch",
    "espeak_ng_english_us_vits",
    "espeak_ng_hindi",
    "g2pk",
    "g2pk_no_space",
    "korean_jaso",
    "korean_jaso_no_space",
]
  40. def split_by_space(text) -> List[str]:
  41. if " " in text:
  42. text = text.replace(" ", " <space> ")
  43. return [c.replace("<space>", " ") for c in text.split(" ")]
  44. else:
  45. return text.split(" ")
  46. def pyopenjtalk_g2p(text) -> List[str]:
  47. import pyopenjtalk
  48. # phones is a str object separated by space
  49. phones = pyopenjtalk.g2p(text, kana=False)
  50. phones = phones.split(" ")
  51. return phones
  52. def pyopenjtalk_g2p_accent(text) -> List[str]:
  53. import pyopenjtalk
  54. import re
  55. phones = []
  56. for labels in pyopenjtalk.run_frontend(text)[1]:
  57. p = re.findall(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)", labels)
  58. if len(p) == 1:
  59. phones += [p[0][0], p[0][2], p[0][1]]
  60. return phones
  61. def pyopenjtalk_g2p_accent_with_pause(text) -> List[str]:
  62. import pyopenjtalk
  63. import re
  64. phones = []
  65. for labels in pyopenjtalk.run_frontend(text)[1]:
  66. if labels.split("-")[1].split("+")[0] == "pau":
  67. phones += ["pau"]
  68. continue
  69. p = re.findall(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)", labels)
  70. if len(p) == 1:
  71. phones += [p[0][0], p[0][2], p[0][1]]
  72. return phones
  73. def pyopenjtalk_g2p_kana(text) -> List[str]:
  74. import pyopenjtalk
  75. kanas = pyopenjtalk.g2p(text, kana=True)
  76. return list(kanas)
def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True) -> List[str]:
    """Extract phoneme + prosody symbol sequence from input full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Examples:
        >>> from funasr.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
    """
    import pyopenjtalk

    # NOTE(review): run_frontend(...)[1] assumes the older pyopenjtalk API
    # returning (njd_features, labels) — confirm against the pinned version.
    labels = pyopenjtalk.run_frontend(text)[1]
    N = len(labels)
    phones = []
    for n in range(N):
        lab_curr = labels[n]
        # current phoneme (between "-" and "+" in the HTS full-context label)
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
        # deal unvoiced vowels as normal vowels
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()
        # deal with sil at the beginning and the end of text
        if p3 == "sil":
            assert n == 0 or n == N - 1
            if n == 0:
                # "^" marks the utterance start
                phones.append("^")
            elif n == N - 1:
                # check question form or not ("$" statement, "?" question)
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue
        elif p3 == "pau":
            # "_" marks a pause
            phones.append("_")
            continue
        else:
            phones.append(p3)
        # accent type and position info (forward or backward)
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
        # accent phrase border
        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            phones.append("#")
        # pitch falling
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # pitch rising
        elif a2 == 1 and a2_next == 2:
            phones.append("[")
    return phones
  139. def _numeric_feature_by_regex(regex, s):
  140. match = re.search(regex, s)
  141. if match is None:
  142. return -50
  143. return int(match.group(1))
  144. def pypinyin_g2p(text) -> List[str]:
  145. from pypinyin import pinyin
  146. from pypinyin import Style
  147. phones = [phone[0] for phone in pinyin(text, style=Style.TONE3)]
  148. return phones
  149. def pypinyin_g2p_phone(text) -> List[str]:
  150. from pypinyin import pinyin
  151. from pypinyin import Style
  152. from pypinyin.style._utils import get_finals
  153. from pypinyin.style._utils import get_initials
  154. phones = [
  155. p
  156. for phone in pinyin(text, style=Style.TONE3)
  157. for p in [
  158. get_initials(phone[0], strict=True),
  159. get_finals(phone[0], strict=True),
  160. ]
  161. if len(p) != 0
  162. ]
  163. return phones
  164. class G2p_en:
  165. """On behalf of g2p_en.G2p.
  166. g2p_en.G2p isn't pickalable and it can't be copied to the other processes
  167. via multiprocessing module.
  168. As a workaround, g2p_en.G2p is instantiated upon calling this class.
  169. """
  170. def __init__(self, no_space: bool = False):
  171. self.no_space = no_space
  172. self.g2p = None
  173. def __call__(self, text) -> List[str]:
  174. if self.g2p is None:
  175. self.g2p = g2p_en.G2p()
  176. phones = self.g2p(text)
  177. if self.no_space:
  178. # remove space which represents word serapater
  179. phones = list(filter(lambda s: s != " ", phones))
  180. return phones
  181. class G2pk:
  182. """On behalf of g2pk.G2p.
  183. g2pk.G2p isn't pickalable and it can't be copied to the other processes
  184. via multiprocessing module.
  185. As a workaround, g2pk.G2p is instantiated upon calling this class.
  186. """
  187. def __init__(
  188. self, descritive=False, group_vowels=False, to_syl=False, no_space=False
  189. ):
  190. self.descritive = descritive
  191. self.group_vowels = group_vowels
  192. self.to_syl = to_syl
  193. self.no_space = no_space
  194. self.g2p = None
  195. def __call__(self, text) -> List[str]:
  196. if self.g2p is None:
  197. import g2pk
  198. self.g2p = g2pk.G2p()
  199. phones = list(
  200. self.g2p(
  201. text,
  202. descriptive=self.descritive,
  203. group_vowels=self.group_vowels,
  204. to_syl=self.to_syl,
  205. )
  206. )
  207. if self.no_space:
  208. # remove space which represents word serapater
  209. phones = list(filter(lambda s: s != " ", phones))
  210. return phones
  211. class Jaso:
  212. PUNC = "!'(),-.:;?"
  213. SPACE = " "
  214. JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)])
  215. JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)])
  216. JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)])
  217. VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE
  218. def __init__(self, space_symbol=" ", no_space=False):
  219. self.space_symbol = space_symbol
  220. self.no_space = no_space
  221. def _text_to_jaso(self, line: str) -> List[str]:
  222. jasos = list(jamo.hangul_to_jamo(line))
  223. return jasos
  224. def _remove_non_korean_characters(self, tokens):
  225. new_tokens = [token for token in tokens if token in self.VALID_CHARS]
  226. return new_tokens
  227. def __call__(self, text) -> List[str]:
  228. graphemes = [x for x in self._text_to_jaso(text)]
  229. graphemes = self._remove_non_korean_characters(graphemes)
  230. if self.no_space:
  231. graphemes = list(filter(lambda s: s != " ", graphemes))
  232. else:
  233. graphemes = [x if x != " " else self.space_symbol for x in graphemes]
  234. return graphemes
  235. class Phonemizer:
  236. """Phonemizer module for various languages.
  237. This is wrapper module of https://github.com/bootphon/phonemizer.
  238. You can define various g2p modules by specifying options for phonemizer.
  239. See available options:
  240. https://github.com/bootphon/phonemizer/blob/master/phonemizer/phonemize.py#L32
  241. """
  242. def __init__(
  243. self,
  244. backend,
  245. word_separator: Optional[str] = None,
  246. syllable_separator: Optional[str] = None,
  247. phone_separator: Optional[str] = " ",
  248. strip=False,
  249. split_by_single_token: bool = False,
  250. **phonemizer_kwargs,
  251. ):
  252. # delayed import
  253. from phonemizer.backend import BACKENDS
  254. from phonemizer.separator import Separator
  255. self.separator = Separator(
  256. word=word_separator,
  257. syllable=syllable_separator,
  258. phone=phone_separator,
  259. )
  260. # define logger to suppress the warning in phonemizer
  261. logger = logging.getLogger("phonemizer")
  262. logger.setLevel(logging.ERROR)
  263. self.phonemizer = BACKENDS[backend](
  264. **phonemizer_kwargs,
  265. logger=logger,
  266. )
  267. self.strip = strip
  268. self.split_by_single_token = split_by_single_token
  269. def __call__(self, text) -> List[str]:
  270. tokens = self.phonemizer.phonemize(
  271. [text],
  272. separator=self.separator,
  273. strip=self.strip,
  274. njobs=1,
  275. )[0]
  276. if not self.split_by_single_token:
  277. return tokens.split()
  278. else:
  279. # "a: ab" -> ["a", ":", "<space>", "a", "b"]
  280. # TODO(kan-bayashi): space replacement should be dealt in PhonemeTokenizer
  281. return [c.replace(" ", "<space>") for c in tokens]
  282. class PhonemeTokenizer(AbsTokenizer):
  283. def __init__(
  284. self,
  285. g2p_type: Union[None, str],
  286. non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
  287. space_symbol: str = "<space>",
  288. remove_non_linguistic_symbols: bool = False,
  289. ):
  290. assert check_argument_types()
  291. if g2p_type is None:
  292. self.g2p = split_by_space
  293. elif g2p_type == "g2p_en":
  294. self.g2p = G2p_en(no_space=False)
  295. elif g2p_type == "g2p_en_no_space":
  296. self.g2p = G2p_en(no_space=True)
  297. elif g2p_type == "pyopenjtalk":
  298. self.g2p = pyopenjtalk_g2p
  299. elif g2p_type == "pyopenjtalk_kana":
  300. self.g2p = pyopenjtalk_g2p_kana
  301. elif g2p_type == "pyopenjtalk_accent":
  302. self.g2p = pyopenjtalk_g2p_accent
  303. elif g2p_type == "pyopenjtalk_accent_with_pause":
  304. self.g2p = pyopenjtalk_g2p_accent_with_pause
  305. elif g2p_type == "pyopenjtalk_prosody":
  306. self.g2p = pyopenjtalk_g2p_prosody
  307. elif g2p_type == "pypinyin_g2p":
  308. self.g2p = pypinyin_g2p
  309. elif g2p_type == "pypinyin_g2p_phone":
  310. self.g2p = pypinyin_g2p_phone
  311. elif g2p_type == "espeak_ng_arabic":
  312. self.g2p = Phonemizer(
  313. language="ar",
  314. backend="espeak",
  315. with_stress=True,
  316. preserve_punctuation=True,
  317. )
  318. elif g2p_type == "espeak_ng_german":
  319. self.g2p = Phonemizer(
  320. language="de",
  321. backend="espeak",
  322. with_stress=True,
  323. preserve_punctuation=True,
  324. )
  325. elif g2p_type == "espeak_ng_french":
  326. self.g2p = Phonemizer(
  327. language="fr-fr",
  328. backend="espeak",
  329. with_stress=True,
  330. preserve_punctuation=True,
  331. )
  332. elif g2p_type == "espeak_ng_spanish":
  333. self.g2p = Phonemizer(
  334. language="es",
  335. backend="espeak",
  336. with_stress=True,
  337. preserve_punctuation=True,
  338. )
  339. elif g2p_type == "espeak_ng_russian":
  340. self.g2p = Phonemizer(
  341. language="ru",
  342. backend="espeak",
  343. with_stress=True,
  344. preserve_punctuation=True,
  345. )
  346. elif g2p_type == "espeak_ng_greek":
  347. self.g2p = Phonemizer(
  348. language="el",
  349. backend="espeak",
  350. with_stress=True,
  351. preserve_punctuation=True,
  352. )
  353. elif g2p_type == "espeak_ng_finnish":
  354. self.g2p = Phonemizer(
  355. language="fi",
  356. backend="espeak",
  357. with_stress=True,
  358. preserve_punctuation=True,
  359. )
  360. elif g2p_type == "espeak_ng_hungarian":
  361. self.g2p = Phonemizer(
  362. language="hu",
  363. backend="espeak",
  364. with_stress=True,
  365. preserve_punctuation=True,
  366. )
  367. elif g2p_type == "espeak_ng_dutch":
  368. self.g2p = Phonemizer(
  369. language="nl",
  370. backend="espeak",
  371. with_stress=True,
  372. preserve_punctuation=True,
  373. )
  374. elif g2p_type == "espeak_ng_hindi":
  375. self.g2p = Phonemizer(
  376. language="hi",
  377. backend="espeak",
  378. with_stress=True,
  379. preserve_punctuation=True,
  380. )
  381. elif g2p_type == "g2pk":
  382. self.g2p = G2pk(no_space=False)
  383. elif g2p_type == "g2pk_no_space":
  384. self.g2p = G2pk(no_space=True)
  385. elif g2p_type == "espeak_ng_english_us_vits":
  386. # VITS official implementation-like processing
  387. # Reference: https://github.com/jaywalnut310/vits
  388. self.g2p = Phonemizer(
  389. language="en-us",
  390. backend="espeak",
  391. with_stress=True,
  392. preserve_punctuation=True,
  393. strip=True,
  394. word_separator=" ",
  395. phone_separator="",
  396. split_by_single_token=True,
  397. )
  398. elif g2p_type == "korean_jaso":
  399. self.g2p = Jaso(space_symbol=space_symbol, no_space=False)
  400. elif g2p_type == "korean_jaso_no_space":
  401. self.g2p = Jaso(no_space=True)
  402. else:
  403. raise NotImplementedError(f"Not supported: g2p_type={g2p_type}")
  404. self.g2p_type = g2p_type
  405. self.space_symbol = space_symbol
  406. if non_linguistic_symbols is None:
  407. self.non_linguistic_symbols = set()
  408. elif isinstance(non_linguistic_symbols, (Path, str)):
  409. non_linguistic_symbols = Path(non_linguistic_symbols)
  410. try:
  411. with non_linguistic_symbols.open("r", encoding="utf-8") as f:
  412. self.non_linguistic_symbols = set(line.rstrip() for line in f)
  413. except FileNotFoundError:
  414. warnings.warn(f"{non_linguistic_symbols} doesn't exist.")
  415. self.non_linguistic_symbols = set()
  416. else:
  417. self.non_linguistic_symbols = set(non_linguistic_symbols)
  418. self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
  419. def __repr__(self):
  420. return (
  421. f"{self.__class__.__name__}("
  422. f'g2p_type="{self.g2p_type}", '
  423. f'space_symbol="{self.space_symbol}", '
  424. f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
  425. ")"
  426. )
  427. def text2tokens(self, line: str) -> List[str]:
  428. tokens = []
  429. while len(line) != 0:
  430. for w in self.non_linguistic_symbols:
  431. if line.startswith(w):
  432. if not self.remove_non_linguistic_symbols:
  433. tokens.append(line[: len(w)])
  434. line = line[len(w) :]
  435. break
  436. else:
  437. t = line[0]
  438. tokens.append(t)
  439. line = line[1:]
  440. line = "".join(tokens)
  441. tokens = self.g2p(line)
  442. return tokens
  443. def tokens2text(self, tokens: Iterable[str]) -> str:
  444. # phoneme type is not invertible
  445. return "".join(tokens)