# build_tokenizer.py
from pathlib import Path
from typing import Iterable
from typing import Union

from funasr.text.abs_tokenizer import AbsTokenizer
from funasr.text.char_tokenizer import CharTokenizer
from funasr.text.phoneme_tokenizer import PhonemeTokenizer
from funasr.text.sentencepiece_tokenizer import SentencepiecesTokenizer
from funasr.text.word_tokenizer import WordTokenizer
  9. def build_tokenizer(
  10. token_type: str,
  11. bpemodel: Union[Path, str, Iterable[str]] = None,
  12. non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
  13. remove_non_linguistic_symbols: bool = False,
  14. space_symbol: str = "<space>",
  15. delimiter: str = None,
  16. g2p_type: str = None,
  17. ) -> AbsTokenizer:
  18. """A helper function to instantiate Tokenizer"""
  19. if token_type == "bpe":
  20. if bpemodel is None:
  21. raise ValueError('bpemodel is required if token_type = "bpe"')
  22. if remove_non_linguistic_symbols:
  23. raise RuntimeError(
  24. "remove_non_linguistic_symbols is not implemented for token_type=bpe"
  25. )
  26. return SentencepiecesTokenizer(bpemodel)
  27. elif token_type == "word":
  28. if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
  29. return WordTokenizer(
  30. delimiter=delimiter,
  31. non_linguistic_symbols=non_linguistic_symbols,
  32. remove_non_linguistic_symbols=True,
  33. )
  34. else:
  35. return WordTokenizer(delimiter=delimiter)
  36. elif token_type == "char":
  37. return CharTokenizer(
  38. non_linguistic_symbols=non_linguistic_symbols,
  39. space_symbol=space_symbol,
  40. remove_non_linguistic_symbols=remove_non_linguistic_symbols,
  41. )
  42. elif token_type == "phn":
  43. return PhonemeTokenizer(
  44. g2p_type=g2p_type,
  45. non_linguistic_symbols=non_linguistic_symbols,
  46. space_symbol=space_symbol,
  47. remove_non_linguistic_symbols=remove_non_linguistic_symbols,
  48. )
  49. else:
  50. raise ValueError(
  51. f"token_mode must be one of bpe, word, char or phn: " f"{token_type}"
  52. )