# build_tokenizer.py
from pathlib import Path
from typing import Iterable
from typing import Optional
from typing import Union

from typeguard import check_argument_types

from funasr.text.abs_tokenizer import AbsTokenizer
from funasr.text.char_tokenizer import CharTokenizer
from funasr.text.phoneme_tokenizer import PhonemeTokenizer
from funasr.text.sentencepiece_tokenizer import SentencepiecesTokenizer
from funasr.text.word_tokenizer import WordTokenizer
  10. def build_tokenizer(
  11. token_type: str,
  12. bpemodel: Union[Path, str, Iterable[str]] = None,
  13. non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
  14. remove_non_linguistic_symbols: bool = False,
  15. space_symbol: str = "<space>",
  16. delimiter: str = None,
  17. g2p_type: str = None,
  18. ) -> AbsTokenizer:
  19. """A helper function to instantiate Tokenizer"""
  20. assert check_argument_types()
  21. if token_type == "bpe":
  22. if bpemodel is None:
  23. raise ValueError('bpemodel is required if token_type = "bpe"')
  24. if remove_non_linguistic_symbols:
  25. raise RuntimeError(
  26. "remove_non_linguistic_symbols is not implemented for token_type=bpe"
  27. )
  28. return SentencepiecesTokenizer(bpemodel)
  29. elif token_type == "word":
  30. if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
  31. return WordTokenizer(
  32. delimiter=delimiter,
  33. non_linguistic_symbols=non_linguistic_symbols,
  34. remove_non_linguistic_symbols=True,
  35. )
  36. else:
  37. return WordTokenizer(delimiter=delimiter)
  38. elif token_type == "char":
  39. return CharTokenizer(
  40. non_linguistic_symbols=non_linguistic_symbols,
  41. space_symbol=space_symbol,
  42. remove_non_linguistic_symbols=remove_non_linguistic_symbols,
  43. )
  44. elif token_type == "phn":
  45. return PhonemeTokenizer(
  46. g2p_type=g2p_type,
  47. non_linguistic_symbols=non_linguistic_symbols,
  48. space_symbol=space_symbol,
  49. remove_non_linguistic_symbols=remove_non_linguistic_symbols,
  50. )
  51. else:
  52. raise ValueError(
  53. f"token_mode must be one of bpe, word, char or phn: " f"{token_type}"
  54. )