# funtoken.py

from abc import ABC
from abc import abstractmethod
from pathlib import Path
from typing import Dict
from typing import Iterable
from typing import List
from typing import Optional
from typing import Union

import numpy as np

from funasr.tokenizer.abs_tokenizer import AbsTokenizer
from funasr.tokenizer.char_tokenizer import CharTokenizer
from funasr.tokenizer.phoneme_tokenizer import PhonemeTokenizer
from funasr.tokenizer.sentencepiece_tokenizer import SentencepiecesTokenizer
from funasr.tokenizer.word_tokenizer import WordTokenizer


  19. def build_tokenizer(
  20. token_type: str,
  21. bpemodel: Union[Path, str, Iterable[str]] = None,
  22. non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
  23. remove_non_linguistic_symbols: bool = False,
  24. space_symbol: str = "<space>",
  25. delimiter: str = None,
  26. g2p_type: str = None,
  27. **kwargs,
  28. ):
  29. """A helper function to instantiate Tokenizer"""
  30. # import pdb;
  31. # pdb.set_trace()
  32. if token_type == "bpe":
  33. if bpemodel is None:
  34. raise ValueError('bpemodel is required if token_type = "bpe"')
  35. if remove_non_linguistic_symbols:
  36. raise RuntimeError(
  37. "remove_non_linguistic_symbols is not implemented for token_type=bpe"
  38. )
  39. return SentencepiecesTokenizer(bpemodel, **kwargs)
  40. elif token_type == "word":
  41. if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
  42. return WordTokenizer(
  43. delimiter=delimiter,
  44. non_linguistic_symbols=non_linguistic_symbols,
  45. remove_non_linguistic_symbols=True,
  46. )
  47. else:
  48. return WordTokenizer(delimiter=delimiter, **kwargs)
  49. elif token_type == "char":
  50. return CharTokenizer(
  51. non_linguistic_symbols=non_linguistic_symbols,
  52. space_symbol=space_symbol,
  53. remove_non_linguistic_symbols=remove_non_linguistic_symbols,
  54. **kwargs
  55. )
  56. elif token_type == "phn":
  57. return PhonemeTokenizer(
  58. g2p_type=g2p_type,
  59. non_linguistic_symbols=non_linguistic_symbols,
  60. space_symbol=space_symbol,
  61. remove_non_linguistic_symbols=remove_non_linguistic_symbols,
  62. **kwargs
  63. )
  64. else:
  65. raise ValueError(
  66. f"token_mode must be one of bpe, word, char or phn: " f"{token_type}"
  67. )