# sentencepiece_tokenizer.py
  1. from pathlib import Path
  2. from typing import Iterable
  3. from typing import List
  4. from typing import Union
  5. import sentencepiece as spm
  6. from typeguard import check_argument_types
  7. from funasr.text.abs_tokenizer import AbsTokenizer
  8. class SentencepiecesTokenizer(AbsTokenizer):
  9. def __init__(self, model: Union[Path, str]):
  10. assert check_argument_types()
  11. self.model = str(model)
  12. # NOTE(kamo):
  13. # Don't build SentencePieceProcessor in __init__()
  14. # because it's not picklable and it may cause following error,
  15. # "TypeError: can't pickle SwigPyObject objects",
  16. # when giving it as argument of "multiprocessing.Process()".
  17. self.sp = None
  18. def __repr__(self):
  19. return f'{self.__class__.__name__}(model="{self.model}")'
  20. def _build_sentence_piece_processor(self):
  21. # Build SentencePieceProcessor lazily.
  22. if self.sp is None:
  23. self.sp = spm.SentencePieceProcessor()
  24. self.sp.load(self.model)
  25. def text2tokens(self, line: str) -> List[str]:
  26. self._build_sentence_piece_processor()
  27. return self.sp.EncodeAsPieces(line)
  28. def tokens2text(self, tokens: Iterable[str]) -> str:
  29. self._build_sentence_piece_processor()
  30. return self.sp.DecodePieces(list(tokens))