postprocessor.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. import pynini
  2. from fun_text_processing.text_normalization.zh.graph_utils import (
  3. FUN_ALPHA,
  4. FUN_DIGIT,
  5. FUN_PUNCT,
  6. FUN_SIGMA,
  7. FUN_WHITE_SPACE,
  8. GraphFst,
  9. )
  10. from fun_text_processing.text_normalization.zh.utils import get_abs_path
  11. from pynini.lib import pynutil, utf8
  12. class PostProcessor(GraphFst):
  13. '''
  14. Postprocessing of TN, now contains:
  15. 1. punctuation removal
  16. 2. letter case conversion
  17. 3. oov tagger
  18. '''
  19. def __init__(
  20. self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False,
  21. ):
  22. super().__init__(name="PostProcessor", kind="processor")
  23. graph = pynini.cdrewrite('', '', '', FUN_SIGMA)
  24. if remove_puncts:
  25. remove_puncts_graph = pynutil.delete(
  26. pynini.union(FUN_PUNCT, pynini.string_file(get_abs_path('data/char/punctuations_zh.tsv')))
  27. )
  28. graph @= pynini.cdrewrite(remove_puncts_graph, "", "", FUN_SIGMA).optimize()
  29. if to_upper or to_lower:
  30. if to_upper:
  31. conv_cases_graph = pynini.inverse(pynini.string_file(get_abs_path('data/char/upper_to_lower.tsv')))
  32. else:
  33. conv_cases_graph = pynini.string_file(get_abs_path('data/char/upper_to_lower.tsv'))
  34. graph @= pynini.cdrewrite(conv_cases_graph, "", "", FUN_SIGMA).optimize()
  35. if tag_oov:
  36. zh_charset_std = pynini.string_file(get_abs_path("data/char/charset_national_standard_2013_8105.tsv"))
  37. zh_charset_ext = pynini.string_file(get_abs_path("data/char/charset_extension.tsv"))
  38. zh_charset = (
  39. zh_charset_std | zh_charset_ext | pynini.string_file(get_abs_path("data/char/punctuations_zh.tsv"))
  40. )
  41. en_charset = FUN_DIGIT | FUN_ALPHA | FUN_PUNCT | FUN_WHITE_SPACE
  42. charset = zh_charset | en_charset
  43. with open(get_abs_path("data/char/oov_tags.tsv"), "r") as f:
  44. tags = f.readline().strip().split('\t')
  45. assert len(tags) == 2
  46. ltag, rtag = tags
  47. oov_charset = pynini.difference(utf8.VALID_UTF8_CHAR, charset)
  48. tag_oov_graph = pynutil.insert(ltag) + oov_charset + pynutil.insert(rtag)
  49. graph @= pynini.cdrewrite(tag_oov_graph, "", "", FUN_SIGMA).optimize()
  50. self.fst = graph.optimize()