electronic.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. import pynini
  2. from fun_text_processing.text_normalization.de.utils import get_abs_path
  3. from fun_text_processing.text_normalization.en.graph_utils import (
  4. DAMO_NOT_QUOTE,
  5. DAMO_SIGMA,
  6. GraphFst,
  7. delete_preserve_order,
  8. insert_space,
  9. )
  10. from pynini.lib import pynutil
  11. class ElectronicFst(GraphFst):
  12. """
  13. Finite state transducer for verbalizing electronic
  14. e.g. electronic { username: "abc" domain: "hotmail.com" } -> "a b c at hotmail punkt com"
  15. -> "a b c at h o t m a i l punkt c o m"
  16. -> "a b c at hotmail punkt c o m"
  17. -> "a b c at h o t m a i l punkt com"
  18. Args:
  19. deterministic: if True will provide a single transduction option,
  20. for False multiple transduction are generated (used for audio-based normalization)
  21. """
  22. def __init__(self, deterministic: bool = True):
  23. super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)
  24. graph_digit_no_zero = pynini.invert(
  25. pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
  26. ).optimize() | pynini.cross("1", "eins")
  27. graph_zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv"))).optimize()
  28. graph_digit = graph_digit_no_zero | graph_zero
  29. graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize()
  30. server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
  31. domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
  32. def add_space_after_char():
  33. return pynini.closure(DAMO_NOT_QUOTE - pynini.accep(" ") + insert_space) + (
  34. DAMO_NOT_QUOTE - pynini.accep(" ")
  35. )
  36. verbalize_characters = pynini.cdrewrite(graph_symbols | graph_digit, "", "", DAMO_SIGMA)
  37. user_name = pynutil.delete("username: \"") + add_space_after_char() + pynutil.delete("\"")
  38. user_name @= verbalize_characters
  39. convert_defaults = pynutil.add_weight(DAMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common
  40. domain = convert_defaults + pynini.closure(insert_space + convert_defaults)
  41. domain @= verbalize_characters
  42. domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"")
  43. protocol = (
  44. pynutil.delete("protocol: \"")
  45. + add_space_after_char() @ pynini.cdrewrite(graph_symbols, "", "", DAMO_SIGMA)
  46. + pynutil.delete("\"")
  47. )
  48. self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | (
  49. user_name + pynini.accep(" ") + pynutil.insert("at ") + domain
  50. )
  51. delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
  52. self.fst = delete_tokens.optimize()