time.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. import pynini
  2. from fun_text_processing.text_normalization.de.utils import get_abs_path, load_labels
  3. from fun_text_processing.text_normalization.en.graph_utils import (
  4. DAMO_DIGIT,
  5. DAMO_SIGMA,
  6. GraphFst,
  7. convert_space,
  8. delete_preserve_order,
  9. )
  10. from pynini.lib import pynutil
  11. class TimeFst(GraphFst):
  12. """
  13. Finite state transducer for verbalizing electronic, e.g.
  14. time { hours: "2" minutes: "15"} -> "zwei uhr fünfzehn"
  15. time { minutes: "15" hours: "2" } -> "viertel nach zwei"
  16. time { minutes: "15" hours: "2" } -> "fünfzehn nach zwei"
  17. time { hours: "14" minutes: "15"} -> "vierzehn uhr fünfzehn"
  18. time { minutes: "15" hours: "14" } -> "viertel nach zwei"
  19. time { minutes: "15" hours: "14" } -> "fünfzehn nach drei"
  20. time { minutes: "45" hours: "14" } -> "viertel vor drei"
  21. Args:
  22. cardinal_tagger: cardinal_tagger tagger GraphFst
  23. deterministic: if True will provide a single transduction option,
  24. for False multiple transduction are generated (used for audio-based normalization)
  25. """
  26. def __init__(self, cardinal_tagger: GraphFst, deterministic: bool = True):
  27. super().__init__(name="time", kind="verbalize", deterministic=deterministic)
  28. # add weight so when using inverse text normalization this conversion is depriotized
  29. night_to_early = pynutil.add_weight(
  30. pynini.invert(pynini.string_file(get_abs_path("data/time/hour_to_night.tsv"))).optimize(), weight=0.0001
  31. )
  32. hour_to = pynini.invert(pynini.string_file(get_abs_path("data/time/hour_to.tsv"))).optimize()
  33. minute_to = pynini.invert(pynini.string_file(get_abs_path("data/time/minute_to.tsv"))).optimize()
  34. time_zone_graph = pynini.invert(
  35. convert_space(pynini.union(*[x[1] for x in load_labels(get_abs_path("data/time/time_zone.tsv"))]))
  36. )
  37. graph_zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv"))).optimize()
  38. number_verbalization = graph_zero | cardinal_tagger.two_digit_non_zero
  39. hour = pynutil.delete("hours: \"") + pynini.closure(DAMO_DIGIT, 1) + pynutil.delete("\"")
  40. hour_verbalized = hour @ number_verbalization @ pynini.cdrewrite(
  41. pynini.cross("eins", "ein"), "[BOS]", "[EOS]", DAMO_SIGMA
  42. ) + pynutil.insert(" uhr")
  43. minute = pynutil.delete("minutes: \"") + pynini.closure(DAMO_DIGIT, 1) + pynutil.delete("\"")
  44. zone = pynutil.delete("zone: \"") + time_zone_graph + pynutil.delete("\"")
  45. optional_zone = pynini.closure(pynini.accep(" ") + zone, 0, 1)
  46. second = pynutil.delete("seconds: \"") + pynini.closure(DAMO_DIGIT, 1) + pynutil.delete("\"")
  47. graph_hms = (
  48. hour_verbalized
  49. + pynini.accep(" ")
  50. + minute @ number_verbalization
  51. + pynutil.insert(" minuten")
  52. + pynini.accep(" ")
  53. + second @ number_verbalization
  54. + pynutil.insert(" sekunden")
  55. + optional_zone
  56. )
  57. graph_hms @= pynini.cdrewrite(
  58. pynini.cross("eins minuten", "eine minute") | pynini.cross("eins sekunden", "eine sekunde"),
  59. pynini.union(" ", "[BOS]"),
  60. "",
  61. DAMO_SIGMA,
  62. )
  63. min_30 = [str(x) for x in range(1, 31)]
  64. min_30 = pynini.union(*min_30)
  65. min_29 = [str(x) for x in range(1, 30)]
  66. min_29 = pynini.union(*min_29)
  67. graph_h = hour_verbalized
  68. graph_hm = hour_verbalized + pynini.accep(" ") + minute @ number_verbalization
  69. graph_m_past_h = (
  70. minute @ min_30 @ (number_verbalization | pynini.cross("15", "viertel"))
  71. + pynini.accep(" ")
  72. + pynutil.insert("nach ")
  73. # + hour @ number_verbalization
  74. + hour @ pynini.cdrewrite(night_to_early, "[BOS]", "[EOS]", DAMO_SIGMA) @ number_verbalization
  75. )
  76. graph_m30_h = (
  77. minute @ pynini.cross("30", "halb")
  78. + pynini.accep(" ")
  79. + hour @ pynini.cdrewrite(night_to_early, "[BOS]", "[EOS]", DAMO_SIGMA) @ hour_to @ number_verbalization
  80. )
  81. graph_m_to_h = (
  82. minute @ minute_to @ min_29 @ (number_verbalization | pynini.cross("15", "viertel"))
  83. + pynini.accep(" ")
  84. + pynutil.insert("vor ")
  85. + hour @ pynini.cdrewrite(night_to_early, "[BOS]", "[EOS]", DAMO_SIGMA) @ hour_to @ number_verbalization
  86. )
  87. self.graph = (
  88. graph_hms
  89. | graph_h
  90. | graph_hm
  91. | pynutil.add_weight(graph_m_past_h, weight=0.0001)
  92. | pynutil.add_weight(graph_m30_h, weight=0.0001)
  93. | pynutil.add_weight(graph_m_to_h, weight=0.0001)
  94. ) + optional_zone
  95. delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
  96. self.fst = delete_tokens.optimize()