date.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. import pynini
  2. from fun_text_processing.text_normalization.de.utils import get_abs_path, load_labels
  3. from fun_text_processing.text_normalization.en.graph_utils import (
  4. DAMO_NOT_QUOTE,
  5. DAMO_SIGMA,
  6. GraphFst,
  7. delete_preserve_order,
  8. )
  9. from pynini.lib import pynutil
  10. class DateFst(GraphFst):
  11. """
  12. Finite state transducer for verbalizing date, e.g.
  13. date { day: "vier" month: "april" year: "zwei tausend zwei" } -> "vierter april zwei tausend zwei"
  14. date { day: "vier" month: "mai" year: "zwei tausend zwei" } -> "vierter mai zwei tausend zwei"
  15. Args:
  16. ordinal: ordinal verbalizer GraphFst
  17. deterministic: if True will provide a single transduction option,
  18. for False multiple transduction are generated (used for audio-based normalization)
  19. """
  20. def __init__(self, ordinal: GraphFst, deterministic: bool = True):
  21. super().__init__(name="date", kind="verbalize", deterministic=deterministic)
  22. day_cardinal = pynutil.delete("day: \"") + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete("\"")
  23. day = day_cardinal @ pynini.cdrewrite(ordinal.ordinal_stem, "", "[EOS]", DAMO_SIGMA) + pynutil.insert("ter")
  24. months_names = pynini.union(*[x[1] for x in load_labels(get_abs_path("data/months/abbr_to_name.tsv"))])
  25. month = pynutil.delete("month: \"") + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete("\"")
  26. final_month = month @ months_names
  27. final_month |= month @ pynini.difference(DAMO_SIGMA, months_names) @ pynini.cdrewrite(
  28. ordinal.ordinal_stem, "", "[EOS]", DAMO_SIGMA
  29. ) + pynutil.insert("ter")
  30. year = pynutil.delete("year: \"") + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete("\"")
  31. # day month year
  32. graph_dmy = day + pynini.accep(" ") + final_month + pynini.closure(pynini.accep(" ") + year, 0, 1)
  33. graph_dmy |= final_month + pynini.accep(" ") + year
  34. self.graph = graph_dmy | year
  35. final_graph = self.graph + delete_preserve_order
  36. delete_tokens = self.delete_tokens(final_graph)
  37. self.fst = delete_tokens.optimize()