measure.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940
  1. import pynini
  2. from fun_text_processing.text_normalization.en.graph_utils import (
  3. DAMO_NOT_QUOTE,
  4. GraphFst,
  5. delete_extra_space,
  6. delete_preserve_order,
  7. )
  8. from pynini.lib import pynutil
  9. class MeasureFst(GraphFst):
  10. """
  11. Finite state transducer for verbalizing measure, e.g.
  12. measure { cardinal { integer: "zwei" units: "unzen" } } -> "zwei unzen"
  13. measure { cardinal { integer_part: "zwei" quantity: "millionen" units: "unzen" } } -> "zwei millionen unzen"
  14. Args:
  15. decimal: decimal GraphFst
  16. cardinal: cardinal GraphFst
  17. fraction: fraction GraphFst
  18. deterministic: if True will provide a single transduction option,
  19. for False multiple transduction are generated (used for audio-based normalization)
  20. """
  21. def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool):
  22. super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
  23. unit = pynutil.delete("units: \"") + pynini.closure(DAMO_NOT_QUOTE) + pynutil.delete("\"")
  24. graph_decimal = decimal.fst
  25. graph_cardinal = cardinal.fst
  26. graph_fraction = fraction.fst
  27. graph = (graph_cardinal | graph_decimal | graph_fraction) + pynini.accep(" ") + unit
  28. graph |= unit + delete_extra_space + (graph_cardinal | graph_decimal)
  29. graph += delete_preserve_order
  30. delete_tokens = self.delete_tokens(graph)
  31. self.fst = delete_tokens.optimize()