measure.py 4.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. import pynini
  2. from fun_text_processing.text_normalization.en.graph_utils import (
  3. DAMO_NOT_QUOTE,
  4. DAMO_SIGMA,
  5. DAMO_SPACE,
  6. DAMO_WHITE_SPACE,
  7. GraphFst,
  8. delete_extra_space,
  9. delete_preserve_order,
  10. )
  11. from fun_text_processing.text_normalization.es.graph_utils import ones
  12. from fun_text_processing.text_normalization.es.utils import get_abs_path
  13. from pynini.lib import pynutil
  14. unit_plural_fem = pynini.string_file(get_abs_path("data/measures/measurements_plural_fem.tsv"))
  15. unit_plural_masc = pynini.string_file(get_abs_path("data/measures/measurements_plural_masc.tsv"))
  16. unit_singular_fem = pynini.project(unit_plural_fem, "input")
  17. unit_singular_masc = pynini.project(unit_plural_masc, "input")
  18. unit_plural_fem = pynini.project(unit_plural_fem, "output")
  19. unit_plural_masc = pynini.project(unit_plural_masc, "output")
  20. class MeasureFst(GraphFst):
  21. """
  22. Finite state transducer for verbalizing measure, e.g.
  23. measure { cardinal { integer: "dos" units: "gramos" } } -> "dos gramos"
  24. measure { decimal { integer_part: "dos" quantity: "millones" units: "gramos" } } -> "dos millones de gramos"
  25. Args:
  26. decimal: DecimalFst
  27. cardinal: CardinalFst
  28. fraction: FractionFst
  29. deterministic: if True will provide a single transduction option,
  30. for False multiple transduction are generated (used for audio-based normalization)
  31. """
  32. def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool):
  33. super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
  34. graph_decimal_masc = decimal.delete_tokens(decimal.graph_masc)
  35. graph_decimal_fem = decimal.delete_tokens(decimal.graph_fem)
  36. graph_cardinal_masc = cardinal.delete_tokens(cardinal.graph_masc)
  37. graph_cardinal_fem = cardinal.delete_tokens(cardinal.graph_fem)
  38. graph_fraction_fem = fraction.delete_tokens(fraction.graph_fem)
  39. graph_fraction_masc = fraction.delete_tokens(fraction.graph_masc)
  40. unit_masc = (unit_plural_masc | unit_singular_masc) + pynini.closure(
  41. DAMO_WHITE_SPACE + "por" + pynini.closure(DAMO_NOT_QUOTE, 1), 0, 1
  42. )
  43. unit_masc |= "por" + pynini.closure(DAMO_NOT_QUOTE, 1)
  44. unit_masc = pynutil.delete("units: \"") + (pynini.closure(DAMO_NOT_QUOTE) @ unit_masc) + pynutil.delete("\"")
  45. unit_fem = (unit_plural_fem | unit_singular_fem) + pynini.closure(
  46. DAMO_WHITE_SPACE + "por" + pynini.closure(DAMO_NOT_QUOTE, 1), 0, 1
  47. )
  48. unit_fem = pynutil.delete("units: \"") + (pynini.closure(DAMO_NOT_QUOTE) @ unit_fem) + pynutil.delete("\"")
  49. graph_masc = (graph_cardinal_masc | graph_decimal_masc) + DAMO_WHITE_SPACE + unit_masc
  50. graph_masc |= graph_fraction_masc + DAMO_WHITE_SPACE + pynutil.insert("de ") + unit_masc
  51. graph_masc |= pynutil.add_weight(
  52. graph_fraction_masc @ (DAMO_SIGMA + pynini.union("medio", "medios")) + DAMO_WHITE_SPACE + unit_masc, -0.001
  53. ) # "medio litro" not "medio de litro"
  54. graph_fem = (graph_cardinal_fem | graph_decimal_fem) + DAMO_WHITE_SPACE + unit_fem
  55. graph_fem |= graph_fraction_fem + DAMO_WHITE_SPACE + pynutil.insert("de ") + unit_fem
  56. graph_fem |= pynutil.add_weight(
  57. graph_fraction_fem @ (DAMO_SIGMA + pynini.union("media", "medias")) + DAMO_WHITE_SPACE + unit_fem, -0.001
  58. )
  59. graph = graph_masc | graph_fem
  60. graph = (
  61. pynini.cdrewrite(
  62. pynutil.insert(" de"), "quantity: \"" + pynini.closure(DAMO_NOT_QUOTE, 1), "\"", DAMO_SIGMA
  63. )
  64. @ graph
  65. ) # billones de xyz
  66. graph @= pynini.cdrewrite(pynini.cross(ones, "uno"), "", DAMO_WHITE_SPACE + "por", DAMO_SIGMA)
  67. # To manage alphanumeric combonations ("a-8, 5x"), we let them use a weighted default path.
  68. alpha_num_unit = pynutil.delete("units: \"") + pynini.closure(DAMO_NOT_QUOTE) + pynutil.delete("\"")
  69. graph_alpha_num = pynini.union(
  70. (graph_cardinal_masc | graph_decimal_masc) + DAMO_SPACE + alpha_num_unit,
  71. alpha_num_unit + delete_extra_space + (graph_cardinal_masc | graph_decimal_masc),
  72. )
  73. graph |= pynutil.add_weight(graph_alpha_num, 0.01)
  74. graph += delete_preserve_order
  75. delete_tokens = self.delete_tokens(graph)
  76. self.fst = delete_tokens.optimize()