electronic.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. # Copyright NeMo (https://github.com/NVIDIA/NeMo). All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import pynini
  15. from fun_text_processing.text_normalization.en.graph_utils import (
  16. DAMO_NOT_QUOTE,
  17. DAMO_SIGMA,
  18. GraphFst,
  19. delete_preserve_order,
  20. insert_space,
  21. )
  22. from fun_text_processing.text_normalization.es.utils import get_abs_path
  23. from pynini.lib import pynutil
  24. digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")))
  25. zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv")))
  26. graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv"))
  27. server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
  28. domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
  29. class ElectronicFst(GraphFst):
  30. """
  31. Finite state transducer for verbalizing electronic
  32. e.g. electronic { username: "abc" domain: "hotmail.com" } -> "a b c arroba hotmail punto com"
  33. -> "a b c arroba h o t m a i l punto c o m"
  34. -> "a b c arroba hotmail punto c o m"
  35. -> "a b c at h o t m a i l punto com"
  36. Args:
  37. deterministic: if True will provide a single transduction option,
  38. for False multiple transduction are generated (used for audio-based normalization)
  39. """
  40. def __init__(self, deterministic: bool = True):
  41. super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)
  42. graph_digit_no_zero = (
  43. digit_no_zero @ pynini.cdrewrite(pynini.cross("un", "uno"), "", "", DAMO_SIGMA).optimize()
  44. )
  45. graph_digit = graph_digit_no_zero | zero
  46. def add_space_after_char():
  47. return pynini.closure(DAMO_NOT_QUOTE - pynini.accep(" ") + insert_space) + (
  48. DAMO_NOT_QUOTE - pynini.accep(" ")
  49. )
  50. verbalize_characters = pynini.cdrewrite(graph_symbols | graph_digit, "", "", DAMO_SIGMA)
  51. user_name = pynutil.delete("username: \"") + add_space_after_char() + pynutil.delete("\"")
  52. user_name @= verbalize_characters
  53. convert_defaults = pynutil.add_weight(DAMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common
  54. domain = convert_defaults + pynini.closure(insert_space + convert_defaults)
  55. domain @= verbalize_characters
  56. domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"")
  57. protocol = (
  58. pynutil.delete("protocol: \"")
  59. + add_space_after_char() @ pynini.cdrewrite(graph_symbols, "", "", DAMO_SIGMA)
  60. + pynutil.delete("\"")
  61. )
  62. self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | (
  63. user_name + pynini.accep(" ") + pynutil.insert("arroba ") + domain
  64. )
  65. delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
  66. self.fst = delete_tokens.optimize()