alphabet.py 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. # Copyright 2017 Google Inc.
  2. # Adapted from https://github.com/google/TextNormalizationCoveringGrammars
  3. # Russian minimally supervised number grammar.
  4. import pynini
  5. from fun_text_processing.text_normalization.en.graph_utils import DAMO_NON_BREAKING_SPACE, DAMO_SPACE
  6. from fun_text_processing.text_normalization.ru.utils import get_abs_path
  # The lowercase Russian letters as a plain string. Kept under a private name
  # so the public constants below are only ever bound to FSTs.
  _RU_LOWER_CHARS = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"

  # Acceptors matching exactly one lowercase / one uppercase Russian letter.
  RU_LOWER_ALPHA = pynini.union(*_RU_LOWER_CHARS).optimize()
  RU_UPPER_ALPHA = pynini.union(*_RU_LOWER_CHARS.upper()).optimize()

  # Acceptor matching exactly one Russian letter of either case.
  RU_ALPHA = pynini.union(RU_LOWER_ALPHA, RU_UPPER_ALPHA).optimize()
  # (source, replacement) pairs consumed by pynini.string_map below.
  # A vowel carrying a stress mark is replaced by the bare vowel followed by an
  # apostrophe; "ё"/"Ё" are replaced by "е"/"Е", with or without a stress mark.
  RU_STRESSED_MAP = [
      # Uppercase stressed vowels.
      ("А́", "А'"),
      ("Е́", "Е'"),
      ("Ё́", "Е'"),
      ("И́", "И'"),
      ("О́", "О'"),
      ("У́", "У'"),
      ("Ы́", "Ы'"),
      ("Э́", "Э'"),
      ("Ю́", "Ю'"),
      ("Я́", "Я'"),
      # Lowercase stressed vowels.
      ("а́", "а'"),
      ("е́", "е'"),
      ("ё́", "е'"),
      ("и́", "и'"),
      ("о́", "о'"),
      ("у́", "у'"),
      ("ы́", "ы'"),
      ("э́", "э'"),
      ("ю́", "ю'"),
      ("я́", "я'"),
      # Unstressed "ё"/"Ё" are folded to "е"/"Е".
      ("ё", "е"),
      ("Ё", "Е"),
  ]
  # Transducer over zero or more symbols, each of which is either a pair from
  # RU_STRESSED_MAP (rewritten to its replacement) or a single RU_ALPHA letter
  # (passed through unchanged).
  # NOTE(review): "ё"/"Ё" are present in both branches, so for them both the
  # rewritten and the unchanged path exist -- confirm this is intended.
  REWRITE_STRESSED = pynini.closure(
      pynini.union(
          pynini.string_map(RU_STRESSED_MAP).optimize(),
          RU_ALPHA,
      )
  ).optimize()
  # Transducer compiled from the TSV mapping file shipped with this package.
  # Presumably each row maps a Latin string to its Cyrillic counterpart --
  # verify against data/latin_to_cyrillic.tsv.
  TO_CYRILLIC = pynini.string_file(
      get_abs_path("data/latin_to_cyrillic.tsv")
  ).optimize()

  # The same mapping with input and output sides swapped.
  TO_LATIN = pynini.invert(TO_CYRILLIC).optimize()
  # Matches one Russian letter (either case), DAMO_SPACE, or
  # DAMO_NON_BREAKING_SPACE.
  RU_ALPHA_OR_SPACE = pynini.union(
      RU_ALPHA,
      DAMO_SPACE,
      DAMO_NON_BREAKING_SPACE,
  ).optimize()