# Copyright 2017 Google Inc.
# Adapted from https://github.com/google/TextNormalizationCoveringGrammars
# Russian minimally supervised number grammar.
- import pynini
- from fun_text_processing.text_normalization.en.graph_utils import DAMO_NON_BREAKING_SPACE, DAMO_SPACE
- from fun_text_processing.text_normalization.ru.utils import get_abs_path
# Russian alphabet acceptors.
#
# The lowercase letters (including "ё") are spelled out once; the uppercase
# set is derived from them with ``str.upper``. Each public constant is an
# optimized FST built as the union of the single-character strings.
_RU_LOWER_LETTERS = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"

RU_LOWER_ALPHA = pynini.union(*_RU_LOWER_LETTERS).optimize()
RU_UPPER_ALPHA = pynini.union(*_RU_LOWER_LETTERS.upper()).optimize()

# Any single Russian letter, regardless of case.
RU_ALPHA = pynini.union(RU_LOWER_ALPHA, RU_UPPER_ALPHA).optimize()
# (input, output) pairs used to normalize stress marks and the letter "ё".
# An accent-marked vowel is rewritten as the plain vowel followed by an
# apostrophe; "ё"/"Ё" (accented or not) are rewritten with "е"/"Е".
RU_STRESSED_MAP = [
    # Uppercase accented vowels.
    ("А́", "А'"),
    ("Е́", "Е'"),
    ("Ё́", "Е'"),
    ("И́", "И'"),
    ("О́", "О'"),
    ("У́", "У'"),
    ("Ы́", "Ы'"),
    ("Э́", "Э'"),
    ("Ю́", "Ю'"),
    ("Я́", "Я'"),
    # Lowercase accented vowels.
    ("а́", "а'"),
    ("е́", "е'"),
    ("ё́", "е'"),
    ("и́", "и'"),
    ("о́", "о'"),
    ("у́", "у'"),
    ("ы́", "ы'"),
    ("э́", "э'"),
    ("ю́", "ю'"),
    ("я́", "я'"),
    # Unaccented "ё" is folded to "е".
    ("ё", "е"),
    ("Ё", "Е"),
]

# Zero or more symbols, each either rewritten through RU_STRESSED_MAP or
# passed through unchanged as a plain Russian letter.
# NOTE(review): RU_ALPHA itself contains "ё"/"Ё", so those letters have both
# an identity path and a rewrite path here — confirm that callers pick the
# intended one.
REWRITE_STRESSED = pynini.closure(
    pynini.union(
        pynini.string_map(RU_STRESSED_MAP).optimize(),
        RU_ALPHA,
    )
).optimize()
# A single Russian letter, a regular space, or a non-breaking space.
RU_ALPHA_OR_SPACE = pynini.union(
    RU_ALPHA,
    DAMO_SPACE,
    DAMO_NON_BREAKING_SPACE,
).optimize()

# Transliteration transducers loaded from the bundled TSV table.
# Presumably the file lists Latin strings in the first column and their
# Cyrillic counterparts in the second — verify against the data file.
_LATIN_TO_CYRILLIC_TSV = get_abs_path("data/latin_to_cyrillic.tsv")
TO_CYRILLIC = pynini.string_file(_LATIN_TO_CYRILLIC_TSV).optimize()

# The opposite direction is obtained by inverting the same transducer.
TO_LATIN = pynini.invert(TO_CYRILLIC).optimize()
|