siweilian
/
FunASR


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
							# Copyright NeMo (https://github.com/NVIDIA/NeMo). All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import string
from collections import OrderedDict
from typing import Dict, List, Union

PRESERVE_ORDER_KEY = "preserve_order"
EOS = "<EOS>"


class TokenParser:
    """
    Parses tokenized/classified text, e.g. 'tokens { money { integer: "20" currency: "$" } } tokens { name: "left"}'

    Args
        text: tokenized text
    """

    def __call__(self, text):
        """
        Setup function

        Args:
            text: text to be parsed
        
        """
        self.text = text
        self.len_text = len(text)
        self.char = text[0]  # cannot handle empty string
        self.index = 0

    def parse(self) -> List[dict]:
        """
        Main function. Implements grammar:
        A -> space F space F space F ... space

        Returns list of dictionaries
        """
        l = list()
        while self.parse_ws():
            token = self.parse_token()
            if not token:
                break
            l.append(token)
        return l

    def parse_token(self) -> Dict[str, Union[str, dict]]:
        """
        Implements grammar:
        F-> no_space KG no_space

        Returns: K, G as dictionary values
        """
        d = OrderedDict()
        key = self.parse_string_key()
        if key is None:
            return None
        self.parse_ws()
        if key == PRESERVE_ORDER_KEY:
            self.parse_char(":")
            self.parse_ws()
            value = self.parse_chars("true")
        else:
            value = self.parse_token_value()

        d[key] = value
        return d

    def parse_token_value(self) -> Union[str, dict]:
        """
        Implements grammar:
        G-> no_space :"VALUE" no_space | no_space {A} no_space

        Returns: string or dictionary
        """
        if self.char == ":":
            self.parse_char(":")
            self.parse_ws()
            self.parse_char("\"")
            value_string = self.parse_string_value()
            self.parse_char("\"")
            return value_string
        elif self.char == "{":
            d = OrderedDict()
            self.parse_char("{")
            list_token_dicts = self.parse()
            # flatten tokens
            for tok_dict in list_token_dicts:
                for k, v in tok_dict.items():
                    d[k] = v
            self.parse_char("}")
            return d
        else:
            raise ValueError()

    def parse_char(self, exp) -> bool:
        """
        Parses character 

        Args:
            exp: character to read in
        
        Returns true if successful
        """
        assert self.char == exp
        self.read()
        return True

    def parse_chars(self, exp) -> bool:
        """
        Parses characters

        Args:
            exp: characters to read in
        
        Returns true if successful
        """
        ok = False
        for x in exp:
            ok |= self.parse_char(x)
        return ok

    def parse_string_key(self) -> str:
        """
        Parses string key, can only contain ascii and '_' characters

        Returns parsed string key
        """
        assert self.char not in string.whitespace and self.char != EOS
        incl_criterium = string.ascii_letters + "_"
        l = []
        while self.char in incl_criterium:
            l.append(self.char)
            if not self.read():
                raise ValueError()

        if not l:
            return None
        return "".join(l)

    def parse_string_value(self) -> str:
        """
        Parses string value, ends with quote followed by space

        Returns parsed string value
        """
        # assert self.char not in string.whitespace and self.char != EOS
        assert self.char != EOS
        l = []
        while self.char != "\"" or self.text[self.index + 1] != " ":
            l.append(self.char)
            if not self.read():
                raise ValueError()

        if not l:
            return None
        return "".join(l)

    def parse_ws(self):
        """
        Deletes whitespaces.

        Returns true if not EOS after parsing
        """
        not_eos = self.char != EOS
        while not_eos and self.char == " ":
            not_eos = self.read()
        return not_eos

    def read(self):
        """
        Reads in next char. 
        
        Returns true if not EOS
        """
        if self.index < self.len_text - 1:  # should be unique
            self.index += 1
            self.char = self.text[self.index]
            return True
        self.char = EOS
        return False