| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
import string
from collections import OrderedDict
from typing import Dict, List, Optional, Union
PRESERVE_ORDER_KEY = "preserve_order"  # key whose value is the bare literal `true` instead of a quoted string
EOS = "<EOS>"  # sentinel stored in `TokenParser.char` once the input is exhausted


class TokenParser:
    """
    Recursive-descent parser for tokenized/classified text, e.g.
    'tokens { money { integer: "20" currency: "$" } } tokens { name: "left" }'

    A quoted value is terminated by a double quote that is followed by a space
    (or by the end of the text), so values may themselves contain double quotes.

    Usage:
        parser = TokenParser()
        parser(text)             # load the text to be parsed
        tokens = parser.parse()  # list of (nested) ordered dictionaries
    """

    def __call__(self, text):
        """
        Setup function. Loads the text and resets the read position to its start.

        Args:
            text: text to be parsed; an empty string puts the parser directly at EOS
        """
        self.text = text
        self.len_text = len(text)
        self.index = 0
        # Empty input has no first character: start at EOS so that parse() returns [].
        self.char = text[0] if text else EOS

    def parse(self) -> List[dict]:
        """
        Main function. Implements grammar:
        A -> space F space F space F ... space

        Parsing stops at end of input or at the first position where no key can be read
        (e.g. at a closing brace), which is what lets parse_token_value reuse it for nested bodies.

        Returns: list of dictionaries, one per parsed token
        """
        tokens = []
        while self.parse_ws():
            token = self.parse_token()
            if not token:
                break
            tokens.append(token)
        return tokens

    def parse_token(self) -> Optional[Dict[str, Union[str, dict, bool, None]]]:
        """
        Implements grammar:
        F -> no_space KG no_space

        Returns: single-entry ordered dictionary {K: G}, or None if no key starts at the current position
        """
        key = self.parse_string_key()
        if key is None:
            return None
        self.parse_ws()
        if key == PRESERVE_ORDER_KEY:
            # `preserve_order: true` carries an unquoted literal; it is stored as the boolean True.
            self.parse_char(":")
            self.parse_ws()
            value = self.parse_chars("true")
        else:
            value = self.parse_token_value()
        return OrderedDict([(key, value)])

    def parse_token_value(self) -> Union[str, dict, None]:
        """
        Implements grammar:
        G -> no_space :"VALUE" no_space | no_space {A} no_space

        Returns: string (None for an empty quoted value) or ordered dictionary

        Raises:
            ValueError: if the current character starts neither a quoted value nor a nested body
        """
        if self.char == ":":
            self.parse_char(":")
            self.parse_ws()
            self.parse_char("\"")
            value_string = self.parse_string_value()
            self.parse_char("\"")
            return value_string
        if self.char == "{":
            self.parse_char("{")
            nested = OrderedDict()
            # Flatten the nested token list into one dictionary; a repeated key keeps its last value.
            for token in self.parse():
                nested.update(token)
            self.parse_char("}")
            return nested
        raise ValueError(
            "expected ':' or an opening brace at index {}, found {!r}".format(self.index, self.char)
        )

    def parse_char(self, exp) -> bool:
        """
        Parses character

        Args:
            exp: character to read in

        Returns true if successful

        Raises:
            AssertionError: if the current character is not `exp`
        """
        # Raised explicitly instead of via `assert` so the check survives `python -O`,
        # while callers still see the same exception type.
        if self.char != exp:
            raise AssertionError("expected {!r} at index {}, found {!r}".format(exp, self.index, self.char))
        self.read()
        return True

    def parse_chars(self, exp) -> bool:
        """
        Parses characters

        Args:
            exp: characters to read in, in order

        Returns true if at least one character was read (false only for empty `exp`)
        """
        for expected in exp:
            self.parse_char(expected)
        return bool(exp)

    def parse_string_key(self) -> Optional[str]:
        """
        Parses string key, can only contain ascii letters and '_' characters

        Returns parsed string key, or None if no key character is found at the current position

        Raises:
            AssertionError: if positioned on whitespace or at EOS
            ValueError: if the text ends while a key is being read
        """
        if self.char == EOS or self.char in string.whitespace:
            raise AssertionError("expected a key at index {}, found {!r}".format(self.index, self.char))
        incl_criterium = string.ascii_letters + "_"
        key_chars = []
        while self.char in incl_criterium:
            key_chars.append(self.char)
            if not self.read():
                raise ValueError("text ended while reading key {!r}".format("".join(key_chars)))
        if not key_chars:
            return None
        return "".join(key_chars)

    def parse_string_value(self) -> Optional[str]:
        """
        Parses string value, ends with quote followed by space (or by the end of the text)

        Returns parsed string value, or None if the value is empty

        Raises:
            AssertionError: if positioned at EOS
            ValueError: if the text ends before a closing quote is found
        """
        if self.char == EOS:
            raise AssertionError("expected a string value at index {}, found end of text".format(self.index))
        value_chars = []
        while not self._at_closing_quote():
            value_chars.append(self.char)
            if not self.read():
                raise ValueError("unterminated string value {!r}".format("".join(value_chars)))
        if not value_chars:
            return None
        return "".join(value_chars)

    def _at_closing_quote(self) -> bool:
        """Returns true if the current char is a quote followed by a space or by the end of the text."""
        if self.char != "\"":
            return False
        next_index = self.index + 1
        # Bounds check first: a quote in the very last position has no following character to inspect.
        return next_index >= self.len_text or self.text[next_index] == " "

    def parse_ws(self):
        """
        Skips space characters (only ' ', not tabs or newlines).

        Returns true if not EOS after parsing
        """
        not_eos = self.char != EOS
        while not_eos and self.char == " ":
            not_eos = self.read()
        return not_eos

    def read(self):
        """
        Reads in next char. At the end of the text `char` becomes EOS and `index` stays on the last position.

        Returns true if not EOS
        """
        if self.index < self.len_text - 1:
            self.index += 1
            self.char = self.text[self.index]
            return True
        self.char = EOS
        return False
|