| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- import logging
- from pathlib import Path
- from typing import Dict
- from typing import List
- from typing import Union
def read_2column_text(path: Union[Path, str]) -> Dict[str, str]:
    """Read a text file having 2 columns as a dict object.

    Each line is split on the first run of whitespace: the first token is
    the key and the rest of the line (with trailing whitespace removed) is
    the value, so values may themselves contain spaces. A line that holds
    only a key is mapped to the empty string. Blank or whitespace-only
    lines are skipped.

    Examples:
        wav.scp:
            key1 /some/path/a.wav
            key2 /some/path/b.wav

        >>> read_2column_text('wav.scp')
        {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'}

    Args:
        path: Location of the text file; it is opened as UTF-8.

    Returns:
        A dict mapping each key to its value string, in file order.

    Raises:
        RuntimeError: If the same key appears on more than one line. The
            message contains the key, the path and the offending line number.
    """
    data = {}
    with Path(path).open("r", encoding="utf-8") as f:
        # Line numbers start at 1 so they match what an editor shows.
        for linenum, line in enumerate(f, 1):
            sps = line.rstrip().split(maxsplit=1)
            if not sps:
                # Blank line: there is no key to store. Without this guard
                # the two-value unpacking below fails with an opaque
                # "not enough values to unpack" error.
                continue
            if len(sps) == 1:
                # Key without a value.
                k, v = sps[0], ""
            else:
                k, v = sps
            if k in data:
                raise RuntimeError(f"{k} is duplicated ({path}:{linenum})")
            data[k] = v
    return data
def load_num_sequence_text(
    path: Union[Path, str], loader_type: str = "csv_int"
) -> Dict[str, List[Union[float, int]]]:
    """Read a text file whose lines are a key followed by a number sequence.

    The file is first read with ``read_2column_text``; the value part of
    each line is then split on a delimiter and every piece is converted to
    a number. ``loader_type`` selects both the delimiter and the number type:

        "text_int"   -> single space, int
        "text_float" -> single space, float
        "csv_int"    -> comma, int (default)
        "csv_float"  -> comma, float

    Examples:
        text:
            key1 1 2 3
            key2 34 5 6

        >>> d = load_num_sequence_text('text', loader_type='text_int')
        >>> d['key1']
        [1, 2, 3]

    Args:
        path: Location of the text file.
        loader_type: One of the four names listed above.

    Returns:
        A dict mapping each key to a list of ints or floats.

    Raises:
        ValueError: If ``loader_type`` is not supported, or if a piece of a
            value cannot be converted to the requested number type. In the
            latter case the offending path, key and value are logged first.
    """
    if loader_type == "text_int":
        delimiter = " "
        dtype = int
    elif loader_type == "text_float":
        delimiter = " "
        dtype = float
    elif loader_type == "csv_int":
        delimiter = ","
        dtype = int
    elif loader_type == "csv_float":
        delimiter = ","
        dtype = float
    else:
        raise ValueError(f"Not supported loader_type={loader_type}")

    # With a csv loader the file looks like:
    #   utta 1,0
    #   uttb 3,4,5
    # -> return {'utta': [1, 0],
    #            'uttb': [3, 4, 5]}
    d = read_2column_text(path)

    # Using for-loop instead of dict-comprehension for debuggability
    retval = {}
    for k, v in d.items():
        try:
            retval[k] = [dtype(i) for i in v.split(delimiter)]
        except (TypeError, ValueError):
            # int()/float() raise ValueError for malformed text (e.g. "abc"
            # or an empty piece), so it must be caught here for the
            # diagnostic below to be emitted; the error is still propagated.
            logging.error(f'Error happened with path="{path}", id="{k}", value="{v}"')
            raise
    return retval
|