read_text.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. import logging
  2. from pathlib import Path
  3. from typing import Dict
  4. from typing import List
  5. from typing import Union
  6. from typeguard import check_argument_types
  7. def read_2column_text(path: Union[Path, str]) -> Dict[str, str]:
  8. """Read a text file having 2 column as dict object.
  9. Examples:
  10. wav.scp:
  11. key1 /some/path/a.wav
  12. key2 /some/path/b.wav
  13. >>> read_2column_text('wav.scp')
  14. {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'}
  15. """
  16. assert check_argument_types()
  17. data = {}
  18. with Path(path).open("r", encoding="utf-8") as f:
  19. for linenum, line in enumerate(f, 1):
  20. sps = line.rstrip().split(maxsplit=1)
  21. if len(sps) == 1:
  22. k, v = sps[0], ""
  23. else:
  24. k, v = sps
  25. if k in data:
  26. raise RuntimeError(f"{k} is duplicated ({path}:{linenum})")
  27. data[k] = v
  28. return data
  29. def load_num_sequence_text(
  30. path: Union[Path, str], loader_type: str = "csv_int"
  31. ) -> Dict[str, List[Union[float, int]]]:
  32. """Read a text file indicating sequences of number
  33. Examples:
  34. key1 1 2 3
  35. key2 34 5 6
  36. >>> d = load_num_sequence_text('text')
  37. >>> np.testing.assert_array_equal(d["key1"], np.array([1, 2, 3]))
  38. """
  39. assert check_argument_types()
  40. if loader_type == "text_int":
  41. delimiter = " "
  42. dtype = int
  43. elif loader_type == "text_float":
  44. delimiter = " "
  45. dtype = float
  46. elif loader_type == "csv_int":
  47. delimiter = ","
  48. dtype = int
  49. elif loader_type == "csv_float":
  50. delimiter = ","
  51. dtype = float
  52. else:
  53. raise ValueError(f"Not supported loader_type={loader_type}")
  54. # path looks like:
  55. # utta 1,0
  56. # uttb 3,4,5
  57. # -> return {'utta': np.ndarray([1, 0]),
  58. # 'uttb': np.ndarray([3, 4, 5])}
  59. d = read_2column_text(path)
  60. # Using for-loop instead of dict-comprehension for debuggability
  61. retval = {}
  62. for k, v in d.items():
  63. try:
  64. retval[k] = [dtype(i) for i in v.split(delimiter)]
  65. except TypeError:
  66. logging.error(f'Error happened with path="{path}", id="{k}", value="{v}"')
  67. raise
  68. return retval