sound_scp.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. import collections.abc
  2. from pathlib import Path
  3. from typing import List, Tuple, Union
  4. import random
  5. import numpy as np
  6. import soundfile
  7. import librosa
  8. import torch
  9. import torchaudio
  10. from funasr.fileio.read_text import read_2column_text
  11. def soundfile_read(
  12. wavs: Union[str, List[str]],
  13. dtype=None,
  14. always_2d: bool = False,
  15. concat_axis: int = 1,
  16. start: int = 0,
  17. end: int = None,
  18. return_subtype: bool = False,
  19. ) -> Tuple[np.array, int]:
  20. if isinstance(wavs, str):
  21. wavs = [wavs]
  22. arrays = []
  23. subtypes = []
  24. prev_rate = None
  25. prev_wav = None
  26. for wav in wavs:
  27. with soundfile.SoundFile(wav) as f:
  28. f.seek(start)
  29. if end is not None:
  30. frames = end - start
  31. else:
  32. frames = -1
  33. if dtype == "float16":
  34. array = f.read(
  35. frames,
  36. dtype="float32",
  37. always_2d=always_2d,
  38. ).astype(dtype)
  39. else:
  40. array = f.read(frames, dtype=dtype, always_2d=always_2d)
  41. rate = f.samplerate
  42. subtype = f.subtype
  43. subtypes.append(subtype)
  44. if len(wavs) > 1 and array.ndim == 1 and concat_axis == 1:
  45. # array: (Time, Channel)
  46. array = array[:, None]
  47. if prev_wav is not None:
  48. if prev_rate != rate:
  49. raise RuntimeError(
  50. f"'{prev_wav}' and '{wav}' have mismatched sampling rate: "
  51. f"{prev_rate} != {rate}"
  52. )
  53. dim1 = arrays[0].shape[1 - concat_axis]
  54. dim2 = array.shape[1 - concat_axis]
  55. if dim1 != dim2:
  56. raise RuntimeError(
  57. "Shapes must match with "
  58. f"{1 - concat_axis} axis, but gut {dim1} and {dim2}"
  59. )
  60. prev_rate = rate
  61. prev_wav = wav
  62. arrays.append(array)
  63. if len(arrays) == 1:
  64. array = arrays[0]
  65. else:
  66. array = np.concatenate(arrays, axis=concat_axis)
  67. if return_subtype:
  68. return array, rate, subtypes
  69. else:
  70. return array, rate
  71. class SoundScpReader(collections.abc.Mapping):
  72. """Reader class for 'wav.scp'.
  73. Examples:
  74. key1 /some/path/a.wav
  75. key2 /some/path/b.wav
  76. key3 /some/path/c.wav
  77. key4 /some/path/d.wav
  78. ...
  79. >>> reader = SoundScpReader('wav.scp')
  80. >>> rate, array = reader['key1']
  81. """
  82. def __init__(
  83. self,
  84. fname,
  85. dtype=np.int16,
  86. always_2d: bool = False,
  87. normalize: bool = False,
  88. dest_sample_rate: int = 16000,
  89. speed_perturb: Union[list, tuple] = None,
  90. ):
  91. self.fname = fname
  92. self.dtype = dtype
  93. self.always_2d = always_2d
  94. self.normalize = normalize
  95. self.data = read_2column_text(fname)
  96. self.dest_sample_rate = dest_sample_rate
  97. self.speed_perturb = speed_perturb
  98. def __getitem__(self, key):
  99. wav = self.data[key]
  100. if self.normalize:
  101. # soundfile.read normalizes data to [-1,1] if dtype is not given
  102. array, rate = librosa.load(
  103. wav, sr=self.dest_sample_rate, mono=self.always_2d
  104. )
  105. else:
  106. array, rate = librosa.load(
  107. wav, sr=self.dest_sample_rate, mono=self.always_2d, dtype=self.dtype
  108. )
  109. if self.speed_perturb is not None:
  110. speed = random.choice(self.speed_perturb)
  111. if speed != 1.0:
  112. array, _ = torchaudio.sox_effects.apply_effects_tensor(
  113. torch.tensor(array).view(1, -1), rate,
  114. [['speed', str(speed)], ['rate', str(rate)]])
  115. array = array.view(-1).numpy()
  116. if array.ndim==2:
  117. array=array.transpose((1, 0))
  118. return rate, array
  119. def get_path(self, key):
  120. return self.data[key]
  121. def __contains__(self, item):
  122. return item
  123. def __len__(self):
  124. return len(self.data)
  125. def __iter__(self):
  126. return iter(self.data)
  127. def keys(self):
  128. return self.data.keys()
  129. class SoundScpWriter:
  130. """Writer class for 'wav.scp'
  131. Examples:
  132. key1 /some/path/a.wav
  133. key2 /some/path/b.wav
  134. key3 /some/path/c.wav
  135. key4 /some/path/d.wav
  136. ...
  137. >>> writer = SoundScpWriter('./data/', './data/feat.scp')
  138. >>> writer['aa'] = 16000, numpy_array
  139. >>> writer['bb'] = 16000, numpy_array
  140. """
  141. def __init__(
  142. self,
  143. outdir: Union[Path, str],
  144. scpfile: Union[Path, str],
  145. format="wav",
  146. dtype=None,
  147. ):
  148. self.dir = Path(outdir)
  149. self.dir.mkdir(parents=True, exist_ok=True)
  150. scpfile = Path(scpfile)
  151. scpfile.parent.mkdir(parents=True, exist_ok=True)
  152. self.fscp = scpfile.open("w", encoding="utf-8")
  153. self.format = format
  154. self.dtype = dtype
  155. self.data = {}
  156. def __setitem__(self, key: str, value):
  157. rate, signal = value
  158. assert isinstance(rate, int), type(rate)
  159. assert isinstance(signal, np.ndarray), type(signal)
  160. if signal.ndim not in (1, 2):
  161. raise RuntimeError(f"Input signal must be 1 or 2 dimension: {signal.ndim}")
  162. if signal.ndim == 1:
  163. signal = signal[:, None]
  164. wav = self.dir / f"{key}.{self.format}"
  165. wav.parent.mkdir(parents=True, exist_ok=True)
  166. soundfile.write(str(wav), signal, rate)
  167. self.fscp.write(f"{key} {wav}\n")
  168. # Store the file path
  169. self.data[key] = str(wav)
  170. def get_path(self, key):
  171. return self.data[key]
  172. def __enter__(self):
  173. return self
  174. def __exit__(self, exc_type, exc_val, exc_tb):
  175. self.close()
  176. def close(self):
  177. self.fscp.close()