dataset.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. import logging
  2. import os
  3. import random
  4. from functools import partial
  5. import torch
  6. import torch.distributed as dist
  7. import torchaudio
  8. from kaldiio import ReadHelper
  9. from torch.utils.data import IterableDataset
  10. from funasr.datasets.large_datasets.datapipes.batch import MaxTokenBucketizerIterDataPipe
  11. from funasr.datasets.large_datasets.datapipes.filter import FilterIterDataPipe
  12. from funasr.datasets.large_datasets.datapipes.map import MapperIterDataPipe
  13. from funasr.datasets.large_datasets.utils.clipping import clipping
  14. from funasr.datasets.large_datasets.utils.filter import filter
  15. from funasr.datasets.large_datasets.utils.padding import padding
  16. from funasr.datasets.large_datasets.utils.tokenize import tokenize
  17. def read_lists(list_file):
  18. lists = []
  19. with open(list_file, 'r', encoding='utf8') as fin:
  20. for line in fin:
  21. parts = line.strip()
  22. lists.append(parts)
  23. return lists
  24. class AudioDataset(IterableDataset):
  25. def __init__(self, scp_lists, data_names, data_types, frontend_conf=None, shuffle=True, speed_perturb=None,
  26. mode="train"):
  27. self.scp_lists = scp_lists
  28. self.data_names = data_names
  29. self.data_types = data_types
  30. self.frontend_conf = frontend_conf
  31. self.shuffle = shuffle
  32. self.mode = mode
  33. self.epoch = -1
  34. self.rank = 0
  35. self.world_size = 1
  36. self.worker_id = 0
  37. self.num_workers = 1
  38. self.speed_perturb = speed_perturb
  39. if self.speed_perturb is not None:
  40. logging.info("Using speed_perturb: {}".format(speed_perturb))
  41. def set_epoch(self, epoch):
  42. self.epoch = epoch
  43. def get_rank_data_list(self, data_index):
  44. assert dist.is_available()
  45. if dist.is_initialized():
  46. self.rank = dist.get_rank()
  47. self.world_size = dist.get_world_size()
  48. else:
  49. self.rank = 0
  50. self.world_size = 1
  51. if self.mode == "train":
  52. if self.shuffle:
  53. random.seed(self.epoch)
  54. random.shuffle(data_index)
  55. return data_index[self.rank::self.world_size]
  56. return data_index
  57. def get_worker_data_list(self, rank_data_index):
  58. worker_info = torch.utils.data.get_worker_info()
  59. if worker_info is None:
  60. self.worker_id = 0
  61. self.num_workers = 1
  62. else:
  63. self.worker_id = worker_info.id
  64. self.num_workers = worker_info.num_workers
  65. return rank_data_index[self.worker_id::self.num_workers]
  66. def close_reader(self, reader_list):
  67. for reader in reader_list:
  68. reader.close()
  69. def __iter__(self):
  70. data_index = list(range(len(self.scp_lists)))
  71. rank_data_index = self.get_rank_data_list(data_index)
  72. worker_data_index = self.get_worker_data_list(rank_data_index)
  73. for index in worker_data_index:
  74. data = dict(scp=self.scp_lists[index])
  75. assert 'scp' in data
  76. scp = data['scp']
  77. data_file_list = scp.strip().split()
  78. data_name_list = self.data_names.split(",")
  79. data_type_list = self.data_types.split(",")
  80. for file in data_file_list:
  81. assert os.path.exists(file), "{} not exists".format(file)
  82. assert len(data_file_list) == len(data_name_list) == len(data_type_list), \
  83. "The item number of data, data_names, data_types must be the same "
  84. reader_list = []
  85. for data_file, data_type in zip(data_file_list, data_type_list):
  86. if data_type == "kaldi_ark":
  87. ark_reader = ReadHelper('ark:{}'.format(data_file))
  88. reader_list.append(ark_reader)
  89. elif data_type == "text" or data_type == "sound" or data_type == 'text_hotword':
  90. text_reader = open(data_file, "r")
  91. reader_list.append(text_reader)
  92. elif data_type == "none":
  93. continue
  94. else:
  95. raise TypeError("Data type {} is not supported".format(data_type))
  96. for items in zip(*reader_list):
  97. sample_dict = {}
  98. for item, (data_name, data_type) in zip(items, zip(data_name_list, data_type_list)):
  99. if data_type == "kaldi_ark":
  100. key, mat = item
  101. sample_dict[data_name] = mat
  102. if data_name == "speech":
  103. sample_dict["key"] = key
  104. elif data_type == "sound":
  105. key, path = item.strip().split()
  106. waveform, sampling_rate = torchaudio.load(path)
  107. if self.frontend_conf is not None:
  108. if sampling_rate != self.frontend_conf["fs"]:
  109. waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
  110. new_freq=self.frontend_conf["fs"])(waveform)
  111. sampling_rate = self.frontend_conf["fs"]
  112. waveform = waveform.numpy()
  113. mat = waveform[0]
  114. if self.speed_perturb is not None:
  115. speed = random.choice(self.speed_perturb)
  116. if speed != 1.0:
  117. mat, _ = torchaudio.sox_effects.apply_effects_tensor(
  118. torch.tensor(mat).view(1, -1), sampling_rate, [['speed', str(speed)], ['rate', str(sampling_rate)]])
  119. mat = mat.view(-1).numpy()
  120. sample_dict[data_name] = mat
  121. sample_dict["sampling_rate"] = sampling_rate
  122. if data_name == "speech":
  123. sample_dict["key"] = key
  124. elif data_type == "text_hotword":
  125. text = item
  126. segs = text.strip().split()
  127. sample_dict[data_name] = segs[1:]
  128. if "key" not in sample_dict:
  129. sample_dict["key"] = segs[0]
  130. sample_dict['hw_tag'] = 1
  131. else:
  132. text = item
  133. segs = text.strip().split()
  134. sample_dict[data_name] = segs[1:]
  135. if "key" not in sample_dict:
  136. sample_dict["key"] = segs[0]
  137. yield sample_dict
  138. self.close_reader(reader_list)
  139. def len_fn_example(data):
  140. return 1
  141. def len_fn_token(data):
  142. assert "speech" in data
  143. if "sampling_rate" in data:
  144. return (data["speech"].shape[0] / data["sampling_rate"]) * 1000.
  145. else:
  146. return data["speech"].shape[0]
  147. def Dataset(data_list_file,
  148. dict,
  149. seg_dict,
  150. punc_dict,
  151. bpe_tokenizer,
  152. conf,
  153. frontend_conf,
  154. speed_perturb=None,
  155. mode="train",
  156. batch_mode="padding"):
  157. scp_lists = read_lists(data_list_file)
  158. shuffle = conf.get('shuffle', True)
  159. data_names = conf.get("data_names", "speech,text")
  160. data_types = conf.get("data_types", "kaldi_ark,text")
  161. pre_hwfile = conf.get("pre_hwlist", None)
  162. pre_prob = conf.get("pre_prob", 0) # unused yet
  163. hw_config = {"sample_rate": conf.get("sample_rate", 0.6),
  164. "double_rate": conf.get("double_rate", 0.1),
  165. "hotword_min_length": conf.get("hotword_min_length", 2),
  166. "hotword_max_length": conf.get("hotword_max_length", 8),
  167. "pre_prob": conf.get("pre_prob", 0.0)}
  168. if pre_hwfile is not None:
  169. pre_hwlist = []
  170. with open(pre_hwfile, 'r') as fin:
  171. for line in fin.readlines():
  172. pre_hwlist.append(line.strip())
  173. else:
  174. pre_hwlist = None
  175. dataset = AudioDataset(scp_lists,
  176. data_names,
  177. data_types,
  178. frontend_conf=frontend_conf,
  179. shuffle=shuffle,
  180. speed_perturb=speed_perturb,
  181. mode=mode,
  182. )
  183. filter_conf = conf.get('filter_conf', {})
  184. filter_fn = partial(filter, **filter_conf)
  185. dataset = FilterIterDataPipe(dataset, fn=filter_fn)
  186. if "text" in data_names:
  187. vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer, 'hw_config': hw_config}
  188. tokenize_fn = partial(tokenize, **vocab)
  189. dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
  190. if shuffle:
  191. buffer_conf = conf.get('shuffle_conf', {})
  192. buffer_size = buffer_conf['shuffle_size']
  193. sort_size = buffer_conf['sort_size']
  194. else:
  195. buffer_size = 0
  196. sort_size = 1
  197. batch_conf = conf.get('batch_conf', {})
  198. batch_size = batch_conf['batch_size']
  199. batch_type = batch_conf['batch_type']
  200. assert batch_type in ["example", "token"]
  201. if batch_type == 'example':
  202. len_fn = len_fn_example
  203. else:
  204. len_fn = len_fn_token
  205. dataset = MaxTokenBucketizerIterDataPipe(dataset,
  206. batch_size=batch_size,
  207. len_fn=len_fn,
  208. buffer_size=buffer_size,
  209. sort_size=sort_size,
  210. batch_mode=batch_mode)
  211. int_pad_value = conf.get("int_pad_value", -1)
  212. float_pad_value = conf.get("float_pad_value", 0.0)
  213. padding_conf = {"int_pad_value": int_pad_value, "float_pad_value": float_pad_value}
  214. padding_fn = partial(padding, **padding_conf)
  215. dataset = MapperIterDataPipe(dataset, fn=padding_fn if batch_mode == "padding" else clipping)
  216. return dataset