ms_dataset.py 1.3 KB

123456789101112131415161718192021222324252627282930313233
  1. import os
  class MsDataset(object):
      """Load a speech-recognition dataset from a local directory or ModelScope.

      A local dataset is a directory containing one sub-directory per subset,
      each holding two line-aligned files: ``wav.scp`` and ``text``.
      """

      @classmethod
      def load_core(cls, data_dir, data_set):
          """Read one subset from ``<data_dir>/<data_set>``.

          Line *i* of ``wav.scp`` is paired with line *i* of ``text``. The
          last whitespace-separated field of the ``wav.scp`` line becomes the
          audio path; every field after the first one of the ``text`` line,
          re-joined with single spaces, becomes the label.

          Args:
              data_dir: Root directory of the dataset.
              data_set: Name of the subset sub-directory.

          Returns:
              A list of dicts with the keys ``"Audio:FILE"`` and
              ``"Text:LABEL"``, in file order. Pairing stops at the end of
              the shorter file.

          Raises:
              OSError: If ``wav.scp`` or ``text`` cannot be opened.
          """
          subset_dir = os.path.join(data_dir, data_set)
          wav_file = os.path.join(subset_dir, "wav.scp")
          text_file = os.path.join(subset_dir, "text")

          # Decode explicitly as UTF-8 so transcripts with non-ASCII characters
          # load the same way regardless of the platform's default encoding.
          with open(wav_file, encoding="utf-8") as f:
              wav_lines = f.readlines()
          with open(text_file, encoding="utf-8") as f:
              text_lines = f.readlines()

          data_list = []
          for wav_line, text_line in zip(wav_lines, text_lines):
              wav_fields = wav_line.split()
              if not wav_fields:
                  # A blank wav.scp line (e.g. a stray trailing newline) has no
                  # path to take; skip the pair instead of raising IndexError.
                  continue
              data_list.append({
                  "Audio:FILE": wav_fields[-1],
                  "Text:LABEL": " ".join(text_line.split()[1:]),
              })
          return data_list

      @classmethod
      def load(cls, dataset_name, namespace="speech_asr", train_set="train", dev_set="validation"):
          """Load a dataset from disk if ``dataset_name`` is an existing path.

          Args:
              dataset_name: Local dataset directory, or otherwise the name
                  passed to ModelScope's ``MsDataset.load``.
              namespace: Forwarded to ModelScope; unused for local datasets.
              train_set: Sub-directory holding the training subset (local only).
              dev_set: Sub-directory holding the validation subset (local only).

          Returns:
              For a local path, a dict with the keys ``"train"`` and
              ``"validation"`` (lists produced by ``load_core``) and
              ``"raw_data_dir"`` (the path itself). Otherwise, whatever
              ModelScope's ``MsDataset.load`` returns.
          """
          if os.path.exists(dataset_name):
              return {
                  "train": cls.load_core(dataset_name, train_set),
                  "validation": cls.load_core(dataset_name, dev_set),
                  "raw_data_dir": dataset_name,
              }

          # Imported lazily so local datasets work without modelscope installed;
          # aliased so it does not shadow this class inside the method.
          from modelscope.msdatasets import MsDataset as HubMsDataset
          return HubMsDataset.load(dataset_name=dataset_name, namespace=namespace)