| 123456789101112131415161718192021222324252627282930313233 |
- import os
class MsDataset(object):
    """Load a speech dataset from a local directory or from ModelScope."""

    @classmethod
    def load_core(cls, data_dir, data_set):
        """Read one split from ``<data_dir>/<data_set>/{wav.scp,text}``.

        The two files are paired line by line. For each pair, the last
        whitespace-separated token of the ``wav.scp`` line becomes the audio
        path, and every token after the first one of the ``text`` line is
        joined with single spaces to form the label (the dropped first token
        is presumably an utterance id -- confirm against the data format).

        Args:
            data_dir: Root directory of the dataset.
            data_set: Name of the split sub-directory inside ``data_dir``.

        Returns:
            A list of dicts with the keys ``"Audio:FILE"`` and
            ``"Text:LABEL"``, one per usable line pair.

        Raises:
            OSError: If ``wav.scp`` or ``text`` cannot be opened.
        """
        split_dir = os.path.join(data_dir, data_set)
        # Decode explicitly as UTF-8 so non-ASCII transcripts do not depend
        # on the platform's default locale encoding.
        with open(os.path.join(split_dir, "wav.scp"), encoding="utf-8") as f:
            wav_lines = f.readlines()
        with open(os.path.join(split_dir, "text"), encoding="utf-8") as f:
            text_lines = f.readlines()

        data_list = []
        # NOTE: zip() stops at the shorter file, so surplus lines in the
        # longer one are ignored.
        for wav_line, text_line in zip(wav_lines, text_lines):
            wav_fields = wav_line.split()
            if not wav_fields:
                # A blank wav.scp line (e.g. a trailing empty line) carries no
                # audio path; skip the pair instead of raising IndexError.
                continue
            data_list.append(
                {
                    "Audio:FILE": wav_fields[-1],
                    "Text:LABEL": " ".join(text_line.split()[1:]),
                }
            )
        return data_list

    @classmethod
    def load(cls, dataset_name, namespace="speech_asr", train_set="train", dev_set="validation"):
        """Load a dataset from disk if the path exists, else via ModelScope.

        Args:
            dataset_name: A local path, or a dataset name passed on to
                ModelScope when no such path exists.
            namespace: Namespace forwarded to ModelScope; unused for local
                paths.
            train_set: Sub-directory holding the training split; only used
                for local paths.
            dev_set: Sub-directory holding the validation split; only used
                for local paths.

        Returns:
            For a local path, a dict with the keys ``"train"`` and
            ``"validation"`` (lists produced by ``load_core``) and
            ``"raw_data_dir"`` (the path itself). Otherwise, whatever
            ``modelscope.msdatasets.MsDataset.load`` returns.
        """
        if os.path.exists(dataset_name):
            return {
                "train": cls.load_core(dataset_name, train_set),
                "validation": cls.load_core(dataset_name, dev_set),
                "raw_data_dir": dataset_name,
            }

        # Imported lazily so local loading works without modelscope
        # installed; aliased so it does not shadow this class's own name.
        from modelscope.msdatasets import MsDataset as _HubMsDataset

        return _HubMsDataset.load(dataset_name=dataset_name, namespace=namespace)
|