3 лет назад · 4afdd97df4
--- a/funasr/datasets/dataset.py
+++ b/funasr/datasets/dataset.py
@@ -107,7 +107,7 @@ class H5FileWrapper:
 
															         return value[()]
														
 
															-def sound_loader(path, float_dtype=None):
														
 
															+def sound_loader(path, dest_sample_rate=16000, float_dtype=None):
														
 
															     # The file is as follows:
														
 
															     #   utterance_id_A /some/where/a.wav
														
 
															     #   utterance_id_B /some/where/a.flac
														
@@ -115,7 +115,7 @@ def sound_loader(path, float_dtype=None):
 
															     # NOTE(kamo): SoundScpReader doesn't support pipe-fashion
														
 
															     # like Kaldi e.g. "cat a.wav |".
														
 
															     # NOTE(kamo): The audio signal is normalized to [-1,1] range.
														
 
															-    loader = SoundScpReader(path, normalize=True, always_2d=False)
														
 
															+    loader = SoundScpReader(path, dest_sample_rate=dest_sample_rate, normalize=True, always_2d=False)
														
 
															     # SoundScpReader.__getitem__() returns Tuple[int, ndarray],
														
 
															     # but ndarray is desired, so Adapter class is inserted here
														
@@ -139,7 +139,7 @@ def rand_int_loader(filepath, loader_type):
 
															 DATA_TYPES = {
														
 
															     "sound": dict(
														
 
															         func=sound_loader,
														
 
															-        kwargs=["float_dtype"],
														
 
															+        kwargs=["dest_sample_rate","float_dtype"],
														
 
															         help="Audio format types which supported by sndfile wav, flac, etc."
														
 
															         "\n\n"
														
 
															         "   utterance_id_a a.wav\n"
														
@@ -282,6 +282,7 @@ class ESPnetDataset(AbsDataset):
 
															         int_dtype: str = "long",
														
 
															         max_cache_size: Union[float, int, str] = 0.0,
														
 
															         max_cache_fd: int = 0,
														
 
															+        dest_sample_rate: int = 16000,
														
 
															     ):
														
 
															         assert check_argument_types()
														
 
															         if len(path_name_type_list) == 0:
														
@@ -295,6 +296,7 @@ class ESPnetDataset(AbsDataset):
 
															         self.float_dtype = float_dtype
														
 
															         self.int_dtype = int_dtype
														
 
															         self.max_cache_fd = max_cache_fd
														
 
															+        self.dest_sample_rate = dest_sample_rate
														
 
															         self.loader_dict = {}
														
 
															         self.debug_info = {}
														
@@ -335,6 +337,8 @@ class ESPnetDataset(AbsDataset):
 
															                 for key2 in dic["kwargs"]:
														
 
															                     if key2 == "loader_type":
														
 
															                         kwargs["loader_type"] = loader_type
														
 
															+                    elif key2 == "dest_sample_rate" and loader_type=="sound":
														
 
															+                        kwargs["dest_sample_rate"] = self.dest_sample_rate
														
 
															                     elif key2 == "float_dtype":
														
 
															                         kwargs["float_dtype"] = self.float_dtype
														
 
															                     elif key2 == "int_dtype":
														
--- a/funasr/fileio/sound_scp.py
+++ b/funasr/fileio/sound_scp.py
@@ -4,6 +4,7 @@ from typing import Union
 
															 import numpy as np
														
 
															 import soundfile
														
 
															+import librosa
														
 
															 from typeguard import check_argument_types
														
 
															 from funasr.fileio.read_text import read_2column_text
														
@@ -30,6 +31,7 @@ class SoundScpReader(collections.abc.Mapping):
 
															         dtype=np.int16,
														
 
															         always_2d: bool = False,
														
 
															         normalize: bool = False,
														
 
															+        dest_sample_rate: int = 16000,
														
 
															     ):
														
 
															         assert check_argument_types()
														
 
															         self.fname = fname
														
@@ -37,15 +39,18 @@ class SoundScpReader(collections.abc.Mapping):
 
															         self.always_2d = always_2d
														
 
															         self.normalize = normalize
														
 
															         self.data = read_2column_text(fname)
														
 
															+        self.dest_sample_rate = dest_sample_rate
														
 
															     def __getitem__(self, key):
														
 
															         wav = self.data[key]
														
 
															         if self.normalize:
														
 
															             # soundfile.read normalizes data to [-1,1] if dtype is not given
														
 
															-            array, rate = soundfile.read(wav, always_2d=self.always_2d)
														
 
															+            array, rate = librosa.load(
														
 
															+                wav, sr=self.dest_sample_rate, mono=not self.always_2d
														
 
															+            )
														
 
															         else:
														
 
															-            array, rate = soundfile.read(
														
 
															-                wav, dtype=self.dtype, always_2d=self.always_2d
														
 
															+            array, rate = librosa.load(
														
 
															+                wav, sr=self.dest_sample_rate, mono=not self.always_2d, dtype=self.dtype
														
 
															             )
														
 
															         return rate, array
														
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -1576,6 +1576,7 @@ class AbsTask(ABC):
 
															             preprocess=iter_options.preprocess_fn,
														
 
															             max_cache_size=iter_options.max_cache_size,
														
 
															             max_cache_fd=iter_options.max_cache_fd,
														
 
															+            dest_sample_rate=(getattr(args, "frontend_conf", None) or {}).get("fs", 16000),  # default to 16 kHz when no frontend "fs" is configured
														
 
															         )
														
 
															         cls.check_task_requirements(
														
 
															             dataset, args.allow_variable_data_keys, train=iter_options.train