|
|
@@ -9,6 +9,7 @@ from torch.nn.utils.rnn import pad_sequence
|
|
|
from typeguard import check_argument_types
|
|
|
|
|
|
import funasr.models.frontend.eend_ola_feature as eend_ola_feature
|
|
|
+from funasr.models.frontend.abs_frontend import AbsFrontend
|
|
|
|
|
|
|
|
|
def load_cmvn(cmvn_file):
|
|
|
@@ -33,11 +34,11 @@ def load_cmvn(cmvn_file):
|
|
|
means = np.array(means_list).astype(np.float)
|
|
|
vars = np.array(vars_list).astype(np.float)
|
|
|
cmvn = np.array([means, vars])
|
|
|
- cmvn = torch.as_tensor(cmvn)
|
|
|
+ cmvn = torch.as_tensor(cmvn, dtype=torch.float32)
|
|
|
return cmvn
|
|
|
|
|
|
|
|
|
-def apply_cmvn(inputs, cmvn_file): # noqa
|
|
|
+def apply_cmvn(inputs, cmvn): # noqa
|
|
|
"""
|
|
|
Apply CMVN with mvn data
|
|
|
"""
|
|
|
@@ -46,11 +47,10 @@ def apply_cmvn(inputs, cmvn_file): # noqa
|
|
|
dtype = inputs.dtype
|
|
|
frame, dim = inputs.shape
|
|
|
|
|
|
- cmvn = load_cmvn(cmvn_file)
|
|
|
- means = np.tile(cmvn[0:1, :dim], (frame, 1))
|
|
|
- vars = np.tile(cmvn[1:2, :dim], (frame, 1))
|
|
|
- inputs += torch.from_numpy(means).type(dtype).to(device)
|
|
|
- inputs *= torch.from_numpy(vars).type(dtype).to(device)
|
|
|
+ means = cmvn[0:1, :dim]
|
|
|
+ vars = cmvn[1:2, :dim]
|
|
|
+ inputs += means.to(device)
|
|
|
+ inputs *= vars.to(device)
|
|
|
|
|
|
return inputs.type(torch.float32)
|
|
|
|
|
|
@@ -75,7 +75,7 @@ def apply_lfr(inputs, lfr_m, lfr_n):
|
|
|
return LFR_outputs.type(torch.float32)
|
|
|
|
|
|
|
|
|
-class WavFrontend(torch.nn.Module):
|
|
|
+class WavFrontend(AbsFrontend):
|
|
|
"""Conventional frontend structure for ASR.
|
|
|
"""
|
|
|
|
|
|
@@ -110,6 +110,7 @@ class WavFrontend(torch.nn.Module):
|
|
|
self.dither = dither
|
|
|
self.snip_edges = snip_edges
|
|
|
self.upsacle_samples = upsacle_samples
|
|
|
+ self.cmvn = None if self.cmvn_file is None else load_cmvn(self.cmvn_file)
|
|
|
|
|
|
def output_size(self) -> int:
|
|
|
return self.n_mels * self.lfr_m
|
|
|
@@ -139,8 +140,8 @@ class WavFrontend(torch.nn.Module):
|
|
|
|
|
|
if self.lfr_m != 1 or self.lfr_n != 1:
|
|
|
mat = apply_lfr(mat, self.lfr_m, self.lfr_n)
|
|
|
- if self.cmvn_file is not None:
|
|
|
- mat = apply_cmvn(mat, self.cmvn_file)
|
|
|
+ if self.cmvn is not None:
|
|
|
+ mat = apply_cmvn(mat, self.cmvn)
|
|
|
feat_length = mat.size(0)
|
|
|
feats.append(mat)
|
|
|
feats_lens.append(feat_length)
|
|
|
@@ -193,8 +194,8 @@ class WavFrontend(torch.nn.Module):
|
|
|
mat = input[i, :input_lengths[i], :]
|
|
|
if self.lfr_m != 1 or self.lfr_n != 1:
|
|
|
mat = apply_lfr(mat, self.lfr_m, self.lfr_n)
|
|
|
- if self.cmvn_file is not None:
|
|
|
- mat = apply_cmvn(mat, self.cmvn_file)
|
|
|
+ if self.cmvn is not None:
|
|
|
+ mat = apply_cmvn(mat, self.cmvn)
|
|
|
feat_length = mat.size(0)
|
|
|
feats.append(mat)
|
|
|
feats_lens.append(feat_length)
|
|
|
@@ -206,7 +207,7 @@ class WavFrontend(torch.nn.Module):
|
|
|
return feats_pad, feats_lens
|
|
|
|
|
|
|
|
|
-class WavFrontendOnline(torch.nn.Module):
|
|
|
+class WavFrontendOnline(AbsFrontend):
|
|
|
"""Conventional frontend structure for streaming ASR/VAD.
|
|
|
"""
|
|
|
|
|
|
@@ -451,7 +452,7 @@ class WavFrontendOnline(torch.nn.Module):
|
|
|
self.lfr_splice_cache = []
|
|
|
|
|
|
|
|
|
-class WavFrontendMel23(torch.nn.Module):
|
|
|
+class WavFrontendMel23(AbsFrontend):
|
|
|
"""Conventional frontend structure for ASR.
|
|
|
"""
|
|
|
|
|
|
@@ -499,4 +500,4 @@ class WavFrontendMel23(torch.nn.Module):
|
|
|
feats_pad = pad_sequence(feats,
|
|
|
batch_first=True,
|
|
|
padding_value=0.0)
|
|
|
- return feats_pad, feats_lens
|
|
|
+ return feats_pad, feats_lens
|