|
|
@@ -633,6 +633,8 @@ class SANMEncoderChunkOpt(AbsEncoder):
|
|
|
self.embed = torch.nn.Linear(input_size, output_size)
|
|
|
elif input_layer == "pe":
|
|
|
self.embed = SinusoidalPositionEncoder()
|
|
|
+ elif input_layer == "pe_online":
|
|
|
+ self.embed = StreamSinusoidalPositionEncoder()
|
|
|
else:
|
|
|
raise ValueError("unknown input_layer: " + input_layer)
|
|
|
self.normalize_before = normalize_before
|
|
|
@@ -818,6 +820,59 @@ class SANMEncoderChunkOpt(AbsEncoder):
|
|
|
return (xs_pad, intermediate_outs), olens, None
|
|
|
return xs_pad, olens, None
|
|
|
|
|
|
+ def _add_overlap_chunk(self, feats: np.ndarray, cache: dict = {}):
|
|
|
+ if len(cache) == 0:
|
|
|
+ return feats
|
|
|
+ cache["feats"] = to_device(cache["feats"], device=feats.device)
|
|
|
+ overlap_feats = torch.cat((cache["feats"], feats), dim=1)
|
|
|
+ cache["feats"] = overlap_feats[:, -(cache["chunk_size"][0] + cache["chunk_size"][2]):, :]
|
|
|
+ return overlap_feats
|
|
|
+
|
|
|
+ def forward_chunk(self,
|
|
|
+ xs_pad: torch.Tensor,
|
|
|
+ ilens: torch.Tensor,
|
|
|
+ cache: dict = None,
|
|
|
+ ctc: CTC = None,
|
|
|
+ ):
|
|
|
+ xs_pad *= self.output_size() ** 0.5
|
|
|
+ if self.embed is None:
|
|
|
+ xs_pad = xs_pad
|
|
|
+ else:
|
|
|
+ xs_pad = self.embed(xs_pad, cache)
|
|
|
+ if cache["tail_chunk"]:
|
|
|
+ xs_pad = to_device(cache["feats"], device=xs_pad.device)
|
|
|
+ else:
|
|
|
+ xs_pad = self._add_overlap_chunk(xs_pad, cache)
|
|
|
+ encoder_outs = self.encoders0(xs_pad, None, None, None, None)
|
|
|
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
|
|
|
+ intermediate_outs = []
|
|
|
+ if len(self.interctc_layer_idx) == 0:
|
|
|
+ encoder_outs = self.encoders(xs_pad, None, None, None, None)
|
|
|
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
|
|
|
+ else:
|
|
|
+ for layer_idx, encoder_layer in enumerate(self.encoders):
|
|
|
+ encoder_outs = encoder_layer(xs_pad, None, None, None, None)
|
|
|
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
|
|
|
+ if layer_idx + 1 in self.interctc_layer_idx:
|
|
|
+ encoder_out = xs_pad
|
|
|
+
|
|
|
+ # intermediate outputs are also normalized
|
|
|
+ if self.normalize_before:
|
|
|
+ encoder_out = self.after_norm(encoder_out)
|
|
|
+
|
|
|
+ intermediate_outs.append((layer_idx + 1, encoder_out))
|
|
|
+
|
|
|
+ if self.interctc_use_conditioning:
|
|
|
+ ctc_out = ctc.softmax(encoder_out)
|
|
|
+ xs_pad = xs_pad + self.conditioning_layer(ctc_out)
|
|
|
+
|
|
|
+ if self.normalize_before:
|
|
|
+ xs_pad = self.after_norm(xs_pad)
|
|
|
+
|
|
|
+ if len(intermediate_outs) > 0:
|
|
|
+ return (xs_pad, intermediate_outs), None, None
|
|
|
+ return xs_pad, ilens, None
|
|
|
+
|
|
|
def gen_tf2torch_map_dict(self):
|
|
|
tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
|
|
|
tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
|