3 lat temu · 04a7ce3205
--- a/egs/mars/sd/scripts/convert_rttm_to_seg_file.py
+++ b/egs/mars/sd/scripts/convert_rttm_to_seg_file.py
@@ -0,0 +1,57 @@
 
				+import numpy as np
			
 
				+from funasr.utils.job_runner import MultiProcessRunnerV3
			
 
				+from funasr.utils.misc import load_scp_as_list, load_scp_as_dict
			
 
				+import os
			
 
				+import argparse
			
 
				+
			
 
				+
			
 
				+class MyRunner(MultiProcessRunnerV3):
			
 
				+
			
 
				+    def prepare(self, parser):
			
 
				+        parser.add_argument("--rttm_scp", type=str)
			
 
				+        parser.add_argument("--seg_file", type=str)
			
 
				+        args = parser.parse_args()
			
 
				+
			
 
				+        if not os.path.exists(os.path.dirname(args.seg_file)):
			
 
				+            os.makedirs(os.path.dirname(args.seg_file))
			
 
				+
			
 
				+        task_list = load_scp_as_list(args.rttm_scp)
			
 
				+        return task_list, None, args
			
 
				+
			
 
				+    def post(self, results_list, args):
			
 
				+        with open(args.seg_file, "wt", encoding="utf-8") as fd:
			
 
				+            for results in results_list:
			
 
				+                fd.writelines(results)
			
 
				+
			
 
				+
			
 
				+def process(task_args):
			
 
				+    _, task_list, _, args = task_args
			
 
				+    outputs = []
			
 
				+    for mid, rttm_path in task_list:
			
 
				+        spk_turns = []
			
 
				+        length = 0
			
 
				+        for one_line in open(rttm_path, 'rt', encoding="utf-8"):
			
 
				+            parts = one_line.strip().split(" ")
			
 
				+            _, st, dur, spk_name = parts[1], float(parts[3]), float(parts[4]), parts[7]
			
 
				+            st, ed = int(st*100), int((st + dur)*100)
			
 
				+            length = ed if ed > length else length
			
 
				+            spk_turns.append([mid, st, ed, spk_name])
			
 
				+        is_sph = np.zeros((length+1, ), dtype=bool)
			
 
				+        for _, st, ed, _ in spk_turns:
			
 
				+            is_sph[st:ed] = True
			
 
				+
			
 
				+        st, in_speech = 0, False
			
 
				+        for i in range(length+1):
			
 
				+            if not in_speech and is_sph[i]:
			
 
				+                st, in_speech = i, True
			
 
				+            if in_speech and not is_sph[i]:
			
 
				+                in_speech = False
			
 
				+                outputs.append("{}-{:07d}-{:07d} {} {:.2f} {:.2f}\n".format(
			
 
				+                    mid, st, i, mid, float(st)/100, float(i)/100
			
 
				+                ))
			
 
				+    return outputs
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    my_runner = MyRunner(process)
			
 
				+    my_runner.run()
			
--- a/funasr/bin/diar_train.py
+++ b/funasr/bin/diar_train.py
@@ -0,0 +1,46 @@
 
				+#!/usr/bin/env python3
			
 
				+
			
 
				+import os
			
 
				+
			
 
				+from funasr.tasks.diar import DiarTask
			
 
				+
			
 
				+
			
 
				+# for ASR Training
			
 
				+def parse_args():
			
 
				+    parser = DiarTask.get_parser()
			
 
				+    parser.add_argument(
			
 
				+        "--gpu_id",
			
 
				+        type=int,
			
 
				+        default=0,
			
 
				+        help="local gpu id.",
			
 
				+    )
			
 
				+    args = parser.parse_args()
			
 
				+    return args
			
 
				+
			
 
				+
			
 
				+def main(args=None, cmd=None):
			
 
				+    # for ASR Training
			
 
				+    DiarTask.main(args=args, cmd=cmd)
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    args = parse_args()
			
 
				+
			
 
				+    # setup local gpu_id
			
 
				+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
			
 
				+
			
 
				+    # DDP settings
			
 
				+    if args.ngpu > 1:
			
 
				+        args.distributed = True
			
 
				+    else:
			
 
				+        args.distributed = False
			
 
				+    assert args.num_worker_count == 1
			
 
				+
			
 
				+    # re-compute batch size: when dataset type is small
			
 
				+    if args.dataset_type == "small":
			
 
				+        if args.batch_size is not None:
			
 
				+            args.batch_size = args.batch_size * args.ngpu
			
 
				+        if args.batch_bins is not None:
			
 
				+            args.batch_bins = args.batch_bins * args.ngpu
			
 
				+
			
 
				+    main(args=args)
			
--- a/funasr/models/e2e_diar_sond.py
+++ b/funasr/models/e2e_diar_sond.py
@@ -86,6 +86,8 @@ class DiarSondModel(AbsESPnetModel):
 
				         )
			
 
				         self.criterion_bce = SequenceBinaryCrossEntropy(normalize_length=length_normalized_loss)
			
 
				         self.pse_embedding = self.generate_pse_embedding()
			
 
				+        self.power_weight = torch.from_numpy(2 ** np.arange(max_spk_num)[np.newaxis, np.newaxis, :])
			
 
				+        self.int_token_arr = torch.from_numpy(np.array(self.token_list).astype(int)[np.newaxis, np.newaxis, :])
			
 
				         self.speaker_discrimination_loss_weight = speaker_discrimination_loss_weight
			
 
				         self.inter_score_loss_weight = inter_score_loss_weight
			
 
				 
			
@@ -102,8 +104,8 @@ class DiarSondModel(AbsESPnetModel):
 
				         speech_lengths: torch.Tensor = None,
			
 
				         profile: torch.Tensor = None,
			
 
				         profile_lengths: torch.Tensor = None,
			
 
				-        spk_labels: torch.Tensor = None,
			
 
				-        spk_labels_lengths: torch.Tensor = None,
			
 
				+        binary_labels: torch.Tensor = None,
			
 
				+        binary_labels_lengths: torch.Tensor = None,
			
 
				     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
			
 
				         """Frontend + Encoder + Speaker Encoder + CI Scorer + CD Scorer + Decoder + Calc loss
			
 
				 
			
@@ -116,10 +118,10 @@ class DiarSondModel(AbsESPnetModel):
 
				                                      espnet2/iterators/chunk_iter_factory.py
			
 
				             profile: (Batch, N_spk, dim)
			
 
				             profile_lengths: (Batch,)
			
 
				-            spk_labels: (Batch, frames, input_size)
			
 
				-            spk_labels_lengths: (Batch,)
			
 
				+            binary_labels: (Batch, frames, max_spk_num)
			
 
				+            binary_labels_lengths: (Batch,)
			
 
				         """
			
 
				-        assert speech.shape[0] == spk_labels.shape[0], (speech.shape, spk_labels.shape)
			
 
				+        assert speech.shape[0] == binary_labels.shape[0], (speech.shape, binary_labels.shape)
			
 
				         batch_size = speech.shape[0]
			
 
				 
			
 
				         # 1. Network forward
			
@@ -132,23 +134,25 @@ class DiarSondModel(AbsESPnetModel):
 
				 
			
 
				         # 2. Aggregate time-domain labels to match forward outputs
			
 
				         if self.label_aggregator is not None:
			
 
				-            spk_labels, spk_labels_lengths = self.label_aggregator(
			
 
				-                spk_labels.unsqueeze(2), spk_labels_lengths
			
 
				+            binary_labels, binary_labels_lengths = self.label_aggregator(
			
 
				+                binary_labels, binary_labels_lengths
			
 
				             )
			
 
				-            spk_labels = spk_labels.squeeze(2)
			
 
				+        # 2. Calculate power-set encoding (PSE) labels
			
 
				+        raw_pse_labels = torch.sum(binary_labels * self.power_weight, dim=2, keepdim=True)
			
 
				+        pse_labels = torch.argmax(raw_pse_labels == self.int_token_arr, dim=2)
			
 
				 
			
 
				         # If encoder uses conv* as input_layer (i.e., subsampling),
			
 
				         # the sequence length of 'pred' might be slightly less than the
			
 
				         # length of 'pse_labels'. Here we force them to be equal.
			
 
				         length_diff_tolerance = 2
			
 
				-        length_diff = spk_labels.shape[1] - pred.shape[1]
			
 
				+        length_diff = pse_labels.shape[1] - pred.shape[1]
			
 
				         if 0 < length_diff <= length_diff_tolerance:
			
 
				-            spk_labels = spk_labels[:, 0: pred.shape[1], :]
			
 
				+            pse_labels = pse_labels[:, 0: pred.shape[1]]
			
 
				 
			
 
				-        loss_diar = self.classification_loss(pred, spk_labels, spk_labels_lengths)
			
 
				+        loss_diar = self.classification_loss(pred, pse_labels, binary_labels_lengths)
			
 
				         loss_spk_dis = self.speaker_discrimination_loss(profile, profile_lengths)
			
 
				-        loss_inter_ci, loss_inter_cd = self.internal_score_loss(cd_score, ci_score, spk_labels, spk_labels_lengths)
			
 
				-        label_mask = make_pad_mask(spk_labels_lengths, maxlen=spk_labels.shape[1])
			
 
				+        loss_inter_ci, loss_inter_cd = self.internal_score_loss(cd_score, ci_score, pse_labels, binary_labels_lengths)
			
 
				+        label_mask = make_pad_mask(binary_labels_lengths, maxlen=pse_labels.shape[1])
			
 
				         loss = (loss_diar + self.speaker_discrimination_loss_weight * loss_spk_dis
			
 
				                 + self.inter_score_loss_weight * (loss_inter_ci + loss_inter_cd))
			
 
				 
			
@@ -164,8 +168,8 @@ class DiarSondModel(AbsESPnetModel):
 
				             speaker_error,
			
 
				         ) = self.calc_diarization_error(
			
 
				             pred=F.embedding(pred.argmax(dim=2) * label_mask, self.pse_embedding),
			
 
				-            label=F.embedding(spk_labels * label_mask, self.pse_embedding),
			
 
				-            length=spk_labels_lengths
			
 
				+            label=F.embedding(pse_labels * label_mask, self.pse_embedding),
			
 
				+            length=binary_labels_lengths
			
 
				         )
			
 
				 
			
 
				         if speech_scored > 0 and num_frames > 0:
			
--- a/funasr/models/encoder/ecapa_tdnn_encoder.py
+++ b/funasr/models/encoder/ecapa_tdnn_encoder.py
@@ -0,0 +1,689 @@
 
				+import math
			
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+import torch.nn.functional as F
			
 
				+
			
 
				+
			
 
				+class _BatchNorm1d(nn.Module):
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        input_shape=None,
			
 
				+        input_size=None,
			
 
				+        eps=1e-05,
			
 
				+        momentum=0.1,
			
 
				+        affine=True,
			
 
				+        track_running_stats=True,
			
 
				+        combine_batch_time=False,
			
 
				+        skip_transpose=False,
			
 
				+    ):
			
 
				+        super().__init__()
			
 
				+        self.combine_batch_time = combine_batch_time
			
 
				+        self.skip_transpose = skip_transpose
			
 
				+
			
 
				+        if input_size is None and skip_transpose:
			
 
				+            input_size = input_shape[1]
			
 
				+        elif input_size is None:
			
 
				+            input_size = input_shape[-1]
			
 
				+
			
 
				+        self.norm = nn.BatchNorm1d(
			
 
				+            input_size,
			
 
				+            eps=eps,
			
 
				+            momentum=momentum,
			
 
				+            affine=affine,
			
 
				+            track_running_stats=track_running_stats,
			
 
				+        )
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        shape_or = x.shape
			
 
				+        if self.combine_batch_time:
			
 
				+            if x.ndim == 3:
			
 
				+                x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
			
 
				+            else:
			
 
				+                x = x.reshape(
			
 
				+                    shape_or[0] * shape_or[1], shape_or[3], shape_or[2]
			
 
				+                )
			
 
				+
			
 
				+        elif not self.skip_transpose:
			
 
				+            x = x.transpose(-1, 1)
			
 
				+
			
 
				+        x_n = self.norm(x)
			
 
				+
			
 
				+        if self.combine_batch_time:
			
 
				+            x_n = x_n.reshape(shape_or)
			
 
				+        elif not self.skip_transpose:
			
 
				+            x_n = x_n.transpose(1, -1)
			
 
				+
			
 
				+        return x_n
			
 
				+
			
 
				+
			
 
				+class _Conv1d(nn.Module):
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        out_channels,
			
 
				+        kernel_size,
			
 
				+        input_shape=None,
			
 
				+        in_channels=None,
			
 
				+        stride=1,
			
 
				+        dilation=1,
			
 
				+        padding="same",
			
 
				+        groups=1,
			
 
				+        bias=True,
			
 
				+        padding_mode="reflect",
			
 
				+        skip_transpose=False,
			
 
				+    ):
			
 
				+        super().__init__()
			
 
				+        self.kernel_size = kernel_size
			
 
				+        self.stride = stride
			
 
				+        self.dilation = dilation
			
 
				+        self.padding = padding
			
 
				+        self.padding_mode = padding_mode
			
 
				+        self.unsqueeze = False
			
 
				+        self.skip_transpose = skip_transpose
			
 
				+
			
 
				+        if input_shape is None and in_channels is None:
			
 
				+            raise ValueError("Must provide one of input_shape or in_channels")
			
 
				+
			
 
				+        if in_channels is None:
			
 
				+            in_channels = self._check_input_shape(input_shape)
			
 
				+
			
 
				+        self.conv = nn.Conv1d(
			
 
				+            in_channels,
			
 
				+            out_channels,
			
 
				+            self.kernel_size,
			
 
				+            stride=self.stride,
			
 
				+            dilation=self.dilation,
			
 
				+            padding=0,
			
 
				+            groups=groups,
			
 
				+            bias=bias,
			
 
				+        )
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        if not self.skip_transpose:
			
 
				+            x = x.transpose(1, -1)
			
 
				+
			
 
				+        if self.unsqueeze:
			
 
				+            x = x.unsqueeze(1)
			
 
				+
			
 
				+        if self.padding == "same":
			
 
				+            x = self._manage_padding(
			
 
				+                x, self.kernel_size, self.dilation, self.stride
			
 
				+            )
			
 
				+
			
 
				+        elif self.padding == "causal":
			
 
				+            num_pad = (self.kernel_size - 1) * self.dilation
			
 
				+            x = F.pad(x, (num_pad, 0))
			
 
				+
			
 
				+        elif self.padding == "valid":
			
 
				+            pass
			
 
				+
			
 
				+        else:
			
 
				+            raise ValueError(
			
 
				+                "Padding must be 'same', 'valid' or 'causal'. Got "
			
 
				+                + self.padding
			
 
				+            )
			
 
				+
			
 
				+        wx = self.conv(x)
			
 
				+
			
 
				+        if self.unsqueeze:
			
 
				+            wx = wx.squeeze(1)
			
 
				+
			
 
				+        if not self.skip_transpose:
			
 
				+            wx = wx.transpose(1, -1)
			
 
				+
			
 
				+        return wx
			
 
				+
			
 
				+    def _manage_padding(
			
 
				+        self, x, kernel_size: int, dilation: int, stride: int,
			
 
				+    ):
			
 
				+        # Detecting input shape
			
 
				+        L_in = x.shape[-1]
			
 
				+
			
 
				+        # Time padding
			
 
				+        padding = get_padding_elem(L_in, stride, kernel_size, dilation)
			
 
				+
			
 
				+        # Applying padding
			
 
				+        x = F.pad(x, padding, mode=self.padding_mode)
			
 
				+
			
 
				+        return x
			
 
				+
			
 
				+    def _check_input_shape(self, shape):
			
 
				+        """Checks the input shape and returns the number of input channels.
			
 
				+        """
			
 
				+
			
 
				+        if len(shape) == 2:
			
 
				+            self.unsqueeze = True
			
 
				+            in_channels = 1
			
 
				+        elif self.skip_transpose:
			
 
				+            in_channels = shape[1]
			
 
				+        elif len(shape) == 3:
			
 
				+            in_channels = shape[2]
			
 
				+        else:
			
 
				+            raise ValueError(
			
 
				+                "conv1d expects 2d, 3d inputs. Got " + str(len(shape))
			
 
				+            )
			
 
				+
			
 
				+        # Kernel size must be odd
			
 
				+        if self.kernel_size % 2 == 0:
			
 
				+            raise ValueError(
			
 
				+                "The field kernel size must be an odd number. Got %s."
			
 
				+                % (self.kernel_size)
			
 
				+            )
			
 
				+        return in_channels
			
 
				+
			
 
				+
			
 
				+def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
			
 
				+    if stride > 1:
			
 
				+        n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
			
 
				+        L_out = stride * (n_steps - 1) + kernel_size * dilation
			
 
				+        padding = [kernel_size // 2, kernel_size // 2]
			
 
				+
			
 
				+    else:
			
 
				+        L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
			
 
				+
			
 
				+        padding = [(L_in - L_out) // 2, (L_in - L_out) // 2]
			
 
				+    return padding
			
 
				+
			
 
				+
			
 
				+# Skip transpose as much as possible for efficiency
			
 
				+class Conv1d(_Conv1d):
			
 
				+    def __init__(self, *args, **kwargs):
			
 
				+        super().__init__(skip_transpose=True, *args, **kwargs)
			
 
				+
			
 
				+
			
 
				+class BatchNorm1d(_BatchNorm1d):
			
 
				+    def __init__(self, *args, **kwargs):
			
 
				+        super().__init__(skip_transpose=True, *args, **kwargs)
			
 
				+
			
 
				+
			
 
				+def length_to_mask(length, max_len=None, dtype=None, device=None):
			
 
				+    assert len(length.shape) == 1
			
 
				+
			
 
				+    if max_len is None:
			
 
				+        max_len = length.max().long().item()  # using arange to generate mask
			
 
				+    mask = torch.arange(
			
 
				+        max_len, device=length.device, dtype=length.dtype
			
 
				+    ).expand(len(length), max_len) < length.unsqueeze(1)
			
 
				+
			
 
				+    if dtype is None:
			
 
				+        dtype = length.dtype
			
 
				+
			
 
				+    if device is None:
			
 
				+        device = length.device
			
 
				+
			
 
				+    mask = torch.as_tensor(mask, dtype=dtype, device=device)
			
 
				+    return mask
			
 
				+
			
 
				+
			
 
				+class TDNNBlock(nn.Module):
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        in_channels,
			
 
				+        out_channels,
			
 
				+        kernel_size,
			
 
				+        dilation,
			
 
				+        activation=nn.ReLU,
			
 
				+        groups=1,
			
 
				+    ):
			
 
				+        super(TDNNBlock, self).__init__()
			
 
				+        self.conv = Conv1d(
			
 
				+            in_channels=in_channels,
			
 
				+            out_channels=out_channels,
			
 
				+            kernel_size=kernel_size,
			
 
				+            dilation=dilation,
			
 
				+            groups=groups,
			
 
				+        )
			
 
				+        self.activation = activation()
			
 
				+        self.norm = BatchNorm1d(input_size=out_channels)
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        return self.norm(self.activation(self.conv(x)))
			
 
				+
			
 
				+
			
 
				+class Res2NetBlock(torch.nn.Module):
			
 
				+    """An implementation of Res2NetBlock w/ dilation.
			
 
				+
			
 
				+    Arguments
			
 
				+    ---------
			
 
				+    in_channels : int
			
 
				+        The number of channels expected in the input.
			
 
				+    out_channels : int
			
 
				+        The number of output channels.
			
 
				+    scale : int
			
 
				+        The scale of the Res2Net block.
			
 
				+    kernel_size: int
			
 
				+        The kernel size of the Res2Net block.
			
 
				+    dilation : int
			
 
				+        The dilation of the Res2Net block.
			
 
				+
			
 
				+    Example
			
 
				+    -------
			
 
				+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
			
 
				+    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
			
 
				+    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
			
 
				+    >>> out_tensor.shape
			
 
				+    torch.Size([8, 120, 64])
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1
			
 
				+    ):
			
 
				+        super(Res2NetBlock, self).__init__()
			
 
				+        assert in_channels % scale == 0
			
 
				+        assert out_channels % scale == 0
			
 
				+
			
 
				+        in_channel = in_channels // scale
			
 
				+        hidden_channel = out_channels // scale
			
 
				+
			
 
				+        self.blocks = nn.ModuleList(
			
 
				+            [
			
 
				+                TDNNBlock(
			
 
				+                    in_channel,
			
 
				+                    hidden_channel,
			
 
				+                    kernel_size=kernel_size,
			
 
				+                    dilation=dilation,
			
 
				+                )
			
 
				+                for i in range(scale - 1)
			
 
				+            ]
			
 
				+        )
			
 
				+        self.scale = scale
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        y = []
			
 
				+        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
			
 
				+            if i == 0:
			
 
				+                y_i = x_i
			
 
				+            elif i == 1:
			
 
				+                y_i = self.blocks[i - 1](x_i)
			
 
				+            else:
			
 
				+                y_i = self.blocks[i - 1](x_i + y_i)
			
 
				+            y.append(y_i)
			
 
				+        y = torch.cat(y, dim=1)
			
 
				+        return y
			
 
				+
			
 
				+
			
 
				+class SEBlock(nn.Module):
			
 
				+    """An implementation of squeeze-and-excitation block.
			
 
				+
			
 
				+    Arguments
			
 
				+    ---------
			
 
				+    in_channels : int
			
 
				+        The number of input channels.
			
 
				+    se_channels : int
			
 
				+        The number of output channels after squeeze.
			
 
				+    out_channels : int
			
 
				+        The number of output channels.
			
 
				+
			
 
				+    Example
			
 
				+    -------
			
 
				+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
			
 
				+    >>> se_layer = SEBlock(64, 16, 64)
			
 
				+    >>> lengths = torch.rand((8,))
			
 
				+    >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
			
 
				+    >>> out_tensor.shape
			
 
				+    torch.Size([8, 120, 64])
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, in_channels, se_channels, out_channels):
			
 
				+        super(SEBlock, self).__init__()
			
 
				+
			
 
				+        self.conv1 = Conv1d(
			
 
				+            in_channels=in_channels, out_channels=se_channels, kernel_size=1
			
 
				+        )
			
 
				+        self.relu = torch.nn.ReLU(inplace=True)
			
 
				+        self.conv2 = Conv1d(
			
 
				+            in_channels=se_channels, out_channels=out_channels, kernel_size=1
			
 
				+        )
			
 
				+        self.sigmoid = torch.nn.Sigmoid()
			
 
				+
			
 
				+    def forward(self, x, lengths=None):
			
 
				+        L = x.shape[-1]
			
 
				+        if lengths is not None:
			
 
				+            mask = length_to_mask(lengths * L, max_len=L, device=x.device)
			
 
				+            mask = mask.unsqueeze(1)
			
 
				+            total = mask.sum(dim=2, keepdim=True)
			
 
				+            s = (x * mask).sum(dim=2, keepdim=True) / total
			
 
				+        else:
			
 
				+            s = x.mean(dim=2, keepdim=True)
			
 
				+
			
 
				+        s = self.relu(self.conv1(s))
			
 
				+        s = self.sigmoid(self.conv2(s))
			
 
				+
			
 
				+        return s * x
			
 
				+
			
 
				+
			
 
				+class AttentiveStatisticsPooling(nn.Module):
			
 
				+    """This class implements an attentive statistic pooling layer for each channel.
			
 
				+    It returns the concatenated mean and std of the input tensor.
			
 
				+
			
 
				+    Arguments
			
 
				+    ---------
			
 
				+    channels: int
			
 
				+        The number of input channels.
			
 
				+    attention_channels: int
			
 
				+        The number of attention channels.
			
 
				+
			
 
				+    Example
			
 
				+    -------
			
 
				+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
			
 
				+    >>> asp_layer = AttentiveStatisticsPooling(64)
			
 
				+    >>> lengths = torch.rand((8,))
			
 
				+    >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
			
 
				+    >>> out_tensor.shape
			
 
				+    torch.Size([8, 1, 128])
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, channels, attention_channels=128, global_context=True):
			
 
				+        super().__init__()
			
 
				+
			
 
				+        self.eps = 1e-12
			
 
				+        self.global_context = global_context
			
 
				+        if global_context:
			
 
				+            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
			
 
				+        else:
			
 
				+            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
			
 
				+        self.tanh = nn.Tanh()
			
 
				+        self.conv = Conv1d(
			
 
				+            in_channels=attention_channels, out_channels=channels, kernel_size=1
			
 
				+        )
			
 
				+
			
 
				+    def forward(self, x, lengths=None):
			
 
				+        """Calculates mean and std for a batch (input tensor).
			
 
				+
			
 
				+        Arguments
			
 
				+        ---------
			
 
				+        x : torch.Tensor
			
 
				+            Tensor of shape [N, C, L].
			
 
				+        """
			
 
				+        L = x.shape[-1]
			
 
				+
			
 
				+        def _compute_statistics(x, m, dim=2, eps=self.eps):
			
 
				+            mean = (m * x).sum(dim)
			
 
				+            std = torch.sqrt(
			
 
				+                (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)
			
 
				+            )
			
 
				+            return mean, std
			
 
				+
			
 
				+        if lengths is None:
			
 
				+            lengths = torch.ones(x.shape[0], device=x.device)
			
 
				+
			
 
				+        # Make binary mask of shape [N, 1, L]
			
 
				+        mask = length_to_mask(lengths * L, max_len=L, device=x.device)
			
 
				+        mask = mask.unsqueeze(1)
			
 
				+
			
 
				+        # Expand the temporal context of the pooling layer by allowing the
			
 
				+        # self-attention to look at global properties of the utterance.
			
 
				+        if self.global_context:
			
 
				+            # torch.std is unstable for backward computation
			
 
				+            # https://github.com/pytorch/pytorch/issues/4320
			
 
				+            total = mask.sum(dim=2, keepdim=True).float()
			
 
				+            mean, std = _compute_statistics(x, mask / total)
			
 
				+            mean = mean.unsqueeze(2).repeat(1, 1, L)
			
 
				+            std = std.unsqueeze(2).repeat(1, 1, L)
			
 
				+            attn = torch.cat([x, mean, std], dim=1)
			
 
				+        else:
			
 
				+            attn = x
			
 
				+
			
 
				+        # Apply layers
			
 
				+        attn = self.conv(self.tanh(self.tdnn(attn)))
			
 
				+
			
 
				+        # Filter out zero-paddings
			
 
				+        attn = attn.masked_fill(mask == 0, float("-inf"))
			
 
				+
			
 
				+        attn = F.softmax(attn, dim=2)
			
 
				+        mean, std = _compute_statistics(x, attn)
			
 
				+        # Append mean and std of the batch
			
 
				+        pooled_stats = torch.cat((mean, std), dim=1)
			
 
				+        pooled_stats = pooled_stats.unsqueeze(2)
			
 
				+
			
 
				+        return pooled_stats
			
 
				+
			
 
				+
			
 
				+class SERes2NetBlock(nn.Module):
			
 
				+    """An implementation of building block in ECAPA-TDNN, i.e.,
			
 
				+    TDNN-Res2Net-TDNN-SEBlock.
			
 
				+
			
 
				+    Arguments
			
 
				+    ----------
			
 
				+    out_channels: int
			
 
				+        The number of output channels.
			
 
				+    res2net_scale: int
			
 
				+        The scale of the Res2Net block.
			
 
				+    kernel_size: int
			
 
				+        The kernel size of the TDNN blocks.
			
 
				+    dilation: int
			
 
				+        The dilation of the Res2Net block.
			
 
				+    activation : torch class
			
 
				+        A class for constructing the activation layers.
			
 
				+    groups: int
			
 
				+    Number of blocked connections from input channels to output channels.
			
 
				+
			
 
				+    Example
			
 
				+    -------
			
 
				+    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
			
 
				+    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
			
 
				+    >>> out = conv(x).transpose(1, 2)
			
 
				+    >>> out.shape
			
 
				+    torch.Size([8, 120, 64])
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        in_channels,
			
 
				+        out_channels,
			
 
				+        res2net_scale=8,
			
 
				+        se_channels=128,
			
 
				+        kernel_size=1,
			
 
				+        dilation=1,
			
 
				+        activation=torch.nn.ReLU,
			
 
				+        groups=1,
			
 
				+    ):
			
 
				+        super().__init__()
			
 
				+        self.out_channels = out_channels
			
 
				+        self.tdnn1 = TDNNBlock(
			
 
				+            in_channels,
			
 
				+            out_channels,
			
 
				+            kernel_size=1,
			
 
				+            dilation=1,
			
 
				+            activation=activation,
			
 
				+            groups=groups,
			
 
				+        )
			
 
				+        self.res2net_block = Res2NetBlock(
			
 
				+            out_channels, out_channels, res2net_scale, kernel_size, dilation
			
 
				+        )
			
 
				+        self.tdnn2 = TDNNBlock(
			
 
				+            out_channels,
			
 
				+            out_channels,
			
 
				+            kernel_size=1,
			
 
				+            dilation=1,
			
 
				+            activation=activation,
			
 
				+            groups=groups,
			
 
				+        )
			
 
				+        self.se_block = SEBlock(out_channels, se_channels, out_channels)
			
 
				+
			
 
				+        self.shortcut = None
			
 
				+        if in_channels != out_channels:
			
 
				+            self.shortcut = Conv1d(
			
 
				+                in_channels=in_channels,
			
 
				+                out_channels=out_channels,
			
 
				+                kernel_size=1,
			
 
				+            )
			
 
				+
			
 
				+    def forward(self, x, lengths=None):
			
 
				+        residual = x
			
 
				+        if self.shortcut:
			
 
				+            residual = self.shortcut(x)
			
 
				+
			
 
				+        x = self.tdnn1(x)
			
 
				+        x = self.res2net_block(x)
			
 
				+        x = self.tdnn2(x)
			
 
				+        x = self.se_block(x, lengths)
			
 
				+
			
 
				+        return x + residual
			
 
				+
			
 
				+
			
 
				+class ECAPA_TDNN(torch.nn.Module):
			
 
				+    """An implementation of the speaker embedding model in a paper.
			
 
				+    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
			
 
				+    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
			
 
				+
			
 
				+    Arguments
			
 
				+    ---------
			
 
				+    device : str
			
 
				+        Device used, e.g., "cpu" or "cuda".
			
 
				+    activation : torch class
			
 
				+        A class for constructing the activation layers.
			
 
				+    channels : list of ints
			
 
				+        Output channels for TDNN/SERes2Net layer.
			
 
				+    kernel_sizes : list of ints
			
 
				+        List of kernel sizes for each layer.
			
 
				+    dilations : list of ints
			
 
				+        List of dilations for kernels in each layer.
			
 
				+    lin_neurons : int
			
 
				+        Number of neurons in linear layers.
			
 
				+    groups : list of ints
			
 
				+        List of groups for kernels in each layer.
			
 
				+
			
 
				+    Example
			
 
				+    -------
			
 
				+    >>> input_feats = torch.rand([5, 120, 80])
			
 
				+    >>> compute_embedding = ECAPA_TDNN(80, lin_neurons=192)
			
 
				+    >>> outputs = compute_embedding(input_feats)
			
 
				+    >>> outputs.shape
			
 
				+    torch.Size([5, 1, 192])
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        input_size,
			
 
				+        device="cpu",
			
 
				+        lin_neurons=192,
			
 
				+        activation=torch.nn.ReLU,
			
 
				+        channels=[512, 512, 512, 512, 1536],
			
 
				+        kernel_sizes=[5, 3, 3, 3, 1],
			
 
				+        dilations=[1, 2, 3, 4, 1],
			
 
				+        attention_channels=128,
			
 
				+        res2net_scale=8,
			
 
				+        se_channels=128,
			
 
				+        global_context=True,
			
 
				+        groups=[1, 1, 1, 1, 1],
			
 
				+        window_size=20,
			
 
				+        window_shift=1,
			
 
				+    ):
			
 
				+
			
 
				+        super().__init__()
			
 
				+        assert len(channels) == len(kernel_sizes)
			
 
				+        assert len(channels) == len(dilations)
			
 
				+        self.channels = channels
			
 
				+        self.blocks = nn.ModuleList()
			
 
				+        self.window_size = window_size
			
 
				+        self.window_shift = window_shift
			
 
				+
			
 
				+        # The initial TDNN layer
			
 
				+        self.blocks.append(
			
 
				+            TDNNBlock(
			
 
				+                input_size,
			
 
				+                channels[0],
			
 
				+                kernel_sizes[0],
			
 
				+                dilations[0],
			
 
				+                activation,
			
 
				+                groups[0],
			
 
				+            )
			
 
				+        )
			
 
				+
			
 
				+        # SE-Res2Net layers
			
 
				+        for i in range(1, len(channels) - 1):
			
 
				+            self.blocks.append(
			
 
				+                SERes2NetBlock(
			
 
				+                    channels[i - 1],
			
 
				+                    channels[i],
			
 
				+                    res2net_scale=res2net_scale,
			
 
				+                    se_channels=se_channels,
			
 
				+                    kernel_size=kernel_sizes[i],
			
 
				+                    dilation=dilations[i],
			
 
				+                    activation=activation,
			
 
				+                    groups=groups[i],
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+        # Multi-layer feature aggregation
			
 
				+        self.mfa = TDNNBlock(
			
 
				+            channels[-1],
			
 
				+            channels[-1],
			
 
				+            kernel_sizes[-1],
			
 
				+            dilations[-1],
			
 
				+            activation,
			
 
				+            groups=groups[-1],
			
 
				+        )
			
 
				+
			
 
				+        # Attentive Statistical Pooling
			
 
				+        self.asp = AttentiveStatisticsPooling(
			
 
				+            channels[-1],
			
 
				+            attention_channels=attention_channels,
			
 
				+            global_context=global_context,
			
 
				+        )
			
 
				+        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)
			
 
				+
			
 
				+        # Final linear transformation
			
 
				+        self.fc = Conv1d(
			
 
				+            in_channels=channels[-1] * 2,
			
 
				+            out_channels=lin_neurons,
			
 
				+            kernel_size=1,
			
 
				+        )
			
 
				+
			
 
				+    def windowed_pooling(self, x, lengths=None):
			
 
				+        # x: Batch, Channel, Time
			
 
				+        tt = x.shape[2]
			
 
				+        num_chunk = int(math.ceil(tt / self.window_shift))
			
 
				+        pad = self.window_size // 2
			
 
				+        x = F.pad(x, (pad, pad, 0, 0), "reflect")
			
 
				+        stat_list = []
			
 
				+
			
 
				+        for i in range(num_chunk):
			
 
				+            # B x C
			
 
				+            st, ed = i * self.window_shift, i * self.window_shift + self.window_size
			
 
				+            x = self.asp(x[:, :, st: ed],
			
 
				+                         lengths=torch.clamp(lengths - i, 0, self.window_size)
			
 
				+                         if lengths is not None else None)
			
 
				+            x = self.asp_bn(x)
			
 
				+            x = self.fc(x)
			
 
				+            stat_list.append(x)
			
 
				+
			
 
				+        return torch.cat(stat_list, dim=2)
			
 
				+
			
 
				+    def forward(self, x, lengths=None):
			
 
				+        """Returns the embedding vector.
			
 
				+
			
 
				+        Arguments
			
 
				+        ---------
			
 
				+        x : torch.Tensor
			
 
				+            Tensor of shape (batch, time, channel).
			
 
				+        lengths: torch.Tensor
			
 
				+            Tensor of shape (batch, )
			
 
				+        """
			
 
				+        # Minimize transpose for efficiency
			
 
				+        x = x.transpose(1, 2)
			
 
				+
			
 
				+        xl = []
			
 
				+        for layer in self.blocks:
			
 
				+            try:
			
 
				+                x = layer(x, lengths=lengths)
			
 
				+            except TypeError:
			
 
				+                x = layer(x)
			
 
				+            xl.append(x)
			
 
				+
			
 
				+        # Multi-layer feature aggregation
			
 
				+        x = torch.cat(xl[1:], dim=1)
			
 
				+        x = self.mfa(x)
			
 
				+
			
 
				+        if self.window_size is None:
			
 
				+            # Attentive Statistical Pooling
			
 
				+            x = self.asp(x, lengths=lengths)
			
 
				+            x = self.asp_bn(x)
			
 
				+            # Final linear transformation
			
 
				+            x = self.fc(x)
			
 
				+            # x = x.transpose(1, 2)
			
 
				+            x = x.squeeze(2)  # -> B, C
			
 
				+        else:
			
 
				+            x = self.windowed_pooling(x, lengths)
			
 
				+            x = x.transpose(1, 2)  # -> B, T, C
			
 
				+        return x
			
--- a/funasr/tasks/diar.py
+++ b/funasr/tasks/diar.py
@@ -368,7 +368,7 @@ class DiarTask(AbsTask):
 
				             cls, train: bool = True, inference: bool = False
			
 
				     ) -> Tuple[str, ...]:
			
 
				         if not inference:
			
 
				-            retval = ("speech", "profile", "label")
			
 
				+            retval = ("speech", "profile", "binary_labels")
			
 
				         else:
			
 
				             # Recognition mode
			
 
				             retval = ("speech", "profile")