há 3 anos atrás · 777ae05adb
--- a/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer.py
+++ b/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer.py
@@ -0,0 +1,39 @@
 
															+from modelscope.pipelines import pipeline
														
 
															+from modelscope.utils.constant import Tasks
														
 
															+import numpy as np
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    inference_sv_pipline = pipeline(
														
 
															+        task=Tasks.speaker_verification,
														
 
															+        model='damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch'
														
 
															+    )
														
 
															+
														
 
															+    # extract speaker embedding
														
 
															+    # for url use "spk_embedding" as key
														
 
															+    rec_result = inference_sv_pipline(
														
 
															+        audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav')
														
 
															+    enroll = rec_result["spk_embedding"]
														
 
															+
														
 
															+    # for local file use "spk_embedding" as key
														
 
															+    rec_result = inference_sv_pipline(audio_in='sv_example_same.wav')["test1"]
														
 
															+    same = rec_result["spk_embedding"]
														
 
															+
														
 
															+    import soundfile
														
 
															+    wav = soundfile.read('sv_example_enroll.wav')[0]
														
 
															+    # for raw inputs use "spk_embedding" as key
														
 
															+    spk_embedding = inference_sv_pipline(audio_in=wav)["spk_embedding"]
														
 
															+
														
 
															+    rec_result = inference_sv_pipline(
														
 
															+        audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav')
														
 
															+    different = rec_result["spk_embedding"]
														
 
															+
														
 
															+    # calculate cosine similarity for same speaker
														
 
															+    sv_threshold = 0.9465
														
 
															+    same_cos = np.sum(enroll * same) / (np.linalg.norm(enroll) * np.linalg.norm(same))
														
 
															+    same_cos = max(same_cos - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0
														
 
															+    print("Similarity:", same_cos)
														
 
															+
														
 
															+    # calculate cosine similarity for different speaker
														
 
															+    diff_cos = np.sum(enroll * different) / (np.linalg.norm(enroll) * np.linalg.norm(different))
														
 
															+    diff_cos = max(diff_cos - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0
														
 
															+    print("Similarity:", diff_cos)
														
--- a/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer_sv.py
+++ b/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer_sv.py
@@ -0,0 +1,21 @@
 
															+from modelscope.pipelines import pipeline
														
 
															+from modelscope.utils.constant import Tasks
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    inference_sv_pipline = pipeline(
														
 
															+        task=Tasks.speaker_verification,
														
 
															+        model='speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch'
														
 
															+    )
														
 
															+
														
 
															+    # the same speaker
														
 
															+    rec_result = inference_sv_pipline(audio_in=(
														
 
															+        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
														
 
															+        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'))
														
 
															+    print("Similarity", rec_result["scores"])
														
 
															+
														
 
															+    # different speakers
														
 
															+    rec_result = inference_sv_pipline(audio_in=(
														
 
															+        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
														
 
															+        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav'))
														
 
															+
														
 
															+    print("Similarity", rec_result["scores"])
														
--- a/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py
+++ b/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py
@@ -12,20 +12,20 @@ if __name__ == '__main__':
 
															     # for url use "spk_embedding" as key
														
 
															     rec_result = inference_sv_pipline(
														
 
															         audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav')
														
 
															-    enroll = rec_result["utt_id"]
														
 
															+    enroll = rec_result["spk_embedding"]
														
 
															     # for local file use "spk_embedding" as key
														
 
															     rec_result = inference_sv_pipline(audio_in='sv_example_same.wav')["test1"]
														
 
															-    same = rec_result["test1"]
														
 
															+    same = rec_result["spk_embedding"]
														
 
															     import soundfile
														
 
															     wav = soundfile.read('sv_example_enroll.wav')[0]
														
 
															     # for raw inputs use "spk_embedding" as key
														
 
															-    spk_embedding = inference_sv_pipline(audio_in=wav)["utt_id"]
														
 
															+    spk_embedding = inference_sv_pipline(audio_in=wav)["spk_embedding"]
														
 
															     rec_result = inference_sv_pipline(
														
 
															         audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav')
														
 
															-    different = rec_result["utt_id"]
														
 
															+    different = rec_result["spk_embedding"]
														
 
															     # calculate cosine similarity for same speaker
														
 
															     sv_threshold = 0.9465
														
--- a/funasr/models/encoder/resnet34_encoder.py
+++ b/funasr/models/encoder/resnet34_encoder.py
@@ -387,7 +387,6 @@ class ResNet34_SP_L2Reg(AbsEncoder):
 
															         return var_dict_torch_update
														
 
															-
														
 
															 class ResNet34Diar(ResNet34):
														
 
															     def __init__(
														
 
															             self,
														
--- a/funasr/tasks/sv.py
+++ b/funasr/tasks/sv.py
@@ -1,14 +1,18 @@
 
															 import argparse
														
 
															 import logging
														
 
															+import os
														
 
															+from pathlib import Path
														
 
															 from typing import Callable
														
 
															 from typing import Collection
														
 
															 from typing import Dict
														
 
															 from typing import List
														
 
															 from typing import Optional
														
 
															 from typing import Tuple
														
 
															+from typing import Union
														
 
															 import numpy as np
														
 
															 import torch
														
 
															+import yaml
														
 
															 from typeguard import check_argument_types
														
 
															 from typeguard import check_return_type
														
@@ -21,7 +25,7 @@ from funasr.models.e2e_asr import ESPnetASRModel
 
															 from funasr.models.decoder.abs_decoder import AbsDecoder
														
 
															 from funasr.models.encoder.abs_encoder import AbsEncoder
														
 
															 from funasr.models.encoder.rnn_encoder import RNNEncoder
														
 
															-from funasr.models.encoder.resnet34_encoder import ResNet34
														
 
															+from funasr.models.encoder.resnet34_encoder import ResNet34, ResNet34_SP_L2Reg
														
 
															 from funasr.models.pooling.statistic_pooling import StatisticPooling
														
 
															 from funasr.models.decoder.sv_decoder import DenseDecoder
														
 
															 from funasr.models.e2e_sv import ESPnetSVModel
														
@@ -103,6 +107,7 @@ encoder_choices = ClassChoices(
 
															     "encoder",
														
 
															     classes=dict(
														
 
															         resnet34=ResNet34,
														
 
															+        resnet34_sp_l2reg=ResNet34_SP_L2Reg,
														
 
															         rnn=RNNEncoder,
														
 
															     ),
														
 
															     type_check=AbsEncoder,
														
@@ -394,9 +399,16 @@ class SVTask(AbsTask):
 
															         # 7. Pooling layer
														
 
															         pooling_class = pooling_choices.get_class(args.pooling_type)
														
 
															+        pooling_dim = (2, 3)
														
 
															+        eps = 1e-12
														
 
															+        if hasattr(args, "pooling_type_conf"):
														
 
															+            if "pooling_dim" in args.pooling_type_conf:
														
 
															+                pooling_dim = args.pooling_type_conf["pooling_dim"]
														
 
															+            if "eps" in args.pooling_type_conf:
														
 
															+                eps = args.pooling_type_conf["eps"]
														
 
															         pooling_layer = pooling_class(
														
 
															-            pooling_dim=(2, 3),
														
 
															-            eps=1e-12,
														
 
															+            pooling_dim=pooling_dim,
														
 
															+            eps=eps,
														
 
															         )
														
 
															         if args.pooling_type == "statistic":
														
 
															             encoder_output_size *= 2
														
@@ -435,3 +447,95 @@ class SVTask(AbsTask):
 
															         assert check_return_type(model)
														
 
															         return model
														
 
															+
														
 
															+    # ~~~~~~~~~ The methods below are mainly used for inference ~~~~~~~~~
														
 
															+    @classmethod
														
 
															+    def build_model_from_file(
														
 
															+            cls,
														
 
															+            config_file: Union[Path, str] = None,
														
 
															+            model_file: Union[Path, str] = None,
														
 
															+            cmvn_file: Union[Path, str] = None,
														
 
															+            device: str = "cpu",
														
 
															+    ):
														
 
															+        """Build model from the files.
														
 
															+
														
 
															+        This method is used for inference or fine-tuning.
														
 
															+
														
 
															+        Args:
														
 
															+            config_file: The yaml file saved when training.
														
 
															+            model_file: The model file saved when training.
														
 
															+            cmvn_file: The cmvn file for front-end
														
 
															+            device: Device type, "cpu", "cuda", or "cuda:N".
														
 
															+
														
 
															+        """
														
 
															+        assert check_argument_types()
														
 
															+        if config_file is None:
														
 
															+            assert model_file is not None, (
														
 
															+                "The argument 'model_file' must be provided "
														
 
															+                "if the argument 'config_file' is not specified."
														
 
															+            )
														
 
															+            config_file = Path(model_file).parent / "config.yaml"
														
 
															+        else:
														
 
															+            config_file = Path(config_file)
														
 
															+
														
 
															+        with config_file.open("r", encoding="utf-8") as f:
														
 
															+            args = yaml.safe_load(f)
														
 
															+        if cmvn_file is not None:
														
 
															+            args["cmvn_file"] = cmvn_file
														
 
															+        args = argparse.Namespace(**args)
														
 
															+        model = cls.build_model(args)
														
 
															+        if not isinstance(model, AbsESPnetModel):
														
 
															+            raise RuntimeError(
														
 
															+                f"model must inherit {AbsESPnetModel.__name__}, but got {type(model)}"
														
 
															+            )
														
 
															+        model.to(device)
														
 
															+        model_dict = dict()
														
 
															+        model_name_pth = None
														
 
															+        if model_file is not None:
														
 
															+            logging.info("model_file is {}".format(model_file))
														
 
															+            if device == "cuda":
														
 
															+                device = f"cuda:{torch.cuda.current_device()}"
														
 
															+            model_dir = os.path.dirname(model_file)
														
 
															+            model_name = os.path.basename(model_file)
														
 
															+            if "model.ckpt-" in model_name or ".bin" in model_name:
														
 
															+                if ".bin" in model_name:
														
 
															+                    model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
														
 
															+                else:
														
 
															+                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
														
 
															+                if os.path.exists(model_name_pth):
														
 
															+                    logging.info("model_file is load from pth: {}".format(model_name_pth))
														
 
															+                    model_dict = torch.load(model_name_pth, map_location=device)
														
 
															+                else:
														
 
															+                    model_dict = cls.convert_tf2torch(model, model_file)
														
 
															+                model.load_state_dict(model_dict)
														
 
															+            else:
														
 
															+                model_dict = torch.load(model_file, map_location=device)
														
 
															+        model.load_state_dict(model_dict)
														
 
															+        if model_name_pth is not None and not os.path.exists(model_name_pth):
														
 
															+            torch.save(model_dict, model_name_pth)
														
 
															+            logging.info("model_file is saved to pth: {}".format(model_name_pth))
														
 
															+
														
 
															+        return model, args
														
 
															+
														
 
															+    @classmethod
														
 
															+    def convert_tf2torch(
														
 
															+            cls,
														
 
															+            model,
														
 
															+            ckpt,
														
 
															+    ):
														
 
															+        logging.info("start convert tf model to torch model")
														
 
															+        from funasr.modules.streaming_utils.load_fr_tf import load_tf_dict
														
 
															+        var_dict_tf = load_tf_dict(ckpt)
														
 
															+        var_dict_torch = model.state_dict()
														
 
															+        var_dict_torch_update = dict()
														
 
															+        # speech encoder
														
 
															+        var_dict_torch_update_local = model.encoder.convert_tf2torch(var_dict_tf, var_dict_torch)
														
 
															+        var_dict_torch_update.update(var_dict_torch_update_local)
														
 
															+        # pooling layer
														
 
															+        var_dict_torch_update_local = model.pooling_layer.convert_tf2torch(var_dict_tf, var_dict_torch)
														
 
															+        var_dict_torch_update.update(var_dict_torch_update_local)
														
 
															+        # decoder
														
 
															+        var_dict_torch_update_local = model.decoder.convert_tf2torch(var_dict_tf, var_dict_torch)
														
 
															+        var_dict_torch_update.update(var_dict_torch_update_local)
														
 
															+
														
 
															+        return var_dict_torch_update