|
|
@@ -3,6 +3,7 @@ import argparse
|
|
|
import logging
|
|
|
import sys
|
|
|
import time
|
|
|
+import json
|
|
|
from pathlib import Path
|
|
|
from typing import Optional
|
|
|
from typing import Sequence
|
|
|
@@ -100,10 +101,13 @@ class Speech2Text:
|
|
|
# logging.info("asr_train_args: {}".format(asr_train_args))
|
|
|
asr_model.to(dtype=getattr(torch, dtype)).eval()
|
|
|
|
|
|
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
|
|
|
+ if asr_model.ctc is not None:
|
|
|
+ ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
|
|
|
+ scorers.update(
|
|
|
+ ctc=ctc
|
|
|
+ )
|
|
|
token_list = asr_model.token_list
|
|
|
scorers.update(
|
|
|
- ctc=ctc,
|
|
|
length_bonus=LengthBonus(len(token_list)),
|
|
|
)
|
|
|
|
|
|
@@ -171,7 +175,7 @@ class Speech2Text:
|
|
|
self.converter = converter
|
|
|
self.tokenizer = tokenizer
|
|
|
is_use_lm = lm_weight != 0.0 and lm_file is not None
|
|
|
- if ctc_weight == 0.0 and not is_use_lm:
|
|
|
+ if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
|
|
|
beam_search = None
|
|
|
self.beam_search = beam_search
|
|
|
logging.info(f"Beam_search: {self.beam_search}")
|
|
|
@@ -562,6 +566,7 @@ def inference_modelscope(
|
|
|
length_total = 0.0
|
|
|
finish_count = 0
|
|
|
file_count = 1
|
|
|
+ lfr_factor = 6
|
|
|
# 7 .Start for-loop
|
|
|
asr_result_list = []
|
|
|
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
|
|
|
@@ -597,7 +602,7 @@ def inference_modelscope(
|
|
|
results = speech2text(**batch)
|
|
|
if len(results) < 1:
|
|
|
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
|
|
|
- results = [[" ", ["<space>"], [2], 10, 6]] * nbest
|
|
|
+ results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest
|
|
|
time_end = time.time()
|
|
|
forward_time = time_end - time_beg
|
|
|
lfr_factor = results[0][-1]
|
|
|
@@ -615,7 +620,8 @@ def inference_modelscope(
|
|
|
|
|
|
key = keys[0]
|
|
|
result = result_segments[0]
|
|
|
- text, token, token_int, time_stamp = result
|
|
|
+ text, token, token_int = result[0], result[1], result[2]
|
|
|
+ time_stamp = None if len(result) < 4 else result[3]
|
|
|
|
|
|
# Create a directory: outdir/{n}best_recog
|
|
|
if writer is not None:
|
|
|
@@ -630,15 +636,23 @@ def inference_modelscope(
|
|
|
text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
|
|
|
postprocessed_result[1], \
|
|
|
postprocessed_result[2]
|
|
|
- text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
|
|
|
- text_postprocessed_punc_time_stamp = "predictions: {} time_stamp: {}".format(
|
|
|
- text_postprocessed_punc, time_stamp_postprocessed)
|
|
|
+ if len(word_lists) > 0:
|
|
|
+ text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
|
|
|
+ text_postprocessed_punc_time_stamp = json.dumps({"predictions": text_postprocessed_punc,
|
|
|
+ "time_stamp": time_stamp_postprocessed},
|
|
|
+ ensure_ascii=False)
|
|
|
+ else:
|
|
|
+ text_postprocessed_punc = ""
|
|
|
+ punc_id_list = []
|
|
|
+ text_postprocessed_punc_time_stamp = ""
|
|
|
+
|
|
|
else:
|
|
|
- text_postprocessed = postprocessed_result
|
|
|
- time_stamp_postprocessed = None
|
|
|
- word_lists = None
|
|
|
- text_postprocessed_punc_time_stamp = None
|
|
|
- punc_id_list = None
|
|
|
+ text_postprocessed = ""
|
|
|
+ time_stamp_postprocessed = ""
|
|
|
+ word_lists = ""
|
|
|
+ text_postprocessed_punc_time_stamp = ""
|
|
|
+ punc_id_list = ""
|
|
|
+ text_postprocessed_punc = ""
|
|
|
|
|
|
item = {'key': key, 'value': text_postprocessed_punc_time_stamp, 'text': text_postprocessed,
|
|
|
'time_stamp': time_stamp_postprocessed, 'punc': punc_id_list, 'token': token}
|
|
|
@@ -660,7 +674,7 @@ def inference_modelscope(
|
|
|
time_stamp_postprocessed))
|
|
|
|
|
|
logging.info("decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".
|
|
|
- format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor)))
|
|
|
+ format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor + 1e-6)))
|
|
|
return asr_result_list
|
|
|
return _forward
|
|
|
|