2 лет назад · 3dcfb685a2
--- a/docs/modescope_pipeline/asr_pipeline.md
+++ b/docs/modescope_pipeline/asr_pipeline.md
@@ -82,7 +82,7 @@ Undo
 
															 - `output_dir`: None (Defalut), the output path of results if set
														
 
															 ### Inference with multi-thread CPUs or multi GPUs
														
 
															-FunASR also offer recipes [run.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
														
 
															+FunASR also offers a recipe, [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh), to decode with multi-thread CPUs or multiple GPUs.
														
 
															 - Setting parameters in `infer.sh`
														
 
															     - <strong>model:</strong> # model name on ModelScope
														
@@ -123,7 +123,7 @@ If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `
 
															 ## Finetune with pipeline
														
 
															 ### Quick start
														
 
															-[finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/finetune.py)
														
 
															+[finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
														
 
															 ```python
														
 
															 import os
														
 
															 from modelscope.metainfo import Trainers
														
@@ -166,7 +166,7 @@ python finetune.py &> log.txt &
 
															 ### Finetune with your data
														
 
															-- Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/finetune.py)
														
 
															+- Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
														
 
															     - <strong>output_dir:</strong> # result dir
														
 
															     - <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
														
 
															     - <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
														
@@ -183,7 +183,7 @@ If you want finetune with multi-GPUs, you could:
 
															 CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
														
 
															 ```
														
 
															 ## Inference with your finetuned model
														
 
															-- Modify inference related parameters in `infer_after_finetune.py`
														
 
															+- Modify inference related parameters in [infer_after_finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py)
														
 
															     - <strong>modelscope_model_name: </strong> # model name on ModelScope
														
 
															     - <strong>output_dir:</strong> # result dir
														
 
															     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
														
--- a/docs/modescope_pipeline/vad_pipeline.md
+++ b/docs/modescope_pipeline/vad_pipeline.md
@@ -66,7 +66,7 @@ Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/
 
															 - `output_dir`: None (Defalut), the output path of results if set
														
 
															 ### Inference with multi-thread CPUs or multi GPUs
														
 
															-FunASR also offer recipes [run.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
														
 
															+FunASR also offers a recipe, [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh), to decode with multi-thread CPUs or multiple GPUs.
														
 
															 - Setting parameters in `infer.sh`
														
 
															     - <strong>model:</strong> # model name on ModelScope
														
--- a/egs_modelscope/asr/TEMPLATE/finetune.py
+++ b/egs_modelscope/asr/TEMPLATE/finetune.py
@@ -0,0 +1,36 @@
 
															+import os
														
 
															+
														
 
															+from modelscope.metainfo import Trainers
														
 
															+from modelscope.trainers import build_trainer
														
 
															+
														
 
															+from funasr.datasets.ms_dataset import MsDataset
														
 
															+from funasr.utils.modelscope_param import modelscope_args
														
 
															+
														
 
															+
														
 
															+def modelscope_finetune(params):
														
 
															+    if not os.path.exists(params.output_dir):
														
 
															+        os.makedirs(params.output_dir, exist_ok=True)
														
 
															+    # dataset split ["train", "validation"]
														
 
															+    ds_dict = MsDataset.load(params.data_path)
														
 
															+    kwargs = dict(
														
 
															+        model=params.model,
														
 
															+        data_dir=ds_dict,
														
 
															+        dataset_type=params.dataset_type,
														
 
															+        work_dir=params.output_dir,
														
 
															+        batch_bins=params.batch_bins,
														
 
															+        max_epoch=params.max_epoch,
														
 
															+        lr=params.lr)
														
 
															+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
														
 
															+    trainer.train()
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", data_path="./data")
														
 
															+    params.output_dir = "./checkpoint"              # 模型保存路径
														
 
															+    params.data_path = "./example_data/"            # 数据路径
														
 
															+    params.dataset_type = "small"                   # 小数据量设置small，若数据量大于1000小时，请使用large
														
 
															+    params.batch_bins = 2000                       # batch size，如果dataset_type="small"，batch_bins单位为fbank特征帧数，如果dataset_type="large"，batch_bins单位为毫秒，
														
 
															+    params.max_epoch = 50                           # 最大训练轮数
														
 
															+    params.lr = 0.00005                             # 设置学习率
														
 
															+    
														
 
															+    modelscope_finetune(params)
														
--- a/egs_modelscope/asr/TEMPLATE/infer.py
+++ b/egs_modelscope/asr/TEMPLATE/infer.py
@@ -0,0 +1,25 @@
 
															+import os
														
 
															+import shutil
														
 
															+import argparse
														
 
															+from modelscope.pipelines import pipeline
														
 
															+from modelscope.utils.constant import Tasks
														
 
															+
														
 
															+def modelscope_infer(args):
														
 
															+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
														
 
															+    inference_pipeline = pipeline(
														
 
															+        task=Tasks.auto_speech_recognition,
														
 
															+        model=args.model,
														
 
															+        output_dir=args.output_dir,
														
 
															+        batch_size=args.batch_size,
														
 
															+    )
														
 
															+    inference_pipeline(audio_in=args.audio_in)
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    parser = argparse.ArgumentParser()
														
 
															+    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
														
 
															+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
														
 
															+    parser.add_argument('--output_dir', type=str, default="./results/")
														
 
															+    parser.add_argument('--batch_size', type=int, default=64)
														
 
															+    parser.add_argument('--gpuid', type=str, default="0")
														
 
															+    args = parser.parse_args()
														
 
															+    modelscope_infer(args)
														
--- a/egs_modelscope/asr/TEMPLATE/infer.sh
+++ b/egs_modelscope/asr/TEMPLATE/infer.sh
@@ -0,0 +1,96 @@
 
															+#!/usr/bin/env bash
														
 
															+
														
 
															+set -e
														
 
															+set -u
														
 
															+set -o pipefail
														
 
															+
														
 
															+stage=1
														
 
															+stop_stage=2
														
 
															+model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
														
 
															+data_dir="./data/test"
														
 
															+output_dir="./results"
														
 
															+batch_size=64
														
 
															+gpu_inference=true    # whether to perform gpu decoding
														
 
															+gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
														
 
															+njob=4    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
														
 
															+
														
 
															+. utils/parse_options.sh || exit 1;
														
 
															+
														
 
															+if ${gpu_inference} == "true"; then
														
 
															+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
														
 
															+else
														
 
															+    nj=$njob
														
 
															+    batch_size=1
														
 
															+    gpuid_list=""
														
 
															+    for JOB in $(seq ${nj}); do
														
 
															+        gpuid_list=$gpuid_list"-1,"
														
 
															+    done
														
 
															+fi
														
 
															+
														
 
															+mkdir -p $output_dir/split
														
 
															+split_scps=""
														
 
															+for JOB in $(seq ${nj}); do
														
 
															+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
														
 
															+done
														
 
															+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
														
 
															+
														
 
															+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
														
 
															+    echo "Decoding ..."
														
 
															+    gpuid_list_array=(${gpuid_list//,/ })
														
 
															+    for JOB in $(seq ${nj}); do
														
 
															+        {
														
 
															+        id=$((JOB-1))
														
 
															+        gpuid=${gpuid_list_array[$id]}
														
 
															+        mkdir -p ${output_dir}/output.$JOB
														
 
															+        python infer.py \
														
 
															+            --model ${model} \
														
 
															+            --audio_in ${output_dir}/split/wav.$JOB.scp \
														
 
															+            --output_dir ${output_dir}/output.$JOB \
														
 
															+            --batch_size ${batch_size} \
														
 
															+            --gpuid ${gpuid}
														
 
															+        }&
														
 
															+    done
														
 
															+    wait
														
 
															+
														
 
															+    mkdir -p ${output_dir}/1best_recog
														
 
															+    for f in token score text; do
														
 
															+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
														
 
															+          for i in $(seq "${nj}"); do
														
 
															+              cat "${output_dir}/output.${i}/1best_recog/${f}"
														
 
															+          done | sort -k1 >"${output_dir}/1best_recog/${f}"
														
 
															+        fi
														
 
															+    done
														
 
															+fi
														
 
															+
														
 
															+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
														
 
															+    echo "Computing WER ..."
														
 
															+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
														
 
															+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
														
 
															+    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
														
 
															+    tail -n 3 ${output_dir}/1best_recog/text.cer
														
 
															+fi
														
 
															+
														
 
															+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
														
 
															+    echo "SpeechIO TIOBE textnorm"
														
 
															+    echo "$0 --> Normalizing REF text ..."
														
 
															+    ./utils/textnorm_zh.py \
														
 
															+        --has_key --to_upper \
														
 
															+        ${data_dir}/text \
														
 
															+        ${output_dir}/1best_recog/ref.txt
														
 
															+
														
 
															+    echo "$0 --> Normalizing HYP text ..."
														
 
															+    ./utils/textnorm_zh.py \
														
 
															+        --has_key --to_upper \
														
 
															+        ${output_dir}/1best_recog/text.proc \
														
 
															+        ${output_dir}/1best_recog/rec.txt
														
 
															+    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
														
 
															+
														
 
															+    echo "$0 --> computing WER/CER and alignment ..."
														
 
															+    ./utils/error_rate_zh \
														
 
															+        --tokenizer char \
														
 
															+        --ref ${output_dir}/1best_recog/ref.txt \
														
 
															+        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
														
 
															+        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
														
 
															+    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
														
 
															+fi
														
 
															+
														
--- a/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
+++ b/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
@@ -0,0 +1,48 @@
 
															+import json
														
 
															+import os
														
 
															+import shutil
														
 
															+
														
 
															+from modelscope.pipelines import pipeline
														
 
															+from modelscope.utils.constant import Tasks
														
 
															+from modelscope.hub.snapshot_download import snapshot_download
														
 
															+
														
 
															+from funasr.utils.compute_wer import compute_wer
														
 
															+
														
 
															+def modelscope_infer_after_finetune(params):
														
 
															+    # prepare for decoding
														
 
															+
														
 
															+    try:
														
 
															+        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
														
 
															+    except BaseException:
														
 
															+        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
														
 
															+    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
														
 
															+    decoding_path = os.path.join(params["output_dir"], "decode_results")
														
 
															+    if os.path.exists(decoding_path):
														
 
															+        shutil.rmtree(decoding_path)
														
 
															+    os.mkdir(decoding_path)
														
 
															+
														
 
															+    # decoding
														
 
															+    inference_pipeline = pipeline(
														
 
															+        task=Tasks.auto_speech_recognition,
														
 
															+        model=pretrained_model_path,
														
 
															+        output_dir=decoding_path,
														
 
															+        batch_size=params["batch_size"]
														
 
															+    )
														
 
															+    audio_in = os.path.join(params["data_dir"], "wav.scp")
														
 
															+    inference_pipeline(audio_in=audio_in)
														
 
															+
														
 
															+    # compute CER if the ground-truth text is provided
														
 
															+    text_in = os.path.join(params["data_dir"], "text")
														
 
															+    if os.path.exists(text_in):
														
 
															+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
														
 
															+        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    params = {}
														
 
															+    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
														
 
															+    params["output_dir"] = "./checkpoint"
														
 
															+    params["data_dir"] = "./data/test"
														
 
															+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
														
 
															+    params["batch_size"] = 64
														
 
															+    modelscope_infer_after_finetune(params)
														
--- a/egs_modelscope/asr/TEMPLATE/utils
+++ b/egs_modelscope/asr/TEMPLATE/utils
@@ -0,0 +1 @@
 
															+../../../egs/aishell/transformer/utils
														
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
@@ -7,7 +7,6 @@ from modelscope.utils.constant import Tasks
 
															 from funasr.utils.compute_wer import compute_wer
														
 
															-import pdb;
														
 
															 def modelscope_infer_core(output_dir, split_dir, njob, idx):
														
 
															     output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
														
 
															     gpu_id = (int(idx) - 1) // njob