Просмотр исходного кода

Merge branch 'main' of github.com:alibaba-damo-academy/FunASR
add

游雁 2 лет назад
Родитель
Commit
5b7c0c17b4
18 измененных файлов с 234 добавлено и 12 удалено
  1. 15 0
      egs/aishell/branchformer/README.md
  2. 15 0
      egs/aishell/e_branchformer/README.md
  3. 1 1
      egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
  4. 2 2
      egs_modelscope/punctuation/TEMPLATE/infer.py
  5. 2 2
      egs_modelscope/punctuation/TEMPLATE/infer.sh
  6. 1 0
      egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md
  7. 3 0
      egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt
  8. 22 0
      egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py
  9. 25 0
      egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py
  10. 68 0
      egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh
  11. 1 0
      egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils
  12. 3 0
      egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt
  13. 1 0
      egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils
  14. 1 2
      funasr/datasets/preprocessor.py
  15. 2 1
      funasr/runtime/python/onnxruntime/demo_punc_offline.py
  16. 11 2
      funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py
  17. 60 1
      funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
  18. 1 1
      funasr/runtime/websocket/readme.md

+ 15 - 0
egs/aishell/branchformer/README.md

@@ -0,0 +1,15 @@
+# Branchformer Result
+
+## Training Config
+- Feature info: using raw speech, extracting 80 dims fbank online, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 0.001, batch_size 10000, 4 gpu(Tesla V100), acc_grad 1, 180 epochs
+- Train config: conf/train_asr_branchformer.yaml
+- LM config: LM was not used
+
+## Results (CER)
+- Decode config: conf/decode_asr_transformer.yaml (ctc weight:0.4)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.15   |
+|    test     |  4.51   |

+ 15 - 0
egs/aishell/e_branchformer/README.md

@@ -0,0 +1,15 @@
+# E-Branchformer Result
+
+## Training Config
+- Feature info: using raw speech, extracting 80 dims fbank online, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 0.001, batch_size 10000, 4 gpu(Tesla V100), acc_grad 1, 180 epochs
+- Train config: conf/train_asr_e_branchformer.yaml
+- LM config: LM was not used
+
+## Results (CER)
+- Decode config: conf/decode_asr_transformer.yaml (ctc weight:0.4)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.10   |
+|    test     |  4.52   |

+ 1 - 1
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py

@@ -8,7 +8,7 @@ if __name__ == '__main__':
         task=Tasks.auto_speech_recognition,
         model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
         vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
-        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
+        punc_model='damo/punc_ct-transformer_cn-en-common-vocab471067-large',
         output_dir=output_dir,
     )
     rec_result = inference_pipeline(audio_in=audio_in, batch_size_token=5000, batch_size_token_threshold_s=40)

+ 2 - 2
egs_modelscope/punctuation/TEMPLATE/infer.py

@@ -15,9 +15,9 @@ def modelscope_infer(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
+    parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_cn-en-common-vocab471067-large")
     parser.add_argument('--text_in', type=str, default="./data/test/punc.txt")
     parser.add_argument('--output_dir', type=str, default="./results/")
     parser.add_argument('--gpuid', type=str, default="0")
     args = parser.parse_args()
-    modelscope_infer(args)
+    modelscope_infer(args)

+ 2 - 2
egs_modelscope/punctuation/TEMPLATE/infer.sh

@@ -7,7 +7,7 @@ set -o pipefail
 stage=1
 stop_stage=2
 model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-data_dir="./data/test"
+data_dir="./data"
 output_dir="./results"
 gpu_inference=true    # whether to perform gpu decoding
 gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
@@ -32,7 +32,7 @@ split_scps=""
 for JOB in $(seq ${nj}); do
     split_scps="$split_scps $output_dir/split/text.$JOB.scp"
 done
-perl utils/split_scp.pl ${data_dir}/punc.txt ${split_scps}
+perl utils/split_scp.pl ${data_dir}/punc_example.txt ${split_scps}
 
 if [ -n "${checkpoint_dir}" ]; then
   python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}

+ 1 - 0
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md

@@ -0,0 +1 @@
+../TEMPLATE/README.md

+ 3 - 0
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt

@@ -0,0 +1,3 @@
+1	跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益
+2	从存储上来说仅仅是全景图片它就会是图片的四倍的容量然后全景的视频会是普通视频八倍的这个存储的容要求而三d的模型会是图片的十倍这都对我们今天运行在的云计算的平台存储的平台提出了更高的要求
+3	那今天的会就到这里吧 happy new year 明年见

+ 22 - 0
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py

@@ -0,0 +1,22 @@
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.punctuation,
+    model='damo/punc_ct-transformer_cn-en-common-vocab471067-large',
+    model_revision="v1.0.0",
+    output_dir="./tmp/"
+)
+
+##################text.scp###################
+# inputs = "./egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt"
+
+##################text#####################
+#inputs = "我们都是木头人不会讲话不会动"
+
+##################text file url#######################
+inputs = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"
+
+rec_result = inference_pipeline(text_in=inputs)
+print(rec_result)

+ 25 - 0
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py

@@ -0,0 +1,25 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+    inference_pipeline = pipeline(
+        task=Tasks.punctuation,
+        model=args.model,
+        model_revision=args.model_revision,        
+        output_dir=args.output_dir,
+    )
+    inference_pipeline(text_in=args.text_in)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_cn-en-common-vocab471067-large")
+    parser.add_argument('--text_in', type=str, default="./data/test/punc.txt")
+    parser.add_argument('--model_revision', type=str, default=None)
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)

+ 68 - 0
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh

@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/punc_ct-transformer_cn-en-common-vocab471067-large"
+model_revision="v1.0.0"
+data_dir="./data"
+output_dir="./results"
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
+njob=64    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+checkpoint_dir=
+checkpoint_name="punc.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/text.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/punc_example.txt ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --text_in ${output_dir}/split/text.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --model_revision ${model_revision} \
+            --gpuid ${gpuid}
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/final_res
+    if [ -f "${output_dir}/output.1/infer.out" ]; then
+      for i in $(seq "${nj}"); do
+          cat "${output_dir}/output.${i}/infer.out"
+      done | sort -k1 >"${output_dir}/final_res/infer.out"
+    fi
+fi
+

+ 1 - 0
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils

@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils

+ 3 - 0
egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt

@@ -0,0 +1,3 @@
+1	跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益
+2	从存储上来说仅仅是全景图片它就会是图片的四倍的容量然后全景的视频会是普通视频八倍的这个存储的容要求而三d的模型会是图片的十倍这都对我们今天运行在的云计算的平台存储的平台提出了更高的要求
+3	那今天的会就到这里吧 happy new year 明年见

+ 1 - 0
egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils

@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils

+ 1 - 2
funasr/datasets/preprocessor.py

@@ -11,7 +11,7 @@ from typing import Union
 import numpy as np
 import scipy.signal
 import soundfile
-
+import jieba
 
 from funasr.text.build_tokenizer import build_tokenizer
 from funasr.text.cleaner import TextCleaner
@@ -659,7 +659,6 @@ class CodeMixTokenizerCommonPreprocessor(CommonPreprocessor):
         self.split_text_name = split_text_name
         self.seg_jieba = seg_jieba
         if self.seg_jieba:
-            import jieba
             jieba.load_userdict(seg_dict_file)
 
     @classmethod

+ 2 - 1
funasr/runtime/python/onnxruntime/demo_punc_offline.py

@@ -1,6 +1,7 @@
 from funasr_onnx import CT_Transformer
 
-model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+#model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+model_dir = "damo/punc_ct-transformer_cn-en-common-vocab471067-large"
 model = CT_Transformer(model_dir)
 
 text_in="跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益"

+ 11 - 2
funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py

@@ -10,7 +10,7 @@ import numpy as np
 from .utils.utils import (ONNXRuntimeError,
                           OrtInferSession, get_logger,
                           read_yaml)
-from .utils.utils import (TokenIDConverter, split_to_mini_sentence,code_mix_split_words)
+from .utils.utils import (TokenIDConverter, split_to_mini_sentence,code_mix_split_words,code_mix_split_words_jieba)
 logging = get_logger()
 
 
@@ -65,9 +65,18 @@ class CT_Transformer():
                 self.punc_list[i] = "?"
             elif self.punc_list[i] == "。":
                 self.period = i
+        if "seg_jieba" in config:
+            self.seg_jieba = True
+            self.jieba_usr_dict_path = os.path.join(model_dir, 'jieba_usr_dict')
+            self.code_mix_split_words_jieba = code_mix_split_words_jieba(self.jieba_usr_dict_path)
+        else:
+            self.seg_jieba = False
 
     def __call__(self, text: Union[list, str], split_size=20):
-        split_text = code_mix_split_words(text)
+        if self.seg_jieba:
+            split_text = self.code_mix_split_words_jieba(text)
+        else:
+            split_text = code_mix_split_words(text)
         split_text_id = self.converter.tokens2ids(split_text)
         mini_sentences = split_to_mini_sentence(split_text, split_size)
         mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)

+ 60 - 1
funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py

@@ -6,11 +6,12 @@ import pickle
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
 
+import re
 import numpy as np
 import yaml
 from onnxruntime import (GraphOptimizationLevel, InferenceSession,
                          SessionOptions, get_available_providers, get_device)
-
+import jieba
 import warnings
 
 root_dir = Path(__file__).resolve().parent
@@ -230,6 +231,64 @@ def code_mix_split_words(text: str):
             words.append(current_word)
     return words
 
+def isEnglish(text:str):
+    if re.search('^[a-zA-Z\']+$', text):
+        return True
+    else:
+        return False
+
+def join_chinese_and_english(input_list):
+    line = ''
+    for token in input_list:
+        if isEnglish(token):
+            line = line + ' ' + token
+        else:
+            line = line + token
+
+    line = line.strip()
+    return line
+
+def code_mix_split_words_jieba(seg_dict_file: str):
+    jieba.load_userdict(seg_dict_file)
+
+    def _fn(text: str):
+        input_list = text.split()
+        token_list_all = []
+        language_list = []
+        token_list_tmp = []
+        language_flag = None
+        for token in input_list:
+            if isEnglish(token) and language_flag == 'Chinese':
+                token_list_all.append(token_list_tmp)
+                language_list.append('Chinese')
+                token_list_tmp = []
+            elif not isEnglish(token) and language_flag == 'English':
+                token_list_all.append(token_list_tmp)
+                language_list.append('English')
+                token_list_tmp = []
+    
+            token_list_tmp.append(token)
+    
+            if isEnglish(token):
+                language_flag = 'English'
+            else:
+                language_flag = 'Chinese'
+    
+        if token_list_tmp:
+            token_list_all.append(token_list_tmp)
+            language_list.append(language_flag)
+    
+        result_list = []
+        for token_list_tmp, language_flag in zip(token_list_all, language_list):
+            if language_flag == 'English':
+                result_list.extend(token_list_tmp)
+            else:
+                seg_list = jieba.cut(join_chinese_and_english(token_list_tmp), HMM=False)
+                result_list.extend(seg_list)
+    
+        return result_list
+    return _fn
+
 def read_yaml(yaml_path: Union[str, Path]) -> Dict:
     if not Path(yaml_path).exists():
         raise FileExistsError(f'The {yaml_path} does not exist.')

+ 1 - 1
funasr/runtime/websocket/readme.md

@@ -38,7 +38,7 @@ Download the client test tool directory samples:
 wget https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/sample/funasr_samples.tar.gz
 ```
 
-We take the Python language client as an example to explain. It supports various audio formats (.wav, .pcm, .mp3, etc.), video input (.mp4, etc.), and multi-file list wav.scp input. For other versions of clients, please refer to the ([docs](##client-usage)).
+We take the Python language client as an example to explain. It supports various audio formats (.wav, .pcm, .mp3, etc.), video input (.mp4, etc.), and multi-file list wav.scp input. For other versions of clients, please refer to the ([docs](#client-usage)).
 
 ```shell
 python3 wss_client_asr.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.wav"