
update dev_lyh (#982)

* Update SDK_advanced_guide_offline_zh.md

* Update SDK_advanced_guide_online_zh.md

* update paraformer online recipe

* update paraformer online README

* update paraformer online README

* Update SDK_advanced_guide_offline_zh.md

* update paraformer online python websocket code

* Update SDK_advanced_guide_offline_zh.md

* Update SDK_tutorial_online_zh.md

* Update SDK_tutorial_zh.md

* Update SDK_tutorial_online_zh.md

---------

Co-authored-by: Yabin Li <wucong.lyb@alibaba-inc.com>
Co-authored-by: haoneng.lhn <haoneng.lhn@alibaba-inc.com>
yhliang committed 2 years ago
commit 9366bd9bcf

+ 7 - 4
egs_modelscope/asr/TEMPLATE/README.md

@@ -27,15 +27,18 @@ print(rec_result)
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.6',
+    model_revision='v1.0.7',
     update_model=False,
     mode='paraformer_streaming'
     )
 import soundfile
 speech, sample_rate = soundfile.read("example/asr_example.wav")
 
-chunk_size = [5, 10, 5] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
-param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4  # number of chunks to look back at for encoder self-attention
+decoder_chunk_look_back = 1  # number of encoder chunks to look back at for decoder cross-attention
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size,
+              "encoder_chunk_look_back": encoder_chunk_look_back, "decoder_chunk_look_back": decoder_chunk_look_back}
 chunk_stride = chunk_size[1] * 960  # 600ms or 480ms
 # first chunk, 600ms
 speech_chunk = speech[0:chunk_stride]
@@ -55,7 +58,7 @@ from modelscope.utils.constant import Tasks
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.6',
+    model_revision='v1.0.7',
     update_model=False,
     mode="paraformer_fake_streaming"
 )
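
Read together, the changed lines above form a complete streaming decode loop. Below is a minimal end-to-end sketch assembled from the snippets in this diff; it assumes modelscope and soundfile are installed, that example/asr_example.wav is available, and that the `audio_in`/`param_dict` call style follows the recipe shown above:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import soundfile

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
    model_revision='v1.0.7',
    update_model=False,
    mode='paraformer_streaming')

speech, sample_rate = soundfile.read("example/asr_example.wav")

chunk_size = [0, 10, 5]  # [0, 10, 5] -> 600ms chunks, [0, 8, 4] -> 480ms
param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size,
              "encoder_chunk_look_back": 4,   # chunks of encoder self-attention look-back
              "decoder_chunk_look_back": 1}   # encoder chunks of decoder cross-attention look-back

chunk_stride = chunk_size[1] * 960  # 960 samples = 60ms at 16kHz
for offset in range(0, len(speech), chunk_stride):
    speech_chunk = speech[offset:offset + chunk_stride]
    param_dict["is_final"] = offset + chunk_stride >= len(speech)
    rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
    print(rec_result)
```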

+ 7 - 4
egs_modelscope/asr/TEMPLATE/README_zh.md

@@ -27,15 +27,18 @@ print(rec_result)
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.6',
+    model_revision='v1.0.7',
     update_model=False,
     mode='paraformer_streaming'
     )
 import soundfile
 speech, sample_rate = soundfile.read("example/asr_example.wav")
 
-chunk_size = [5, 10, 5] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
-param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4  # number of chunks to look back at for encoder self-attention
+decoder_chunk_look_back = 1  # number of encoder chunks to look back at for decoder cross-attention
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size,
+              "encoder_chunk_look_back": encoder_chunk_look_back, "decoder_chunk_look_back": decoder_chunk_look_back}
 chunk_stride = chunk_size[1] * 960  # 600ms or 480ms
 # first chunk, 600ms
 speech_chunk = speech[0:chunk_stride]
@@ -55,7 +58,7 @@ from modelscope.utils.constant import Tasks
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.6',
+    model_revision='v1.0.7',
     update_model=False,
     mode="paraformer_fake_streaming"
 )

+ 1 - 1
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py

@@ -4,7 +4,7 @@ from modelscope.utils.constant import Tasks
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.6',
+    model_revision='v1.0.7',
     update_model=False,
     mode="paraformer_fake_streaming"
 )

+ 1 - 1
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py

@@ -14,7 +14,7 @@ os.environ["MODELSCOPE_CACHE"] = "./"
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.6',
+    model_revision='v1.0.7',
     update_model=False,
     mode="paraformer_streaming"
 )

+ 5 - 2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online_v2.py

@@ -24,9 +24,12 @@ speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_exampl
 speech_length = speech.shape[0]
 
 sample_offset = 0
-chunk_size = [0, 8, 4] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4  # number of chunks to look back at for encoder self-attention
+decoder_chunk_look_back = 1  # number of encoder chunks to look back at for decoder cross-attention
 stride_size =  chunk_size[1] * 960
-param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size, "encoder_chunk_look_back": 4, "decoder_chunk_look_back": 1}
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size, 
+              "encoder_chunk_look_back": encoder_chunk_look_back, "decoder_chunk_look_back": decoder_chunk_look_back}
 final_result = ""
 
 for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
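
The stride arithmetic in this demo is easy to gloss over: one unit of `chunk_size[1]` is 60ms of audio, and 60ms at the model's 16kHz sample rate is 960 samples. A quick sanity check of the two configurations mentioned in the comment:

```python
SAMPLE_RATE = 16000  # the streaming model expects 16kHz input
FRAME_MS = 60        # one chunk_size unit corresponds to 60ms of audio

for chunk_size in ([0, 10, 5], [0, 8, 4]):
    stride_samples = chunk_size[1] * 960
    stride_ms = stride_samples * 1000 // SAMPLE_RATE
    assert stride_samples == chunk_size[1] * FRAME_MS * SAMPLE_RATE // 1000
    print(chunk_size, "->", stride_ms, "ms per decoding step")  # 600ms and 480ms
```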

+ 1 - 1
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/finetune.py

@@ -14,7 +14,7 @@ def modelscope_finetune(params):
     ds_dict = MsDataset.load(params.data_path)
     kwargs = dict(
         model=params.model,
-        model_revision='v1.0.6',
+        model_revision='v1.0.7',
         update_model=False,
         data_dir=ds_dict,
         dataset_type=params.dataset_type,

+ 1 - 1
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py

@@ -11,7 +11,7 @@ def modelscope_infer(args):
         model=args.model,
         output_dir=args.output_dir,
         batch_size=args.batch_size,
-        model_revision='v1.0.6',
+        model_revision='v1.0.7',
         update_model=False,
         mode="paraformer_fake_streaming",
         param_dict={"decoding_model": args.decoding_mode, "hotword": args.hotword_txt}

+ 13 - 7
funasr/runtime/docs/SDK_advanced_guide_offline_zh.md

@@ -22,9 +22,12 @@ FunASR provides a Chinese offline file transcription service deployable with one click on a local or cloud server…
 Pull and start the docker image of the FunASR runtime-SDK with the following commands:
 
 ```shell
-sudo docker pull registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-cpu-0.2.2
+sudo docker pull \
+  registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-cpu-0.2.2
 mkdir -p ./funasr-runtime-resources/models
-sudo docker run -p 10095:10095 -it --privileged=true -v ./funasr-runtime-resources/models:/workspace/models registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-cpu-0.2.2
+sudo docker run -p 10095:10095 -it --privileged=true \
+  -v ./funasr-runtime-resources/models:/workspace/models \
+  registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-cpu-0.2.2
 ```
 If you have not installed docker, see [Docker Installation](#Docker安装)
 
@@ -100,18 +103,20 @@ sudo systemctl start docker
 To run the client directly for a quick test, see the brief instructions below, taking the python version as an example:
 
 ```shell
-python3 wss_client_asr.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.wav" --output_dir "./results"
+python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode offline \
+        --audio_in "../audio/asr_example.wav" --output_dir "./results"
 ```
 
 Description of command parameters:
 ```text
---host the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1). If the client and the service are not on the same server, change it to the deployment machine's IP
+--host the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1).
+       If the client and the service are not on the same server, change it to the deployment machine's IP
 --port 10095 the deployment port number
 --mode offline means offline file transcription
 --audio_in the audio file(s) to transcribe; supports a file path or a wav.scp file list
 --thread_num the number of concurrent sending threads, default 1
 --ssl whether to enable ssl certificate verification, default 1 (enabled); set to 0 to disable
---hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (could be: 阿里巴巴 达摩院)
+--hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (阿里巴巴 达摩院)
 --use_itn whether to apply itn, default 1 (enabled); set to 0 to disable
 ```
 
@@ -124,10 +129,11 @@ python3 wss_client_asr.py --host "127.0.0.1" --port 10095 --mode offline --audio
 Description of command parameters:
 
 ```text
---server-ip the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1). If the client and the service are not on the same server, change it to the deployment machine's IP
+--server-ip the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1).
+            If the client and the service are not on the same server, change it to the deployment machine's IP
 --port 10095 the deployment port number
 --wav-path the audio file to transcribe; supports a file path
---hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (could be: 阿里巴巴 达摩院)
+--hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (阿里巴巴 达摩院)
 --use-itn whether to apply itn, default 1 (enabled); set to 0 to disable
 ```
 

+ 5 - 2
funasr/runtime/docs/SDK_advanced_guide_online_zh.md

@@ -11,9 +11,12 @@ FunASR provides a real-time speech-to-text service that can be conveniently deployed on a local or cloud server…
 Pull and start the docker image of the FunASR software package with the following commands:
 
 ```shell
-sudo docker pull registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.2
+sudo docker pull \
+  registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.2
 mkdir -p ./funasr-runtime-resources/models
-sudo docker run -p 10095:10095 -it --privileged=true -v ./funasr-runtime-resources/models:/workspace/models registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.2
+sudo docker run -p 10095:10095 -it --privileged=true \
+  -v ./funasr-runtime-resources/models:/workspace/models \
+  registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.2
 ```
 If you have not installed docker, see [Docker Installation](https://alibaba-damo-academy.github.io/FunASR/en/installation/docker_zh.html)
 

+ 12 - 7
funasr/runtime/docs/SDK_tutorial_online_zh.md

@@ -67,34 +67,39 @@ python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass
 
 Description of command parameters:
 ```text
---host the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1). If the client and the service are not on the same server, change it to the deployment machine's IP
+--host the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1).
+       If the client and the service are not on the same server, change it to the deployment machine's IP
 --port 10095 the deployment port number
---mode: `offline` means single-utterance recognition; `online` means real-time speech recognition; `2pass` means real-time speech recognition with the end of each utterance corrected by the offline model.
+--mode: `offline` means single-utterance recognition; `online` means real-time speech recognition; `2pass` means real-time
+       speech recognition with the end of each utterance corrected by the offline model.
 --chunk_size: the streaming model latency configuration `[5,10,5]`: the current decoding chunk is 600ms, with a 300ms lookback and a 300ms lookahead.
 --audio_in the audio file(s) to transcribe; supports a file path or a wav.scp file list
 --thread_num the number of concurrent sending threads, default 1
 --ssl whether to enable ssl certificate verification, default 1 (enabled); set to 0 to disable
---hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (could be: 阿里巴巴 达摩院)
+--hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (阿里巴巴 达摩院)
 --use_itn whether to apply itn, default 1 (enabled); set to 0 to disable
 ```
 
 ### cpp-client
 After entering the samples/cpp directory, you can test with the cpp client as follows:
 ```shell
-./funasr-wss-client-2pass --server-ip 127.0.0.1 --port 10095 --mode 2pass --wav-path ../audio/asr_example.wav
+./funasr-wss-client-2pass --server-ip 127.0.0.1 --port 10095 --mode 2pass \
+   --wav-path ../audio/asr_example.wav
 ```
 
 Description of command parameters:
 
 ```text
---server-ip the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1). If the client and the service are not on the same server, change it to the deployment machine's IP
+--server-ip the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1).
+            If the client and the service are not on the same server, change it to the deployment machine's IP
 --port 10095 the deployment port number
---mode: `offline` means single-utterance recognition; `online` means real-time speech recognition; `2pass` means real-time speech recognition with the end of each utterance corrected by the offline model.
+--mode: `offline` means single-utterance recognition; `online` means real-time speech recognition; `2pass` means real-time
+        speech recognition with the end of each utterance corrected by the offline model.
 --chunk-size: the streaming model latency configuration `[5,10,5]`: the current decoding chunk is 600ms, with a 300ms lookback and a 300ms lookahead.
 --wav-path the audio file to transcribe; supports a file path
 --thread-num the number of concurrent sending threads, default 1
 --is-ssl whether to enable ssl certificate verification, default 1 (enabled); set to 0 to disable
---hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (could be: 阿里巴巴 达摩院)
+--hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (阿里巴巴 达摩院)
 --use-itn whether to apply itn, default 1 (enabled); set to 0 to disable
 ```
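
The chunk-size flag above is read as [lookback, current, lookahead] in units of 60ms frames, which is where the 300ms/600ms/300ms figures come from. A small illustrative helper (not part of the SDK) makes the mapping explicit:

```python
def chunk_latency_ms(chunk_size, frame_ms=60):
    """Map a [lookback, current, lookahead] chunk spec to milliseconds."""
    back, current, ahead = chunk_size
    return back * frame_ms, current * frame_ms, ahead * frame_ms

print(chunk_latency_ms([5, 10, 5]))  # (300, 600, 300): 300ms lookback, 600ms chunk, 300ms lookahead
```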
 

+ 6 - 4
funasr/runtime/docs/SDK_tutorial_zh.md

@@ -69,13 +69,14 @@ python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode offline --au
 
 Description of command parameters:
 ```text
---host the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1). If the client and the service are not on the same server, change it to the deployment machine's IP
+--host the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1).
+        If the client and the service are not on the same server, change it to the deployment machine's IP
 --port 10095 the deployment port number
 --mode offline means offline file transcription
 --audio_in the audio file(s) to transcribe; supports a file path or a wav.scp file list
 --thread_num the number of concurrent sending threads, default 1
 --ssl whether to enable ssl certificate verification, default 1 (enabled); set to 0 to disable
---hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (could be: 阿里巴巴 达摩院)
+--hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (阿里巴巴 达摩院)
 --use_itn whether to apply itn, default 1 (enabled); set to 0 to disable
 ```
 
@@ -88,12 +89,13 @@ python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode offline --au
 Description of command parameters:
 
 ```text
---server-ip the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1). If the client and the service are not on the same server, change it to the deployment machine's IP
+--server-ip the IP of the machine where the FunASR runtime-SDK service is deployed; defaults to the local IP (127.0.0.1).
+            If the client and the service are not on the same server, change it to the deployment machine's IP
 --port 10095 the deployment port number
 --wav-path the audio file to transcribe; supports a file path
 --thread_num the number of concurrent sending threads, default 1
 --ssl whether to enable ssl certificate verification, default 1 (enabled); set to 0 to disable
---hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (could be: 阿里巴巴 达摩院)
+--hotword if the model is a hotword model, hotwords can be set: a *.txt file (one hotword per line) or a space-separated hotword string (阿里巴巴 达摩院)
 --use-itn whether to apply itn, default 1 (enabled); set to 0 to disable
 ```
 

+ 3 - 2
funasr/runtime/python/websocket/funasr_client_api.py

@@ -51,7 +51,8 @@ class Funasr_websocket_recognizer():
         stride = int(60 *  chunk_size[1]/  chunk_interval / 1000 * 16000 * 2)
         chunk_num = (len(audio_bytes) - 1) // stride + 1
        
-        message = json.dumps({"mode":  mode, "chunk_size":  chunk_size, "chunk_interval":  chunk_interval,
+        message = json.dumps({"mode": mode, "chunk_size": chunk_size, "encoder_chunk_look_back": 4,
+                              "decoder_chunk_look_back": 1, "chunk_interval": chunk_interval,
                               "wav_name": wav_name, "is_speaking": True})
  
         self.websocket.send(message)
@@ -131,4 +132,4 @@ if __name__ == '__main__':
     print("text",text)
  
     
-            
+            

+ 10 - 1
funasr/runtime/python/websocket/funasr_wss_client.py

@@ -29,6 +29,14 @@ parser.add_argument("--chunk_size",
                     type=str,
                     default="5, 10, 5",
                     help="chunk")
+parser.add_argument("--encoder_chunk_look_back",
+                    type=int,
+                    default=4,
+                    help="number of chunks to lookback for encoder self-attention")
+parser.add_argument("--decoder_chunk_look_back",
+                    type=int,
+                    default=1,
+                    help="number of encoder chunks to lookback for decoder cross-attention")
 parser.add_argument("--chunk_interval",
                     type=int,
                     default=10,
@@ -99,7 +107,8 @@ async def record_microphone():
                     input=True,
                     frames_per_buffer=CHUNK)
 
-    message = json.dumps({"mode": args.mode, "chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval,
+    message = json.dumps({"mode": args.mode, "chunk_size": args.chunk_size, "encoder_chunk_look_back": args.encoder_chunk_look_back,
+                          "decoder_chunk_look_back": args.decoder_chunk_look_back, "chunk_interval": args.chunk_interval, 
                           "wav_name": "microphone", "is_speaking": True})
     #voices.put(message)
     await websocket.send(message)

+ 6 - 2
funasr/runtime/python/websocket/funasr_wss_server.py

@@ -103,8 +103,8 @@ inference_pipeline_asr_online = pipeline(
     model=args.asr_model_online,
     ngpu=args.ngpu,
     ncpu=args.ncpu,
-    model_revision='v1.0.4',
-    update_model='v1.0.4',
+    model_revision='v1.0.7',
+    update_model='v1.0.7',
     mode='paraformer_streaming')
 
 print("model loaded! only support one client at the same time now!!!!")
@@ -159,6 +159,10 @@ async def ws_serve(websocket, path):
                     websocket.wav_name = messagejson.get("wav_name")
                 if "chunk_size" in messagejson:
                     websocket.param_dict_asr_online["chunk_size"] = messagejson["chunk_size"]
+                if "encoder_chunk_look_back" in messagejson:
+                    websocket.param_dict_asr_online["encoder_chunk_look_back"] = messagejson["encoder_chunk_look_back"]
+                if "decoder_chunk_look_back" in messagejson:
+                    websocket.param_dict_asr_online["decoder_chunk_look_back"] = messagejson["decoder_chunk_look_back"]
                 if "mode" in messagejson:
                     websocket.mode = messagejson["mode"]
             if len(frames_asr_online) > 0 or len(frames_asr) > 0 or not isinstance(message, str):
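
With this change, the client and server agree on two extra handshake fields. Putting both sides of the diff together, the initial websocket message looks like the sketch below; the field names come from the diffs above, while the surrounding send call is only indicative:

```python
import json

# Handshake sent by funasr_wss_client.py before any audio is streamed.
# funasr_wss_server.py copies the two look-back fields into
# param_dict_asr_online for the streaming pipeline.
handshake = json.dumps({
    "mode": "online",              # or "offline" / "2pass"
    "chunk_size": [5, 10, 5],      # [lookback, current, lookahead], 60ms units
    "encoder_chunk_look_back": 4,  # chunks for encoder self-attention
    "decoder_chunk_look_back": 1,  # encoder chunks for decoder cross-attention
    "chunk_interval": 10,
    "wav_name": "microphone",
    "is_speaking": True,
})
# await websocket.send(handshake)  # inside the client's websocket session
```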