| 12345678910111213141516171819202122232425262728293031323334353637 |
- import os
- import json
- import torch
- import logging
- import concurrent.futures
- import librosa
- import torch.distributed as dist
- from typing import Collection
- import torch
- import torchaudio
- from torch import nn
- import random
- import re
- import string
- from funasr.tokenizer.cleaner import TextCleaner
- from funasr.register import tables
@tables.register("preprocessor_classes", "TextPreprocessRemovePunctuation")
class TextPreprocessSegDict(nn.Module):
    """Text preprocessor that deletes punctuation characters from a string.

    The class is registered in ``tables`` under ``preprocessor_classes`` with
    the key ``TextPreprocessRemovePunctuation``.
    NOTE(review): the class name differs from the registered key -- confirm
    this is intentional before renaming either one.

    The set of removed characters is the union of ``string.punctuation``,
    a list of commonly used Chinese punctuation marks, and the full-width
    forms of ``? ! , ; : ( ) ~``. The matching pattern is constant, so it is
    compiled once at class-definition time instead of on every call.
    """

    # Commonly used Chinese punctuation (not exhaustive).
    _CN_PUNCT = '。?!,、;:“”‘’()《》【】…—~·'

    # Full-width forms of ? ! , ; : ( ) ~ as used in Chinese text. They are
    # written as escapes so that editors or pipelines which normalize
    # full-width characters to ASCII cannot silently drop them from the set.
    _FULLWIDTH_PUNCT = "\uff1f\uff01\uff0c\uff1b\uff1a\uff08\uff09\uff5e"

    # Single character class matching any punctuation character listed above.
    # re.escape keeps ']', '^', '-' and '\\' literal inside the brackets.
    _PUNCT_PATTERN = re.compile(
        "[{}]".format(
            re.escape(string.punctuation + _CN_PUNCT + _FULLWIDTH_PUNCT)
        )
    )

    def __init__(self, **kwargs):
        """Create the preprocessor; keyword arguments are accepted and ignored."""
        super().__init__()

    def forward(self, text, **kwargs):
        """Return ``text`` with every punctuation character removed.

        Args:
            text: The string to clean.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A copy of ``text`` without English or Chinese punctuation.
            Whitespace and all other characters are left untouched.
        """
        return self._PUNCT_PATTERN.sub("", text)
|