| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376 |
- import argparse
- from typing import Callable
- from typing import Collection
- from typing import Dict
- from typing import List
- from typing import Optional
- from typing import Tuple
- import numpy as np
- import torch
- from typeguard import check_argument_types
- from typeguard import check_return_type
- from funasr.datasets.collate_fn import CommonCollateFn
- from funasr.datasets.preprocessor import CommonPreprocessor
- from funasr.layers.abs_normalize import AbsNormalize
- from funasr.layers.global_mvn import GlobalMVN
- from funasr.layers.utterance_mvn import UtteranceMVN
- from funasr.models.data2vec import Data2VecPretrainModel
- from funasr.models.encoder.abs_encoder import AbsEncoder
- from funasr.models.encoder.data2vec_encoder import Data2VecEncoder
- from funasr.models.frontend.abs_frontend import AbsFrontend
- from funasr.models.frontend.default import DefaultFrontend
- from funasr.models.frontend.windowing import SlidingWindow
- from funasr.models.preencoder.abs_preencoder import AbsPreEncoder
- from funasr.models.preencoder.sinc import LightweightSincConvs
- from funasr.models.specaug.abs_specaug import AbsSpecAug
- from funasr.models.specaug.specaug import SpecAug
- from funasr.tasks.abs_task import AbsTask
- from funasr.text.phoneme_tokenizer import g2p_choices
- from funasr.torch_utils.initialize import initialize
- from funasr.train.class_choices import ClassChoices
- from funasr.train.trainer import Trainer
- from funasr.utils.types import float_or_none
- from funasr.utils.types import int_or_none
- from funasr.utils.types import str2bool
- from funasr.utils.types import str_or_none
# Selectable implementations for each model component.  Every ClassChoices
# object later registers a --<name> / --<name>_conf CLI option pair via
# ClassChoices.add_arguments().  All six now pass ``name`` as a keyword for
# consistency (the original mixed positional and keyword forms).
frontend_choices = ClassChoices(
    name="frontend",
    classes=dict(default=DefaultFrontend, sliding_window=SlidingWindow),
    type_check=AbsFrontend,
    default="default",
)
specaug_choices = ClassChoices(
    name="specaug",
    classes=dict(specaug=SpecAug),
    type_check=AbsSpecAug,
    default=None,
    optional=True,
)
normalize_choices = ClassChoices(
    name="normalize",
    classes=dict(
        global_mvn=GlobalMVN,
        utterance_mvn=UtteranceMVN,
    ),
    type_check=AbsNormalize,
    default=None,
    optional=True,
)
preencoder_choices = ClassChoices(
    name="preencoder",
    classes=dict(sinc=LightweightSincConvs),
    type_check=AbsPreEncoder,
    default=None,
    optional=True,
)
encoder_choices = ClassChoices(
    name="encoder",
    classes=dict(data2vec_encoder=Data2VecEncoder),
    type_check=AbsEncoder,
    default="data2vec_encoder",
)
model_choices = ClassChoices(
    name="model",
    classes=dict(data2vec=Data2VecPretrainModel),
    default="data2vec",
)
class Data2VecTask(AbsTask):
    """Task definition for data2vec self-supervised pre-training.

    Wires together CLI argument registration, the data-loader collate and
    preprocess functions, and construction of ``Data2VecPretrainModel``
    from the configured components
    (frontend -> specaug -> normalize -> preencoder -> encoder).
    """

    # If you need more than one optimizer, change this value
    num_optimizers: int = 1

    # Add variable objects configurations
    class_choices_list = [
        # --frontend and --frontend_conf
        frontend_choices,
        # --specaug and --specaug_conf
        specaug_choices,
        # --normalize and --normalize_conf
        normalize_choices,
        # --preencoder and --preencoder_conf
        preencoder_choices,
        # --encoder and --encoder_conf
        encoder_choices,
        # --model and --model_conf
        model_choices,
    ]

    # If you need to modify train() or eval() procedures, change Trainer class here
    trainer = Trainer

    @classmethod
    def add_task_arguments(cls, parser: argparse.ArgumentParser):
        """Register task- and preprocess-related options on ``parser``.

        Also appends one ``--<name>`` / ``--<name>_conf`` pair per entry of
        ``cls.class_choices_list``.
        """
        group = parser.add_argument_group(description="Task related")

        # NOTE(kamo): add_arguments(..., required=True) can't be used
        # to provide --print_config mode. Instead of it, do as
        group.add_argument(
            "--token_list",
            type=str_or_none,
            default=None,
            help="A text mapping int-id to token",
        )
        group.add_argument(
            "--init",
            # Lower-case the value before parsing so choices match.
            type=lambda x: str_or_none(x.lower()),
            default=None,
            help="The initialization method",
            choices=[
                "chainer",
                "xavier_uniform",
                "xavier_normal",
                "kaiming_uniform",
                "kaiming_normal",
                None,
            ],
        )
        group.add_argument(
            "--input_size",
            type=int_or_none,
            default=None,
            help="The number of input dimension of the feature",
        )

        group = parser.add_argument_group(description="Preprocess related")
        group.add_argument(
            "--use_preprocessor",
            type=str2bool,
            default=True,
            help="Apply preprocessing to data or not",
        )
        group.add_argument(
            "--token_type",
            type=str,
            default=None,
            choices=["bpe", "char", "word", "phn"],
            help="The text will be tokenized " "in the specified level token",
        )
        group.add_argument(
            "--feats_type",
            type=str,
            default='fbank',
            help="feats type, e.g. fbank, wav, ark_wav(needed to be scale normalization)",
        )
        group.add_argument(
            "--bpemodel",
            type=str_or_none,
            default=None,
            help="The model file of sentencepiece",
        )
        parser.add_argument(
            "--non_linguistic_symbols",
            type=str_or_none,
            help="non_linguistic_symbols file path",
        )
        parser.add_argument(
            "--cleaner",
            type=str_or_none,
            choices=[None, "tacotron", "jaconv", "vietnamese"],
            default=None,
            help="Apply text cleaning",
        )
        parser.add_argument(
            "--g2p",
            type=str_or_none,
            choices=g2p_choices,
            default=None,
            help="Specify g2p method if --token_type=phn",
        )
        parser.add_argument(
            "--speech_volume_normalize",
            type=float_or_none,
            default=None,
            help="Scale the maximum amplitude to the given value.",
        )
        parser.add_argument(
            "--rir_scp",
            type=str_or_none,
            default=None,
            help="The file path of rir scp file.",
        )
        parser.add_argument(
            "--rir_apply_prob",
            type=float,
            default=1.0,
            # Typo fixed: "THe" -> "The".
            help="The probability for applying RIR convolution.",
        )
        parser.add_argument(
            "--noise_scp",
            type=str_or_none,
            default=None,
            help="The file path of noise scp file.",
        )
        parser.add_argument(
            "--noise_apply_prob",
            type=float,
            default=1.0,
            help="The probability applying Noise adding.",
        )
        parser.add_argument(
            "--noise_db_range",
            type=str,
            default="13_15",
            help="The range of noise decibel level.",
        )
        parser.add_argument(
            "--pred_masked_weight",
            type=float,
            default=1.0,
            help="weight for predictive loss for masked frames",
        )
        parser.add_argument(
            "--pred_nomask_weight",
            type=float,
            default=0.0,
            help="weight for predictive loss for unmasked frames",
        )
        parser.add_argument(
            "--loss_weights",
            type=float,
            default=0.0,
            help="weights for additional loss terms (not first one)",
        )

        for class_choices in cls.class_choices_list:
            # Append --<name> and --<name>_conf.
            # e.g. --encoder and --encoder_conf
            class_choices.add_arguments(group)

    @classmethod
    def build_collate_fn(
        cls, args: argparse.Namespace, train: bool
    ) -> Callable[
        [Collection[Tuple[str, Dict[str, np.ndarray]]]],
        Tuple[List[str], Dict[str, torch.Tensor]],
    ]:
        """Return the mini-batch collate function (with waveform clipping)."""
        assert check_argument_types()
        return CommonCollateFn(clipping=True)

    @classmethod
    def build_preprocess_fn(
        cls, args: argparse.Namespace, train: bool
        # Annotation fixed: np.array is a function, np.ndarray is the type.
    ) -> Optional[Callable[[str, Dict[str, np.ndarray]], Dict[str, np.ndarray]]]:
        """Return the per-utterance preprocess function, or None when disabled."""
        assert check_argument_types()
        if args.use_preprocessor:
            retval = CommonPreprocessor(
                train=train,
                bpemodel=args.bpemodel,
                non_linguistic_symbols=args.non_linguistic_symbols,
                text_cleaner=args.cleaner,
                g2p_type=args.g2p,
                # NOTE(kamo): Use getattr defaults for backward compatibility
                # with old configs that lack these attributes.
                rir_scp=getattr(args, "rir_scp", None),
                rir_apply_prob=getattr(args, "rir_apply_prob", 1.0),
                noise_scp=getattr(args, "noise_scp", None),
                noise_apply_prob=getattr(args, "noise_apply_prob", 1.0),
                noise_db_range=getattr(args, "noise_db_range", "13_15"),
                # BUG FIX: the original guarded this value on
                # hasattr(args, "rir_scp") — the wrong attribute — so the
                # setting was dropped whenever rir_scp was absent.
                speech_volume_normalize=getattr(
                    args, "speech_volume_normalize", None
                ),
            )
        else:
            retval = None
        assert check_return_type(retval)
        return retval

    @classmethod
    def required_data_names(
        cls, train: bool = True, inference: bool = False
    ) -> Tuple[str, ...]:
        """Names of data entries the loader must provide (pre-training: speech only)."""
        retval = ("speech",)
        # Added for consistency with optional_data_names below.
        assert check_return_type(retval)
        return retval

    @classmethod
    def optional_data_names(
        cls, train: bool = True, inference: bool = False
    ) -> Tuple[str, ...]:
        """Names of optional data entries (none for this task)."""
        retval = ()
        assert check_return_type(retval)
        return retval

    @classmethod
    def build_model(cls, args: argparse.Namespace):
        """Assemble the data2vec pre-training model from ``args``.

        Component order: frontend -> specaug -> normalize -> preencoder
        -> encoder, then wrapped by the model class. Returns the built
        (and optionally initialized) model.
        """
        assert check_argument_types()

        # 1. frontend
        if args.input_size is None:
            # Extract features in the model
            frontend_class = frontend_choices.get_class(args.frontend)
            frontend = frontend_class(**args.frontend_conf)
            input_size = frontend.output_size()
        else:
            # Give features from data-loader; clear frontend config so the
            # dumped config reflects that no frontend is used.
            args.frontend = None
            args.frontend_conf = {}
            frontend = None
            input_size = args.input_size

        # 2. Data augmentation for spectrogram
        if args.specaug is not None:
            specaug_class = specaug_choices.get_class(args.specaug)
            specaug = specaug_class(**args.specaug_conf)
        else:
            specaug = None

        # 3. Normalization layer
        if args.normalize is not None:
            normalize_class = normalize_choices.get_class(args.normalize)
            normalize = normalize_class(**args.normalize_conf)
        else:
            normalize = None

        # 4. Pre-encoder input block
        # NOTE(kan-bayashi): Use getattr to keep the compatibility
        if getattr(args, "preencoder", None) is not None:
            preencoder_class = preencoder_choices.get_class(args.preencoder)
            preencoder = preencoder_class(**args.preencoder_conf)
            # The preencoder changes the feature dimension fed to the encoder.
            input_size = preencoder.output_size()
        else:
            preencoder = None

        # 5. Encoder
        encoder_class = encoder_choices.get_class(args.encoder)
        encoder = encoder_class(
            input_size=input_size,
            **args.encoder_conf,
        )

        # 6. Build model
        try:
            model_class = model_choices.get_class(args.model)
        except AttributeError:
            # Old configs may not carry --model; fall back to the default.
            model_class = model_choices.get_class("data2vec")
        model = model_class(
            frontend=frontend,
            specaug=specaug,
            normalize=normalize,
            preencoder=preencoder,
            encoder=encoder,
        )

        # 7. Initialize
        if args.init is not None:
            initialize(model, args.init)

        assert check_return_type(model)
        return model
|