# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# To print the register table:
#   from funasr.register import tables
#   tables.print()

# network architecture
# model: funasr.models.paraformer.model:Paraformer
model: BiCifParaformer
model_conf:
    ctc_weight: 0.0
    lsm_weight: 0.1
    length_normalized_loss: true
    predictor_weight: 1.0
    predictor_bias: 1
    sampling_ratio: 0.75
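# Note: with ctc_weight set to 0.0 the auxiliary CTC branch contributes nothing to the
# training objective, so the loss reduces to the label-smoothed attention cross-entropy
# (lsm_weight) plus the CIF predictor's quantity loss scaled by predictor_weight.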
# encoder
encoder: SANMEncoder
encoder_conf:
    output_size: 512
    attention_heads: 4
    linear_units: 2048
    num_blocks: 50
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: pe
    pos_enc_class: SinusoidalPositionEncoder
    normalize_before: true
    kernel_size: 11
    sanm_shfit: 0
    selfattention_layer_type: sanm
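# The SANM layers follow SAN-M, i.e. self-attention combined with a DFSMN-style memory
# block; kernel_size is the width of that memory block's convolution and sanm_shfit
# controls its left/right shift.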
# decoder
decoder: ParaformerSANMDecoder
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 16
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
    att_layer_num: 16
    kernel_size: 11
    sanm_shfit: 0
# predictor
predictor: CifPredictorV3
predictor_conf:
    idim: 512
    threshold: 1.0
    l_order: 1
    r_order: 1
    tail_threshold: 0.45
    smooth_factor2: 0.25
    noise_threshold2: 0.01
    upsample_times: 3
    use_cif1_cnn: false
    upsample_type: cnn_blstm
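# CIF (continuous integrate-and-fire): the predictor emits a weight per encoder frame,
# integrates it, and fires a token boundary each time the accumulated weight reaches
# threshold; tail_threshold lets a trailing partial accumulation still emit the final
# token. CifPredictorV3 additionally upsamples the weights (upsample_times, cnn_blstm)
# back to the frame level, which BiCifParaformer uses for timestamp prediction.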
# frontend related
frontend: WavFrontend
frontend_conf:
    fs: 16000
    window: hamming
    n_mels: 80
    frame_length: 25
    frame_shift: 10
    lfr_m: 7
    lfr_n: 6
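# Low frame rate (LFR): lfr_m: 7 stacks 7 consecutive 80-dim fbank frames into one
# 560-dim feature and lfr_n: 6 advances by 6 frames, so the effective frame shift grows
# from 10 ms to 60 ms and the encoder consumes 560-dim inputs.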
# spec augmentation
specaug: SpecAugLFR
specaug_conf:
    apply_time_warp: false
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    lfr_rate: 6
    num_freq_mask: 1
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 12
    num_time_mask: 1
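# SpecAugment on the LFR features: time warping is disabled; each utterance receives one
# frequency mask of width 0-30 mel bins and one time mask of width 0-12 frames.
# lfr_rate carries the frontend's LFR setting so that masking can account for the
# stacked features.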
# training related
train_conf:
    accum_grad: 1
    grad_clip: 5
    max_epoch: 150
    val_scheduler_criterion:
    - valid
    - acc
    best_model_criterion:
    - - valid
      - acc
      - max
    keep_nbest_models: 10
    log_interval: 50

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 30000
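# warmuplr ramps the learning rate up linearly for warmup_steps optimizer steps and then
# decays it with the inverse square root of the step count (Noam-style), roughly:
#   lr_t = lr * warmup_steps**0.5 * min(t**-0.5, t * warmup_steps**-1.5)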
# dataset related
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
    batch_sampler: DynamicBatchLocalShuffleSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the number of samples; if length, it is source_token_len + target_token_len
    max_token_length: 2048 # drop samples whose source_token_len + target_token_len exceeds max_token_length
    buffer_size: 500
    shuffle: true
    num_workers: 0
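# IndexDSJsonl reads a jsonl index with one JSON object per line describing an utterance
# (audio path, transcript, and their token lengths). A typical entry, assuming the field
# names used by FunASR's scp2jsonl data preparation, looks like:
#   {"key": "utt1", "source": "/data/wav/utt1.wav", "source_len": 998, "target": "...", "target_len": 5}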
# tokenizer related
tokenizer: CharTokenizer
tokenizer_conf:
    unk_symbol: <unk>
    split_with_space: true
# ctc related
ctc_conf:
    dropout_rate: 0.0
    ctc_type: builtin
    reduce: true
    ignore_nan_grad: true

normalize: null
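# Example of running inference with a model directory that contains this config
# (a sketch; replace the model and audio paths with your own):
#   from funasr import AutoModel
#   model = AutoModel(model="/path/to/model_dir")
#   res = model.generate(input="/path/to/audio.wav")
#   print(res)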