- # FunASR UniASR model configuration.
- # This is an example that demonstrates how to configure a model file.
- # You can modify the configuration according to your own requirements.
- # To print the register table, run:
- #   from funasr.register import tables
- #   tables.print()
- # network architecture
- model: UniASR
- model_conf:
- ctc_weight: 0.0
- lsm_weight: 0.1
- length_normalized_loss: true
- predictor_weight: 1.0
- decoder_attention_chunk_type: chunk
- ctc_weight2: 0.0
- predictor_weight2: 1.0
- decoder_attention_chunk_type2: chunk
- loss_weight_model1: 0.5
- # encoder
- encoder: SANMEncoderChunkOpt
- encoder_conf:
- output_size: 320
- attention_heads: 4
- linear_units: 1280
- num_blocks: 35
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- attention_dropout_rate: 0.1
- input_layer: pe
- pos_enc_class: SinusoidalPositionEncoder
- normalize_before: true
- kernel_size: 11
- sanm_shfit: 0
- selfattention_layer_type: sanm
- chunk_size:
- - 20
- - 60
- stride:
- - 10
- - 40
- pad_left:
- - 5
- - 10
- encoder_att_look_back_factor:
- - 0
- - 0
- decoder_att_look_back_factor:
- - 0
- - 0
- # decoder
- decoder: FsmnDecoderSCAMAOpt
- decoder_conf:
- attention_dim: 256
- attention_heads: 4
- linear_units: 1024
- num_blocks: 12
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- self_attention_dropout_rate: 0.1
- src_attention_dropout_rate: 0.1
- att_layer_num: 6
- kernel_size: 11
- concat_embeds: true
- # predictor
- predictor: CifPredictorV2
- predictor_conf:
- idim: 320
- threshold: 1.0
- l_order: 1
- r_order: 1
- # encoder2
- encoder2: SANMEncoderChunkOpt
- encoder2_conf:
- output_size: 320
- attention_heads: 4
- linear_units: 1280
- num_blocks: 20
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- attention_dropout_rate: 0.1
- input_layer: pe
- pos_enc_class: SinusoidalPositionEncoder
- normalize_before: true
- kernel_size: 21
- sanm_shfit: 0
- selfattention_layer_type: sanm
- chunk_size:
- - 45
- - 70
- stride:
- - 35
- - 50
- pad_left:
- - 5
- - 10
- encoder_att_look_back_factor:
- - 0
- - 0
- decoder_att_look_back_factor:
- - 0
- - 0
- # decoder2
- decoder2: FsmnDecoderSCAMAOpt
- decoder2_conf:
- attention_dim: 320
- attention_heads: 4
- linear_units: 1280
- num_blocks: 12
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- self_attention_dropout_rate: 0.1
- src_attention_dropout_rate: 0.1
- att_layer_num: 6
- kernel_size: 11
- concat_embeds: true
- predictor2: CifPredictorV2
- predictor2_conf:
- idim: 320
- threshold: 1.0
- l_order: 1
- r_order: 1
- stride_conv: stride_conv1d
- stride_conv_conf:
- kernel_size: 2
- stride: 2
- pad:
- - 0
- - 1
- # frontend related
- frontend: WavFrontend
- frontend_conf:
- fs: 16000
- window: hamming
- n_mels: 80
- frame_length: 25
- frame_shift: 10
- lfr_m: 7
- lfr_n: 6
- dither: 0.0
- specaug: SpecAugLFR
- specaug_conf:
- apply_time_warp: false
- time_warp_window: 5
- time_warp_mode: bicubic
- apply_freq_mask: true
- freq_mask_width_range:
- - 0
- - 30
- lfr_rate: 6
- num_freq_mask: 1
- apply_time_mask: true
- time_mask_width_range:
- - 0
- - 12
- num_time_mask: 1
- train_conf:
- accum_grad: 1
- grad_clip: 5
- max_epoch: 150
- keep_nbest_models: 10
- avg_nbest_model: 5
- log_interval: 50
- optim: adam
- optim_conf:
- lr: 0.0001
- scheduler: warmuplr
- scheduler_conf:
- warmup_steps: 30000
- dataset: AudioDataset
- dataset_conf:
- index_ds: IndexDSJsonl
- batch_sampler: DynamicBatchLocalShuffleSampler
- batch_type: example # example or length
- batch_size: 1 # if batch_type is example, batch_size is the number of samples; if batch_type is length, batch_size is source_token_len + target_token_len
- max_token_length: 2048 # filter out samples whose source_token_len + target_token_len exceeds max_token_length
- buffer_size: 500
- shuffle: True
- num_workers: 0
- tokenizer: CharTokenizer
- tokenizer_conf:
- unk_symbol: <unk>
- split_with_space: true
- ctc_conf:
- dropout_rate: 0.0
- ctc_type: builtin
- reduce: true
- ignore_nan_grad: true
- normalize: null
|