# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()

# network architecture
# model: funasr.models.paraformer.model:Paraformer
model: BiCifParaformer
model_conf:
  ctc_weight: 0.0
  lsm_weight: 0.1
  length_normalized_loss: true
  predictor_weight: 1.0
  predictor_bias: 1
  sampling_ratio: 0.75

# encoder
encoder: SANMEncoder
encoder_conf:
  output_size: 512
  attention_heads: 4
  linear_units: 2048
  num_blocks: 50
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: pe
  pos_enc_class: SinusoidalPositionEncoder
  normalize_before: true
  kernel_size: 11
  sanm_shfit: 0
  selfattention_layer_type: sanm

# decoder
decoder: ParaformerSANMDecoder
decoder_conf:
  attention_heads: 4
  linear_units: 2048
  num_blocks: 16
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.1
  src_attention_dropout_rate: 0.1
  att_layer_num: 16
  kernel_size: 11
  sanm_shfit: 0

# predictor (CIF v3 adds the upsampling branch used by BiCifParaformer)
predictor: CifPredictorV3
predictor_conf:
  idim: 512
  threshold: 1.0
  l_order: 1
  r_order: 1
  tail_threshold: 0.45
  smooth_factor2: 0.25
  noise_threshold2: 0.01
  upsample_times: 3
  use_cif1_cnn: false
  upsample_type: cnn_blstm

# frontend related
frontend: WavFrontend
frontend_conf:
  fs: 16000
  window: hamming
  n_mels: 80
  frame_length: 25
  frame_shift: 10
  lfr_m: 7
  lfr_n: 6

# spec augmentation (LFR-aware variant)
specaug: SpecAugLFR
specaug_conf:
  apply_time_warp: false
  time_warp_window: 5
  time_warp_mode: bicubic
  apply_freq_mask: true
  freq_mask_width_range:
    - 0
    - 30
  lfr_rate: 6
  num_freq_mask: 1
  apply_time_mask: true
  time_mask_width_range:
    - 0
    - 12
  num_time_mask: 1

train_conf:
  accum_grad: 1
  grad_clip: 5
  max_epoch: 150
  val_scheduler_criterion:
    - valid
    - acc
  # list of (dataset, metric, direction) triples used to rank checkpoints
  best_model_criterion:
    - - valid
      - acc
      - max
  keep_nbest_models: 10
  log_interval: 50

optim: adam
optim_conf:
  lr: 0.0005
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 30000

dataset: AudioDataset
dataset_conf:
  index_ds: IndexDSJsonl
  batch_sampler: DynamicBatchLocalShuffleSampler
  batch_type: example  # example or length
  batch_size: 1  # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
  max_token_length: 2048  # filter samples if source_token_len+target_token_len > max_token_length,
  buffer_size: 500
  shuffle: true
  num_workers: 0

tokenizer: CharTokenizer
tokenizer_conf:
  unk_symbol: <unk>
  split_with_space: true

ctc_conf:
  dropout_rate: 0.0
  ctc_type: builtin
  reduce: true
  ignore_nan_grad: true

normalize: null