# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()

# network architecture
model: UniASR
model_conf:
  ctc_weight: 0.0
  lsm_weight: 0.1
  length_normalized_loss: true
  predictor_weight: 1.0
  decoder_attention_chunk_type: chunk
  ctc_weight2: 0.0
  predictor_weight2: 1.0
  decoder_attention_chunk_type2: chunk
  # relative loss weight of model1 vs model2 (model2 gets 1 - this value)
  loss_weight_model1: 0.5

# encoder (first pass, low-latency)
encoder: SANMEncoderChunkOpt
encoder_conf:
  output_size: 320
  attention_heads: 4
  linear_units: 1280
  num_blocks: 35
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: pe
  pos_enc_class: SinusoidalPositionEncoder
  normalize_before: true
  kernel_size: 11
  # NOTE: "sanm_shfit" is the spelling the framework registers — do not "fix" it
  sanm_shfit: 0
  selfattention_layer_type: sanm
  # two entries per list: [training-time value, decoding-time value]
  chunk_size:
    - 20
    - 60
  stride:
    - 10
    - 40
  pad_left:
    - 5
    - 10
  encoder_att_look_back_factor:
    - 0
    - 0
  decoder_att_look_back_factor:
    - 0
    - 0

# decoder (first pass)
decoder: FsmnDecoderSCAMAOpt
decoder_conf:
  attention_dim: 256
  attention_heads: 4
  linear_units: 1024
  num_blocks: 12
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.1
  src_attention_dropout_rate: 0.1
  att_layer_num: 6
  kernel_size: 11
  concat_embeds: true

# predictor (first pass)
predictor: CifPredictorV2
predictor_conf:
  idim: 320
  threshold: 1.0
  l_order: 1
  r_order: 1

# encoder2 (second pass, higher-accuracy)
encoder2: SANMEncoderChunkOpt
encoder2_conf:
  output_size: 320
  attention_heads: 4
  linear_units: 1280
  num_blocks: 20
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: pe
  pos_enc_class: SinusoidalPositionEncoder
  normalize_before: true
  kernel_size: 21
  sanm_shfit: 0
  selfattention_layer_type: sanm
  chunk_size:
    - 45
    - 70
  stride:
    - 35
    - 50
  pad_left:
    - 5
    - 10
  encoder_att_look_back_factor:
    - 0
    - 0
  decoder_att_look_back_factor:
    - 0
    - 0

# decoder2 (second pass)
decoder2: FsmnDecoderSCAMAOpt
decoder2_conf:
  attention_dim: 320
  attention_heads: 4
  linear_units: 1280
  num_blocks: 12
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.1
  src_attention_dropout_rate: 0.1
  att_layer_num: 6
  kernel_size: 11
  concat_embeds: true

# predictor2 (second pass)
predictor2: CifPredictorV2
predictor2_conf:
  idim: 320
  threshold: 1.0
  l_order: 1
  r_order: 1

# strided conv bridging pass 1 to pass 2
stride_conv: stride_conv1d
stride_conv_conf:
  kernel_size: 2
  stride: 2
  pad:
    - 0
    - 1

# frontend related
frontend: WavFrontend
frontend_conf:
  fs: 16000
  window: hamming
  n_mels: 80
  frame_length: 25
  frame_shift: 10
  lfr_m: 7
  lfr_n: 6
  dither: 0.0

# spectral augmentation (LFR-aware)
specaug: SpecAugLFR
specaug_conf:
  apply_time_warp: false
  time_warp_window: 5
  time_warp_mode: bicubic
  apply_freq_mask: true
  freq_mask_width_range:
    - 0
    - 30
  lfr_rate: 6
  num_freq_mask: 1
  apply_time_mask: true
  time_mask_width_range:
    - 0
    - 12
  num_time_mask: 1

# training
train_conf:
  accum_grad: 1
  grad_clip: 5
  max_epoch: 150
  keep_nbest_models: 10
  avg_nbest_model: 5
  log_interval: 50

optim: adam
optim_conf:
  lr: 0.0001
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 30000

# dataset
dataset: AudioDataset
dataset_conf:
  index_ds: IndexDSJsonl
  batch_sampler: DynamicBatchLocalShuffleSampler
  batch_type: example # example or length
  batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
  max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
  buffer_size: 500
  shuffle: true
  num_workers: 0

# tokenizer
tokenizer: CharTokenizer
tokenizer_conf:
  unk_symbol: '<unk>'
  split_with_space: true

# CTC
ctc_conf:
  dropout_rate: 0.0
  ctc_type: builtin
  reduce: true
  ignore_nan_grad: true

normalize: null