# train_conformer_bat.yaml
#
# Training configuration: encoder, RNN-T decoder, predictor, joint network,
# frontend, optimisation, SpecAugment and dataset settings.
#
# NOTE(review): this file was recovered from a flattened listing in which
# nesting indentation had been lost. Child keys were re-attached to the
# nearest preceding `*_conf:` key; confirm the placement of every key against
# the consuming trainer's schema before use.
---
encoder: chunk_conformer
encoder_conf:
  activation_type: swish
  # `positional_dropout_rate` was listed twice in this mapping with the same
  # value (0.5); duplicate keys are invalid YAML, so only one entry is kept.
  positional_dropout_rate: 0.5
  time_reduction_factor: 2
  embed_vgg_like: false
  subsampling_factor: 4
  linear_units: 2048
  output_size: 512
  attention_heads: 8
  dropout_rate: 0.5
  attention_dropout_rate: 0.5
  cnn_module_kernel: 15
  num_blocks: 12

# decoder related
rnnt_decoder: rnnt
rnnt_decoder_conf:
  embed_size: 512
  hidden_size: 512
  embed_dropout_rate: 0.5
  dropout_rate: 0.5
  use_embed_mask: true

predictor: bat_predictor
predictor_conf:
  idim: 512
  threshold: 1.0
  l_order: 1
  r_order: 1
  return_accum: true

joint_network_conf:
  joint_space_size: 512

# frontend related
frontend: wav_frontend
frontend_conf:
  fs: 16000
  window: hamming
  n_mels: 80
  frame_length: 25
  frame_shift: 10
  lfr_m: 1
  lfr_n: 1

# model selection and loss weights (section originally labelled
# "Auxiliary CTC")
model: bat
model_conf:
  auxiliary_ctc_weight: 0.0
  cif_weight: 1.0
  r_d: 3
  r_u: 5

# minibatch related
use_amp: true

# optimization related
accum_grad: 1
grad_clip: 5
max_epoch: 100
val_scheduler_criterion:
  - valid
  - loss
# A list containing one [split, metric, mode] triple.
best_model_criterion:
  - - valid
    - cer_transducer
    - min
keep_nbest_models: 10

optim: adam
optim_conf:
  lr: 0.001
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 25000

specaug: specaug
specaug_conf:
  apply_time_warp: true
  time_warp_window: 5
  time_warp_mode: bicubic
  apply_freq_mask: true
  freq_mask_width_range:
    - 0
    - 40
  num_freq_mask: 2
  apply_time_mask: true
  time_mask_width_range:
    - 0
    - 50
  num_time_mask: 5

dataset_conf:
  data_names: speech,text
  data_types: sound,text
  shuffle: true
  shuffle_conf:
    shuffle_size: 2048
    sort_size: 500
  batch_conf:
    batch_type: token
    batch_size: 25000
  # NOTE(review): assumed to belong to dataset_conf rather than the top
  # level; the flattened listing does not show which. Verify.
  num_workers: 8

# NOTE(review): assumed to be a top-level key; verify.
log_interval: 50