2 лет назад · 280593676b
--- a/egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml
+++ b/egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml
@@ -1,80 +0,0 @@
 
				-encoder: conformer
			
 
				-encoder_conf:
			
 
				-    output_size: 512
			
 
				-    attention_heads: 8
			
 
				-    linear_units: 2048
			
 
				-    num_blocks: 12
			
 
				-    dropout_rate: 0.1
			
 
				-    positional_dropout_rate: 0.1
			
 
				-    attention_dropout_rate: 0.1
			
 
				-    input_layer: conv2d
			
 
				-    normalize_before: true
			
 
				-    macaron_style: true
			
 
				-    rel_pos_type: latest
			
 
				-    pos_enc_layer_type: rel_pos
			
 
				-    selfattention_layer_type: rel_selfattn
			
 
				-    activation_type: swish
			
 
				-    use_cnn_module: true
			
 
				-    cnn_module_kernel: 31
			
 
				-
			
 
				-decoder: transformer
			
 
				-decoder_conf:
			
 
				-    attention_heads: 8
			
 
				-    linear_units: 2048
			
 
				-    num_blocks: 6
			
 
				-    dropout_rate: 0.1
			
 
				-    positional_dropout_rate: 0.1
			
 
				-    self_attention_dropout_rate: 0.1
			
 
				-    src_attention_dropout_rate: 0.1
			
 
				-
			
 
				-model_conf:
			
 
				-    ctc_weight: 0.3
			
 
				-    lsm_weight: 0.1
			
 
				-    length_normalized_loss: false
			
 
				-
			
 
				-accum_grad: 2
			
 
				-max_epoch: 50
			
 
				-patience: none
			
 
				-init: none
			
 
				-best_model_criterion:
			
 
				--   - valid
			
 
				-    - acc
			
 
				-    - max
			
 
				-keep_nbest_models: 10
			
 
				-
			
 
				-optim: adam
			
 
				-optim_conf:
			
 
				-    lr: 0.0025
			
 
				-    weight_decay: 0.000001
			
 
				-scheduler: warmuplr
			
 
				-scheduler_conf:
			
 
				-    warmup_steps: 40000
			
 
				-
			
 
				-specaug: specaug
			
 
				-specaug_conf:
			
 
				-    apply_time_warp: true
			
 
				-    time_warp_window: 5
			
 
				-    time_warp_mode: bicubic
			
 
				-    apply_freq_mask: true
			
 
				-    freq_mask_width_range:
			
 
				-    - 0
			
 
				-    - 27
			
 
				-    num_freq_mask: 2
			
 
				-    apply_time_mask: true
			
 
				-    time_mask_width_ratio_range:
			
 
				-    - 0.
			
 
				-    - 0.05
			
 
				-    num_time_mask: 10
			
 
				-
			
 
				-dataset_conf:
			
 
				-    shuffle: True
			
 
				-    shuffle_conf:
			
 
				-        shuffle_size: 1024
			
 
				-        sort_size: 500
			
 
				-    batch_conf:
			
 
				-        batch_type: token
			
 
				-        batch_size: 10000
			
 
				-    num_workers: 8
			
 
				-
			
 
				-log_interval: 50
			
 
				-normalize: utterance_mvn
			
--- a/egs/librispeech_100h/conformer/conf/train_asr_conformer.yaml
+++ b/egs/librispeech_100h/conformer/conf/train_asr_conformer.yaml
@@ -1,8 +1,8 @@
 
				 encoder: conformer
			
 
				 encoder_conf:
			
 
				-    output_size: 512
			
 
				-    attention_heads: 8
			
 
				-    linear_units: 2048
			
 
				+    output_size: 256
			
 
				+    attention_heads: 4
			
 
				+    linear_units: 1024
			
 
				     num_blocks: 12
			
 
				     dropout_rate: 0.1
			
 
				     positional_dropout_rate: 0.1
			
@@ -19,7 +19,7 @@ encoder_conf:
 
				 
			
 
				 decoder: transformer
			
 
				 decoder_conf:
			
 
				-    attention_heads: 8
			
 
				+    attention_heads: 4
			
 
				     linear_units: 2048
			
 
				     num_blocks: 6
			
 
				     dropout_rate: 0.1
			
@@ -27,13 +27,25 @@ decoder_conf:
 
				     self_attention_dropout_rate: 0.1
			
 
				     src_attention_dropout_rate: 0.1
			
 
				 
			
 
				+# frontend related
			
 
				+frontend: wav_frontend
			
 
				+frontend_conf:
			
 
				+    fs: 16000
			
 
				+    window: hamming
			
 
				+    n_mels: 80
			
 
				+    frame_length: 25
			
 
				+    frame_shift: 10
			
 
				+    lfr_m: 1
			
 
				+    lfr_n: 1
			
 
				+
			
 
				+# hybrid CTC/attention
			
 
				 model_conf:
			
 
				     ctc_weight: 0.3
			
 
				     lsm_weight: 0.1
			
 
				     length_normalized_loss: false
			
 
				 
			
 
				-accum_grad: 2
			
 
				-max_epoch: 50
			
 
				+accum_grad: 1
			
 
				+max_epoch: 210
			
 
				 patience: none
			
 
				 init: none
			
 
				 best_model_criterion:
			
@@ -44,11 +56,11 @@ keep_nbest_models: 10
 
				 
			
 
				 optim: adam
			
 
				 optim_conf:
			
 
				-    lr: 0.0025
			
 
				+    lr: 0.002
			
 
				     weight_decay: 0.000001
			
 
				 scheduler: warmuplr
			
 
				 scheduler_conf:
			
 
				-    warmup_steps: 40000
			
 
				+    warmup_steps: 15000
			
 
				 
			
 
				 specaug: specaug
			
 
				 specaug_conf:
			
@@ -64,7 +76,7 @@ specaug_conf:
 
				     time_mask_width_ratio_range:
			
 
				     - 0.
			
 
				     - 0.05
			
 
				-    num_time_mask: 10
			
 
				+    num_time_mask: 5
			
 
				 
			
 
				 dataset_conf:
			
 
				     shuffle: True
			
--- a/egs/librispeech_100h/conformer/conf/train_asr_conformer_uttnorm.yaml
+++ b/egs/librispeech_100h/conformer/conf/train_asr_conformer_uttnorm.yaml
@@ -1,80 +0,0 @@
 
				-encoder: conformer
			
 
				-encoder_conf:
			
 
				-    output_size: 512
			
 
				-    attention_heads: 8
			
 
				-    linear_units: 2048
			
 
				-    num_blocks: 12
			
 
				-    dropout_rate: 0.1
			
 
				-    positional_dropout_rate: 0.1
			
 
				-    attention_dropout_rate: 0.1
			
 
				-    input_layer: conv2d
			
 
				-    normalize_before: true
			
 
				-    macaron_style: true
			
 
				-    rel_pos_type: latest
			
 
				-    pos_enc_layer_type: rel_pos
			
 
				-    selfattention_layer_type: rel_selfattn
			
 
				-    activation_type: swish
			
 
				-    use_cnn_module: true
			
 
				-    cnn_module_kernel: 31
			
 
				-
			
 
				-decoder: transformer
			
 
				-decoder_conf:
			
 
				-    attention_heads: 8
			
 
				-    linear_units: 2048
			
 
				-    num_blocks: 6
			
 
				-    dropout_rate: 0.1
			
 
				-    positional_dropout_rate: 0.1
			
 
				-    self_attention_dropout_rate: 0.1
			
 
				-    src_attention_dropout_rate: 0.1
			
 
				-
			
 
				-model_conf:
			
 
				-    ctc_weight: 0.3
			
 
				-    lsm_weight: 0.1
			
 
				-    length_normalized_loss: false
			
 
				-
			
 
				-accum_grad: 2
			
 
				-max_epoch: 50
			
 
				-patience: none
			
 
				-init: none
			
 
				-best_model_criterion:
			
 
				--   - valid
			
 
				-    - acc
			
 
				-    - max
			
 
				-keep_nbest_models: 10
			
 
				-
			
 
				-optim: adam
			
 
				-optim_conf:
			
 
				-    lr: 0.0025
			
 
				-    weight_decay: 0.000001
			
 
				-scheduler: warmuplr
			
 
				-scheduler_conf:
			
 
				-    warmup_steps: 40000
			
 
				-
			
 
				-specaug: specaug
			
 
				-specaug_conf:
			
 
				-    apply_time_warp: true
			
 
				-    time_warp_window: 5
			
 
				-    time_warp_mode: bicubic
			
 
				-    apply_freq_mask: true
			
 
				-    freq_mask_width_range:
			
 
				-    - 0
			
 
				-    - 27
			
 
				-    num_freq_mask: 2
			
 
				-    apply_time_mask: true
			
 
				-    time_mask_width_ratio_range:
			
 
				-    - 0.
			
 
				-    - 0.05
			
 
				-    num_time_mask: 10
			
 
				-
			
 
				-dataset_conf:
			
 
				-    shuffle: True
			
 
				-    shuffle_conf:
			
 
				-        shuffle_size: 1024
			
 
				-        sort_size: 500
			
 
				-    batch_conf:
			
 
				-        batch_type: token
			
 
				-        batch_size: 10000
			
 
				-    num_workers: 8
			
 
				-
			
 
				-log_interval: 50
			
 
				-normalize: utterance_mvn