---
# Emotion2vec model configuration template (funasr).
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# To print the register_table:
#   from funasr.register import tables
#   tables.print()

# network architecture
model: Emotion2vec
model_conf:
  # --- losses / targets ---
  loss_beta: 0.0
  loss_scale: null
  depth: 8
  start_drop_path_rate: 0.0
  end_drop_path_rate: 0.0
  num_heads: 12
  # Written as 1.0e-05 (not 1e-05): YAML 1.1 parsers such as PyYAML only
  # resolve a float when the mantissa contains a dot; bare 1e-05 would load
  # as the string "1e-05".
  norm_eps: 1.0e-05
  norm_affine: true
  encoder_dropout: 0.1
  post_mlp_drop: 0.1
  attention_dropout: 0.1
  activation_dropout: 0.0
  dropout_input: 0.0
  layerdrop: 0.05
  embed_dim: 768
  mlp_ratio: 4.0
  layer_norm_first: false
  average_top_k_layers: 8
  end_of_block_targets: false
  clone_batch: 8
  layer_norm_target_layer: false
  batch_norm_target_layer: false
  instance_norm_target_layer: true
  instance_norm_targets: false
  layer_norm_targets: false
  # --- EMA teacher schedule ---
  ema_decay: 0.999
  ema_same_dtype: true
  log_norms: true
  ema_end_decay: 0.99999
  ema_anneal_end_step: 20000
  ema_encoder_only: false
  max_update: 100000
  extractor_mode: layer_norm
  shared_decoder: null
  min_target_var: 0.1
  min_pred_var: 0.01
  supported_modality: AUDIO
  mae_init: false
  seed: 1
  skip_ema: false
  # --- loss weights ---
  cls_loss: 1.0
  recon_loss: 0.0
  d2v_loss: 1.0
  decoder_group: false
  # --- adversarial training ---
  adversarial_training: false
  adversarial_hidden_dim: 128
  adversarial_weight: 0.1
  cls_type: chunk
  normalize: true
  # NOTE(review): the nesting below (modalities -> audio -> decoder) was
  # reconstructed from the key layout of the fairseq data2vec-2.0 modality
  # configs; the source had lost all indentation. Confirm against the
  # upstream funasr template before relying on it.
  modalities:
    audio:
      type: AUDIO
      prenet_depth: 4
      prenet_layerdrop: 0.05
      prenet_dropout: 0.1
      start_drop_path_rate: 0.0
      end_drop_path_rate: 0.0
      num_extra_tokens: 10
      init_extra_token_zero: true
      # --- masking ---
      mask_noise_std: 0.01
      mask_prob_min: null
      mask_prob: 0.5
      inverse_mask: false
      mask_prob_adjust: 0.05
      keep_masked_pct: 0.0
      mask_length: 5
      add_masks: false
      remove_masks: false
      mask_dropout: 0.0
      encoder_zero_mask: true
      mask_channel_prob: 0.0
      mask_channel_length: 64
      ema_local_encoder: false
      local_grad_mult: 1.0
      # --- ALiBi positional biasing ---
      use_alibi_encoder: true
      alibi_scale: 1.0
      learned_alibi: false
      alibi_max_pos: null
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      learned_alibi_scale_per_layer: false
      num_alibi_heads: 12
      model_depth: 8
      # --- convolutional decoder ---
      decoder:
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
        input_dropout: 0.1
        add_positions_masked: false
        add_positions_all: false
        decoder_residual: true
        projection_layers: 1
        projection_ratio: 2.0
      # --- waveform feature extractor ---
      extractor_mode: layer_norm
      # Quoted: the value starts with '[' and would otherwise be parsed as a
      # YAML flow sequence. It is a Python-expression string evaluated by the
      # consumer, describing (dim, kernel, stride) conv layers.
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      conv_pos_width: 95
      conv_pos_groups: 16
      conv_pos_depth: 5
      conv_pos_pre_ln: false