---
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# To print the register_table:
#   from funasr.register import tables
#   tables.print()
#
# NOTE(review): this file was recovered from a flattened dump (every line had a
# bogus leading "- " and all indentation was lost). The nesting below follows
# the data2vec 2.0 / emotion2vec config layout — verify against the upstream
# funasr emotion2vec config.yaml.

# network architecture
model: Emotion2vec
model_conf:
  loss_beta: 0.0
  loss_scale: null
  depth: 8
  start_drop_path_rate: 0.0
  end_drop_path_rate: 0.0
  num_heads: 12
  # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) resolve `1e-05` as a string,
  # not a float; the consumer presumably coerces it — confirm with the loader used.
  norm_eps: 1e-05
  norm_affine: true
  encoder_dropout: 0.1
  post_mlp_drop: 0.1
  attention_dropout: 0.1
  activation_dropout: 0.0
  dropout_input: 0.0
  layerdrop: 0.05
  embed_dim: 768
  mlp_ratio: 4.0
  layer_norm_first: false
  average_top_k_layers: 8
  end_of_block_targets: false
  clone_batch: 8
  layer_norm_target_layer: false
  batch_norm_target_layer: false
  instance_norm_target_layer: true
  instance_norm_targets: false
  layer_norm_targets: false
  # EMA teacher settings (data2vec-style self-distillation).
  ema_decay: 0.999
  ema_same_dtype: true
  log_norms: true
  ema_end_decay: 0.99999
  ema_anneal_end_step: 20000
  ema_encoder_only: false
  max_update: 100000
  extractor_mode: layer_norm
  shared_decoder: null
  min_target_var: 0.1
  min_pred_var: 0.01
  supported_modality: AUDIO
  mae_init: false
  seed: 1
  skip_ema: false
  # Loss weights.
  cls_loss: 1.0
  recon_loss: 0.0
  d2v_loss: 1.0
  decoder_group: false
  adversarial_training: false
  adversarial_hidden_dim: 128
  adversarial_weight: 0.1
  cls_type: chunk
  normalize: true
  # Per-modality settings; only the audio modality is configured here.
  modalities:
    audio:
      type: AUDIO
      prenet_depth: 4
      prenet_layerdrop: 0.05
      prenet_dropout: 0.1
      start_drop_path_rate: 0.0
      end_drop_path_rate: 0.0
      num_extra_tokens: 10
      init_extra_token_zero: true
      # Masking configuration.
      mask_noise_std: 0.01
      mask_prob_min: null
      mask_prob: 0.5
      inverse_mask: false
      mask_prob_adjust: 0.05
      keep_masked_pct: 0.0
      mask_length: 5
      add_masks: false
      remove_masks: false
      mask_dropout: 0.0
      encoder_zero_mask: true
      mask_channel_prob: 0.0
      mask_channel_length: 64
      ema_local_encoder: false
      local_grad_mult: 1.0
      # ALiBi relative-position biasing.
      use_alibi_encoder: true
      alibi_scale: 1.0
      learned_alibi: false
      alibi_max_pos: null
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      learned_alibi_scale_per_layer: false
      num_alibi_heads: 12
      model_depth: 8
      # Lightweight convolutional decoder used to predict masked targets.
      decoder:
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
        input_dropout: 0.1
        add_positions_masked: false
        add_positions_all: false
        decoder_residual: true
        projection_layers: 1
        projection_ratio: 2.0
      extractor_mode: layer_norm
      # Conv feature extractor spec: "(dim, kernel, stride)" tuples, kept as a
      # quoted string and evaluated by the model code.
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      conv_pos_width: 95
      conv_pos_groups: 16
      conv_pos_depth: 5
      conv_pos_pre_ln: false