# Model selection and model-level hyperparameters.
# NOTE(review): the per-key notes in this section are inferred from key names
# only; confirm their exact semantics against the consuming framework.
model: Paraformer
model_conf:
  # Set to 0.0 here — presumably this disables the CTC loss term; verify.
  ctc_weight: 0.0
  # Presumably the label-smoothing weight — TODO confirm.
  lsm_weight: 0.1
  length_normalized_loss: true
  predictor_weight: 1.0
  predictor_bias: 1
  sampling_ratio: 0.75
# Encoder selection and its hyperparameters.
encoder: SANMEncoder
encoder_conf:
  # Same value (512) as predictor_conf.idim in this file; presumably the two
  # must stay in sync — verify before changing either.
  output_size: 512
  attention_heads: 4
  linear_units: 2048
  num_blocks: 50
  # All three dropout settings below share the value 0.1.
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: pe
  pos_enc_class: SinusoidalPositionEncoder
  normalize_before: true
  kernel_size: 11
  # NOTE(review): the key is spelled "sanm_shfit" (sic). The identical spelling
  # is used under decoder_conf, so it is presumably the name the consumer
  # expects — do not "correct" it without checking the consuming code.
  sanm_shfit: 0
  selfattention_layer_type: sanm
# Decoder selection and its hyperparameters.
decoder: ParaformerSANMDecoder
decoder_conf:
  # attention_heads, linear_units and kernel_size carry the same values as the
  # corresponding keys under encoder_conf in this file.
  attention_heads: 4
  linear_units: 2048
  num_blocks: 16
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.1
  src_attention_dropout_rate: 0.1
  # Equal to num_blocks (16) above; presumably the number of decoder blocks
  # that include attention — TODO confirm.
  att_layer_num: 16
  kernel_size: 11
  # NOTE(review): "sanm_shfit" (sic) — same spelling as under encoder_conf;
  # presumably the key name the consumer expects, so leave as-is.
  sanm_shfit: 0
# Predictor selection and its hyperparameters.
predictor: CifPredictorV2
predictor_conf:
  # Same value (512) as encoder_conf.output_size in this file; presumably the
  # predictor's input dimension — verify before changing either.
  idim: 512
  threshold: 1.0
  # NOTE(review): l_order / r_order look like left/right context sizes —
  # inferred from the names only; confirm.
  l_order: 1
  r_order: 1
  tail_threshold: 0.45
# Audio frontend selection and feature-extraction settings.
frontend: WavFrontend
frontend_conf:
  # Presumably the sample rate in Hz — TODO confirm.
  fs: 16000
  window: hamming
  n_mels: 80
  # Presumably milliseconds (25 / 10) — units are not stated here; confirm.
  frame_length: 25
  frame_shift: 10
  # NOTE(review): n_mels (80) * lfr_m (7) = 560, which equals the top-level
  # input_size in this file; lfr_n (6) equals specaug_conf.lfr_rate.
  # Presumably these must be kept consistent — verify before editing.
  lfr_m: 7
  lfr_n: 6
  # Relative path; presumably resolved against the process working directory.
  cmvn_file: ./speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn
# SpecAugment-style augmentation selection and settings.
specaug: SpecAugLFR
specaug_conf:
  # Time warping is switched off; the two time_warp_* values below are
  # presumably unused while this is false — confirm.
  apply_time_warp: false
  time_warp_window: 5
  time_warp_mode: bicubic
  apply_freq_mask: true
  # Two-element list; presumably [min, max] mask width — TODO confirm.
  freq_mask_width_range:
  - 0
  - 30
  # Same value (6) as frontend_conf.lfr_n in this file.
  lfr_rate: 6
  num_freq_mask: 1
  apply_time_mask: true
  # Two-element list; presumably [min, max] mask width — TODO confirm.
  time_mask_width_range:
  - 0
  - 12
  num_time_mask: 1
# Training-loop settings: epochs, checkpointing, logging and DeepSpeed.
train_conf:
  accum_grad: 1
  grad_clip: 5
  max_epoch: 5
  # Pair of (split, metric) — here validation accuracy.
  val_scheduler_criterion:
  - valid
  - acc
  # List of (split, metric, mode) triples; the single entry here is
  # valid / acc / max.
  best_model_criterion:
  - - valid
    - acc
    - max
  keep_nbest_models: 100
  log_interval: 500
  resume: true
  # validate_interval and save_checkpoint_interval share the value 5000;
  # presumably counted in training steps — TODO confirm.
  validate_interval: 5000
  save_checkpoint_interval: 5000
  avg_nbest_model: 10
  # DeepSpeed is disabled; deepspeed_config is presumably ignored while
  # use_deepspeed is false — confirm.
  use_deepspeed: false
  deepspeed_config: ./config/ds_stage1.json
# Optimizer and learning-rate scheduler selection with their arguments.
optim: adam
optim_conf:
  lr: 0.0002
scheduler: warmuplr
scheduler_conf:
  # NOTE(review): presumably the number of optimizer steps of warm-up;
  # confirm this is sensible relative to the planned run length (max_epoch: 5).
  warmup_steps: 30000
# Dataset / data-loading selection and batching settings.
dataset: AudioDataset
dataset_conf:
  index_ds: IndexDSJsonl
  batch_sampler: BatchSampler
  # batch_type is "token", so batch_size (300) is presumably measured in
  # tokens rather than utterances — TODO confirm against the sampler.
  batch_type: token
  batch_size: 300
  max_token_length: 2048
  buffer_size: 500
  shuffle: true
  num_workers: 4
  data_split_num: 1
  sort_size: 1024
# Tokenizer selection and its resource files.
tokenizer: CharTokenizer
tokenizer_conf:
  unk_symbol: <unk>
  split_with_space: true
  # Both paths are relative and point into the same model directory as
  # frontend_conf.cmvn_file; presumably resolved against the working directory.
  token_list: ./speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/tokens.json
  seg_dict_file: ./speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/seg_dict
# NOTE(review): 560 equals frontend_conf.n_mels (80) * frontend_conf.lfr_m (7)
# in this file; presumably the feature dimension fed to the encoder — verify
# before changing any of the three values.
input_size: 560
# CTC settings. model_conf.ctc_weight is 0.0 in this file, so these are
# presumably inactive during loss computation — confirm.
ctc_conf:
  dropout_rate: 0.0
  ctc_type: builtin
  reduce: true
  ignore_nan_grad: true
# Explicit null: no normalize component is configured.
normalize: null
# Run-level paths and flags.
# NOTE(review): init_param, config and model_path are absolute paths under one
# user's home directory, while the data and output paths below are relative.
# These are machine-specific — confirm or override them before reuse elsewhere.
init_param: /home/work_nfs9/sywang/code/paraformer/outputs/model.pt
config: /home/work_nfs9/sywang/code/paraformer/outputs/config.yaml
is_training: true
train_data_set_list: data/train.jsonl
valid_data_set_list: data/val.jsonl
output_dir: ./outputs
model_path: /home/work_nfs9/sywang/code/paraformer/outputs
# NOTE(review): device is "cpu" while is_training is true — confirm this is
# intended for the target run.
device: cpu