# Model configuration: a nested tree of `_target_` entries rooted at
# sam2.modeling.sam2_base.SAM2Base. Each `_target_` names a class and the
# sibling keys are the arguments given to it (presumably resolved through
# Hydra's instantiate mechanism -- confirm against the loader).
#
# NOTE(review): the nesting below was reconstructed from the `_target_`
# markers after the original indentation was lost. Placements that could not
# be decided from the text alone are flagged inline; verify them against the
# constructor signatures of the target classes.
# NOTE(review): the first row of the damaged file looked like a stripped
# comment. Hydra configs sometimes open with an "@package" header directive;
# confirm whether one was lost here.

model:
  _target_: sam2.modeling.sam2_base.SAM2Base

  # Image encoder: a Hiera trunk followed by an FpnNeck (per the class names).
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 144
      num_heads: 2
      # The four stage sizes sum to 48; the global_att_blocks values below
      # (23, 33, 43) all fall inside that range.
      stages: [2, 6, 36, 4]
      global_att_blocks: [23, 33, 43]
      window_pos_embed_bkg_spatial_size: [7, 7]
      window_spec: [8, 4, 16, 8]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      # NOTE(review): d_model and the keys after it are placed on the neck,
      # not on position_encoding -- confirm against FpnNeck's signature.
      d_model: 256
      # Equals embed_dim (144) multiplied by 8, 4, 2 and 1.
      backbone_channel_list: [1152, 576, 288, 144]
      fpn_top_down_levels: [2, 3]
      fpn_interp_model: nearest

  # Memory attention: num_layers copies of a layer holding one self-attention
  # and one cross-attention module (both RoPEAttention).
  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        # NOTE(review): with image_size 1024 a stride-16 feature map would be
        # 64x64, not 32x32 -- confirm what RoPEAttention expects here.
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      # NOTE(review): d_model and the two pos_enc_at_cross_attn_* keys are
      # placed on the layer, not on self_attention -- confirm.
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        # Same value as memory_encoder.out_dim below.
        kv_in_dim: 64
    # NOTE(review): placed on memory_attention (layer count) -- confirm.
    num_layers: 4

  # Memory encoder: mask downsampler + fuser + sine position encoding.
  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        # NOTE(review): a plain YAML 1.1 loader (e.g. PyYAML) reads `1e-6` as
        # a string because there is no decimal point; OmegaConf's loader reads
        # it as a float. Confirm the loader before relying on this literal.
        layer_scale_init_value: 1e-6
        use_dwconv: True
      # NOTE(review): placed on fuser (layer count) -- confirm.
      num_layers: 2

  # Remaining keys are direct SAM2Base arguments.
  num_maskmem: 7
  image_size: 1024
  # Sigmoid scale/bias used for the memory encoder, and mask-input passthrough.
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # No-memory / no-object embedding switches.
  directly_add_no_mem_embed: true
  no_obj_embed_spatial: true
  # High-resolution features in the SAM head.
  use_high_res_features_in_sam: true
  # Multi-mask output in the SAM head.
  multimask_output_in_sam: true
  # IoU prediction option.
  iou_prediction_use_sigmoid: True
  # Object-pointer options, including their temporal position encoding.
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: true
  proj_tpos_enc_in_obj_ptrs: true
  use_signed_tpos_enc_to_obj_ptrs: true
  only_obj_ptrs_in_the_past_for_eval: true
  # Object-score prediction options.
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # Multi-mask tracking options.
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation switch for the image encoder.
  compile_image_encoder: False