This repository contains the code for the paper "How Can Contrastive Pre-training Benefit Audio-Visual Segmentation? A Study from Supervised and Zero-shot Perspectives", published at BMVC 2023. This work explores the benefits of contrastive pre-training for audio-visual segmentation (AVS) in two settings:
- Zero-shot setting. In this setting, the pre-trained models work together with the Segment Anything Model (SAM) to achieve zero-shot AVS.
- Supervised setting. In this setting, the work mainly explores how much the segmentor can gain when contrastively pre-trained model weights are used to initialize the backbone.
 
./environment_config: the yml files of the conda environments used in this work.
conda env create -n ZSAVS -f Zero_shot_AVS.yml
conda env create -n audio_seg -f audio_seg_config.yml
For the dataset, we follow AVSBench (ECCV 2022). You can also access the data via our Google Drive. Then, put the data into the avsbench_data folder.
python preprocess_s4.py
python preprocess_ms3.py
The pretrained models we used in this work include AudioCLIP, CLIP, ESResNeXt, SAM, VGGish, resnet50 and PVT.
For AudioCLIP, CLIP and ESResNeXt, you can access the pre-trained checkpoints from AudioCLIP releases.
For SAM, you can access the pre-trained checkpoint from SAM-VIT-H.
For VGGish, resnet50 and PVT, we follow the AVSBench pretrained backbones.
Sam4AVS
├─ avs_scripts
 ├─ avs_ms3
 ├─ avs_ms3_aclp
 ├─ avs_s4
 ├─ avs_s4_aclp
 ├─ avs_ms3_aclp_ablation
 ├─ avs_s4_aclp_ablation
 └─ avs_s4_zero_shot_sam
├─ avsbench_data
 ├─ Multi-sources
  └─ ms3_data
   ├─ audio_log_mel
   ├─ audio_wav
   ├─ gt_masks
   ├─ raw_videos
   └─ visual_frames
 ├─ Single-source
  └─ s4_data
   ├─ audio_log_mel
   ├─ audio_wav
   ├─ gt_masks
   ├─ raw_videos
   └─ visual_frames
 └─ train_logs
├─ preprocess_scripts
 ├─ preprocess_ms3.py
 └─ preprocess_s4.py
└─ pretrained_backbones
 ├─ AudioCLIP-Full-Training.pt
 ├─ AudioCLIP-Partial-Training.pt
 ├─ bpe_simple_vocab_16e6.txt.gz
 ├─ CLIP.pt
 ├─ ESRNXFBSP.pt
 ├─ pvt_v2_b5.pth
 ├─ resnet50-19c8e357.pth
 ├─ sam_vit_h_4b8939.pth
 └─ vggish-10086976.pth
conda activate audio_seg
For single-source AVS
cd avs_scripts/avs_s4_aclp
bash train_bashes/train_fully_audiocliprealfpn_visual_training_Adam0.00005_lr_mult_batch_4_concate_fusion_bilinear.sh
For multi-source AVS
cd avs_scripts/avs_ms3_aclp
bash train_bashes/train_fully_audiocliprealfpn_visual_training_Adam0.00005_lr_mult_batch_4_concate_fusion_bilinear.sh
For single-source AVS ablation
cd avs_scripts/avs_s4_aclp_ablation
bash train_bashes/*.sh
For multi-source AVS ablation
cd avs_scripts/avs_ms3_aclp_ablation
bash train_bashes/*.sh
conda activate ZSAVS
For single-source AVS
cd ./zero_shot/S4
python no_prompt.py
For multi-source AVS
cd ./zero_shot/MS3
python no_prompt.py
For single-source AVS
- Point-prompt(local)
 
cd avs_scripts/avs_s4_zero_shot_sam
bash train_bashes/CLIP_surgery_reverse_0.6_peak_maxscore_Full_none.sh
- Point-prompt(global)
 
cd avs_scripts/avs_s4_zero_shot_sam
bash train_bashes/CLIP_surgery_reverse_top_maxscore_Full_none.sh
- Point-prompt(dense)
 
cd avs_scripts/avs_s4_zero_shot_sam
bash train_bashes/CLIP_surgery_reverse_0.85_dense_maxscore_Full_none.sh
For multi-source AVS
- Point-prompt(local)
 
cd avs_scripts/avs_ms3_zero_shot_sam
bash train_bashes/Multi_CLIP_surgery_reverse_0.65_peak_maxscore_Full_none.sh
- Point-prompt(global)
 
cd avs_scripts/avs_ms3_zero_shot_sam
bash train_bashes/Multi_CLIP_surgery_reverse_top_maxscore_Full_none.sh
- Point-prompt(dense)
 
cd avs_scripts/avs_ms3_zero_shot_sam
bash train_bashes/Multi_CLIP_surgery_reverse_0.7_dense_maxscore_Full_none.sh
For single-source AVS
cd avs_scripts/avs_s4_zero_shot_sam
bash heatmap_box_prompt/box_prompt_CLIP_surgery_reverse_0.55_single_box_maxarea_Full_none.sh
For multi-source AVS
cd avs_scripts/avs_ms3_zero_shot_sam
bash heatmap_box_prompt/multi_box_prompt_CLIP_surgery_reverse_0.55_single_box_maxarea_Full_none.sh
For single-source AVS
cd ./zero_shot/S4
python box_prompt.py
For multi-source AVS
cd ./zero_shot/MS3
python box_prompt.py