---
# Experiment config layered on top of 'default.yml' (see `_base_`).
# Sets dataset sources, augmentation, model hyper-parameters and the
# training / checkpoint / evaluation schedule for the 'test_ade20k' run.
#
# NOTE(review): block nesting was restored from a whitespace-collapsed copy.
# Placement of `use_entityloss` / `use_maskloss` under `model` and of
# `text_aug` under `data` is fixed by the `${...}` interpolations below;
# the remaining nesting (e.g. `label_texts_ensemble` under `data.val`,
# `seg` under `evaluate`) follows key order -- confirm against default.yml.

_base_: 'default.yml'
# Display name in the logger.
model_name: 'test_ade20k'
output: '/mnt/petrelfs/xujilan/exps/cc12m_100/'

print_freq: 100

data:
  with_dc: false

  train:
    root_dir:
      - 's3://GCC/GCC12m/'
    meta_file:
      - '/mnt/cache/share_data/DSK_datasets/cc12m/subset/cc12m_top100_coconouns.json'
    read_from: petrel
    use_dali: true
    batch_size: 256
    input_size: 224
    test_resize: 256

    image_reader:
      type: pil
    sampler:
      type: distributed_epoch
    transforms:
      type: STANDARD
    fseek: true
    use_ranked: false

    # --- entity loss ---
    # Mirrors model.use_entityloss so the two settings cannot drift apart.
    use_entity: ${model.use_entityloss}
    mask_type: class
    use_distilbert: true

    # --- mask loss ---
    # Mirrors model.use_maskloss so the two settings cannot drift apart.
    cross_image: ${model.use_maskloss}

  val:
    type: clip
    read_from: petrel
    use_dali: true
    batch_size: 64
    num_workers: 4
    pin_memory: false
    input_size: 224
    test_resize: 256

    root_dir: '/mnt/cache/share/images/val/'
    # You can change this to the imagenet_info relative path; the file is
    # already in GitLab.
    meta_file: 'imagenet_info/val.json'
    image_reader:
      type: pil
    sampler:
      type: distributed
    transforms:
      type: ONECROP
    evaluator:
      type: imagenet
      kwargs:
        topk: [1, 5]
    label_texts_ensemble: 'prompt1'

  img_aug:
    deit_aug: false
    img_size: 224
    img_scale: [0.4, 1.0]
    interpolation: 'bilinear'
    color_jitter: 0.4
    auto_augment: 'rand-m9-mstd0.5-inc1'
    re_prob: 0.25
    re_mode: 'pixel'
    re_count: 1

  text_aug:
    max_seq_len: 77
    # Changed to single-label.
    multi_label: 0
    word_type: 'noun'

model:
  type: MultiLabelContrastive

  img_encoder:
    type: GroupViT
    embed_dim: 768
    num_heads: [8, 8]
    embed_factors: [1, 1]
    depths: [6, 6]
    num_group_tokens: [64, 0]
    num_output_groups: [8]
    drop_rate: 0.0
    drop_path_rate: 0.1
    patch_norm: false
    imgnet_pretrained: 'dino'
    fixed: false

  text_encoder:
    type: Bert
    context_length: 77
    width: 768
    layers: 6
    vocab_size: 49408
    pretrained: true
    fixed: true

  contrast_temperature: 0.07
  proj_num_layers: 2
  output_dim: 256
  # Kept in sync with the text augmentation setting via interpolation.
  multi_label: ${data.text_aug.multi_label}

  use_entityloss: true
  use_maskloss: true
  cross_threshold: 0.6

train:
  epochs: 50
  base_lr: 1.6e-4

checkpoint:
  save_freq: 1

evaluate:
  eval_freq: 1
  seg:
    cfg: 'segmentation/configs/_base_/datasets/ade20k.py'