---
# Experiment configuration for the `test_context` run.
#
# Values here are layered on top of the file named in `_base_`
# (presumably merged by the config loader -- verify against the loader).
# `${a.b.c}` entries are interpolations that point at other keys in this file.
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file. The interpolation paths used in the file (`model.use_entityloss`,
# `model.use_maskloss`, `data.text_aug.multi_label`) fix the position of
# `model` and `data.text_aug`; the remaining grouping follows key order and
# should be confirmed against `default.yml`.

_base_: 'default.yml'
model_name: 'test_context'  # display name in the logger
output: /mnt/petrelfs/xujilan/exps/cc12m_100/
print_freq: 100

data:
  with_dc: false

  train:
    root_dir:
      - 's3://GCC/GCC12m/'
    meta_file:
      - '/mnt/cache/share_data/DSK_datasets/cc12m/subset/cc12m_top100_coconouns.json'
    read_from: petrel
    use_dali: true
    batch_size: 256
    input_size: 224
    test_resize: 256
    image_reader:
      type: pil
    sampler:
      type: distributed_epoch
    transforms:
      type: STANDARD
    fseek: true
    use_ranked: false
    # Entity loss: enabled together with `model.use_entityloss`.
    use_entity: ${model.use_entityloss}
    mask_type: class
    use_distilbert: true
    # Mask loss: enabled together with `model.use_maskloss`.
    cross_image: ${model.use_maskloss}

  val:
    type: clip
    read_from: petrel
    use_dali: true
    batch_size: 64
    num_workers: 4
    pin_memory: false
    input_size: 224
    test_resize: 256

    root_dir: '/mnt/cache/share/images/val/'
    # you can change it to imagenet_info relative path, file already in gitlab
    meta_file: 'imagenet_info/val.json'
    image_reader:
      type: pil
    sampler:
      type: distributed
    transforms:
      type: ONECROP
    evaluator:
      type: imagenet
      kwargs:
        topk: [1, 5]
    # NOTE(review): placed as a sibling of `evaluator`; confirm it is not
    # meant to live inside `evaluator` or `evaluator.kwargs`.
    label_texts_ensemble: 'prompt1'

  img_aug:
    deit_aug: false
    img_size: 224
    img_scale: [0.4, 1.0]
    interpolation: 'bilinear'
    color_jitter: 0.4
    auto_augment: 'rand-m9-mstd0.5-inc1'
    re_prob: 0.25
    re_mode: 'pixel'
    re_count: 1

  text_aug:
    max_seq_len: 77
    multi_label: 0  # changed to single-label
    word_type: 'noun'

model:
  type: MultiLabelContrastive
  img_encoder:
    type: GroupViT
    embed_dim: 768
    num_heads: [8, 8]
    embed_factors: [1, 1]
    depths: [6, 6]
    num_group_tokens: [64, 0]
    num_output_groups: [8]
    drop_rate: 0.0
    drop_path_rate: 0.1
    patch_norm: false
    imgnet_pretrained: 'dino'
    fixed: false
  text_encoder:
    type: Bert
    context_length: 77
    width: 768
    layers: 6
    vocab_size: 49408
    pretrained: true
    fixed: true
  contrast_temperature: 0.07
  proj_num_layers: 2
  output_dim: 256
  # Kept in sync with the text augmentation setting above.
  multi_label: ${data.text_aug.multi_label}

  # These two flags are also read by `data.train.use_entity` and
  # `data.train.cross_image` through interpolation.
  use_entityloss: true
  use_maskloss: true
  cross_threshold: 0.6

train:
  epochs: 50
  base_lr: 1.6e-4

# NOTE(review): `checkpoint` and `evaluate` are placed at top level; confirm
# against `default.yml` that they are not children of `train`.
checkpoint:
  save_freq: 1

evaluate:
  eval_freq: 1
  seg:
    cfg: segmentation/configs/_base_/datasets/pascal_context.py
    # vis: ['input_pred']