# Model and dataset configuration overrides.
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file; confirm the hierarchy against the consuming code's schema.

# Base configuration file referenced by this one (presumably its values are
# merged first and the keys below override them — verify against the loader).
_base_: 'default.yml'

model:
  type: MultiLabelContrastive

  # Image encoder settings. The per-stage lists have three entries for
  # num_heads, depths and num_group_tokens, and two for num_output_groups.
  img_encoder:
    type: GroupViT
    embed_dim: 384
    num_heads: [6, 6, 6]
    depths: [6, 3, 3]
    num_group_tokens: [64, 8, 0]
    num_output_groups: [64, 8]
    drop_rate: 0.0
    drop_path_rate: 0.1

  # Text encoder settings.
  text_encoder:
    type: TextTransformer
    context_length: 77
    width: 256
    layers: 12
    vocab_size: 49408

  # NOTE(review): the four keys below are assumed to belong to `model` itself
  # rather than to `text_encoder` — confirm.
  contrast_temperature: 0.07
  proj_num_layers: 2
  output_dim: 256
  # Interpolated from data.text_aug.multi_label, which is not defined in this
  # file (presumably supplied by the base config — verify).
  multi_label: ${data.text_aug.multi_label}  # multi_label=0 is better for RedCap

data:
  dataset:
    # Names of the training datasets.
    train:
      - gcc3m
      - gcc12m
      - redcap12m