# group_vit_gcc_redcap_30e.yml
#
# Experiment config layered on top of the file named by `_base_`.
# NOTE(review): this file was restored from a flattened listing in which all
# indentation had been lost. The nesting below is reconstructed: the two
# `type` keys force `img_encoder` / `text_encoder` to be separate mappings,
# and `${data...}` implies `data` is a top-level key. Placement of the four
# keys after `text_encoder` directly under `model` is inferred -- confirm
# against the consumer's schema.
_base_: 'default.yml'

model:
  type: MultiLabelContrastive
  img_encoder:
    type: GroupViT
    embed_dim: 384
    # The list-valued settings below presumably hold one entry per encoder
    # stage -- TODO confirm; note num_output_groups has two entries, not three.
    num_heads: [6, 6, 6]
    depths: [6, 3, 3]
    num_group_tokens: [64, 8, 0]
    num_output_groups: [64, 8]
    drop_rate: 0.0
    drop_path_rate: 0.1
  text_encoder:
    type: TextTransformer
    context_length: 77
    width: 256
    layers: 12
    vocab_size: 49408
  contrast_temperature: 0.07
  proj_num_layers: 2
  output_dim: 256
  # `${...}` is resolved by the config loader, not by YAML itself; it refers
  # to data.text_aug.multi_label, which is not defined in this file
  # (presumably supplied by the `_base_` config -- verify).
  multi_label: ${data.text_aug.multi_label}  # multi_label=0 is better for RedCap

data:
  dataset:
    # Names of the training datasets used by this experiment.
    train:
      - gcc3m
      - gcc12m
      - redcap12m