# group_vit_gcc_yfcc_30e.yml
#
# Only the keys listed here are set by this file; everything else is expected
# to come from the file named in `_base_`.
# NOTE(review): `_base_` merge semantics are defined by the consuming config
# loader, not by YAML itself -- confirm against the loader.
_base_: 'default.yml'

data:
  text_aug:
    # Referenced by `model.multi_label` below through `${...}` interpolation.
    multi_label: 3

model:
  type: MultiLabelContrastive

  img_encoder:
    type: GroupViT
    embed_dim: 384
    # The three lists below each have three entries, while
    # `num_output_groups` has two.
    # NOTE(review): presumably one entry per encoder stage -- confirm.
    num_heads: [6, 6, 6]
    depths: [6, 3, 3]
    num_group_tokens: [64, 8, 0]
    num_output_groups: [64, 8]
    drop_rate: 0.0
    drop_path_rate: 0.1

  text_encoder:
    type: TextTransformer
    context_length: 77
    width: 256
    layers: 12
    vocab_size: 49408

  # NOTE(review): the four keys below are placed directly under `model`
  # (siblings of the two encoders), not under `text_encoder` -- confirm
  # against the model's constructor.
  contrast_temperature: 0.07
  proj_num_layers: 2
  output_dim: 256
  # Interpolation: takes its value from `data.text_aug.multi_label` above, so
  # the two settings cannot drift apart. Requires a loader that resolves
  # `${...}` references; plain YAML reads this as a literal string.
  multi_label: ${data.text_aug.multi_label}