---
# group_vit_gcc_yfcc_cuhkpedes_30e.yml
#
# Experiment overrides layered on top of the file named by `_base_`.
# NOTE(review): `_base_` inheritance and `${...}` interpolation are resolved
# by the config loader, which is not part of this file - confirm against it.
_base_: 'default.yml'

# All `data` overrides live in this single mapping. A second top-level `data:`
# key would be a duplicate, which YAML forbids; parsers that tolerate it
# usually keep only the last occurrence and silently drop the earlier one.
data:
  text_aug:
    # Also consumed by model.multi_label below through interpolation.
    multi_label: 3
  # batch_size: 128
  dataset:
    train:
      - cuhkpedes

model:
  type: MultiLabelContrastive
  img_encoder:
    type: GroupViT
    embed_dim: 384
    # The three-element lists below presumably hold one entry per encoder
    # stage - TODO confirm against the GroupViT implementation.
    num_heads: [6, 6, 6]
    depths: [6, 3, 3]
    num_group_tokens: [64, 8, 0]
    num_output_groups: [64, 8]
    drop_rate: 0.0
    drop_path_rate: 0.1
  text_encoder:
    type: TextTransformer
    context_length: 77
    width: 256
    layers: 12
    vocab_size: 49408
  contrast_temperature: 0.07
  proj_num_layers: 2
  output_dim: 256
  # Kept in sync with the data pipeline by reference rather than by copy.
  multi_label: ${data.text_aug.multi_label}