# debug.yml - debug experiment configuration.
#
# NOTE(review): this file was recovered from a listing in which a line number
# was fused onto every line and all indentation had been stripped. The nesting
# below is a reconstruction:
#   * `data.text_aug.multi_label` and `model.use_entityloss` are fixed by the
#     `${...}` references that point at them.
#   * every other placement follows the most natural grouping; spots that
#     could reasonably sit one level higher or lower are marked "TODO confirm".
# Verify against the code that loads this config before relying on it.

# Presumably the parent config whose values this file overrides - confirm
# how the loader treats `_base_`.
_base_: '../default.yml'
model_name: 'debug'  # display name in the logger
output: /mnt/petrelfs/xujilan/exps/
print_freq: 100

data:
  with_dc: false

  train:
    root_dir:
      - 's3://GCC/GCC12m/'
    meta_file:
      - '/mnt/petrelfs/xujilan/data/cc12m_100/cc4m.json'
    read_from: petrel
    use_dali: true
    batch_size: 256
    input_size: 224
    test_resize: 256
    image_reader:
      type: pil
    sampler:
      type: distributed_epoch
    transforms:
      type: STANDARD
    # TODO confirm: the four keys from `fseek` to `use_distilbert` are placed
    # at `data.train` level rather than under `transforms`.
    fseek: true
    use_ranked: false
    # Mirrors the model-level switch so the two stay in sync.
    use_entity: ${model.use_entityloss}
    mask_type: class
    use_distilbert: true

  val:
    type: clip
    read_from: petrel
    use_dali: true
    batch_size: 64
    num_workers: 4
    pin_memory: false
    input_size: 224
    test_resize: 256
    root_dir: '/mnt/cache/share/images/val/'
    meta_file: 'imagenet_info/val.json'
    # you can change it to imagenet_info relative path, file already in gitlab
    image_reader:
      type: pil
    sampler:
      type: distributed
    transforms:
      type: ONECROP
    evaluator:
      type: imagenet
      kwargs:
        topk: [1, 5]
    # TODO confirm: placed at `data.val` level; it may instead belong under
    # `evaluator` or `evaluator.kwargs`.
    label_texts_ensemble: 'prompt1'

  # TODO confirm: `img_aug` is placed directly under `data`, as a sibling of
  # `text_aug` (whose position is fixed by `${data.text_aug.multi_label}`).
  img_aug:
    deit_aug: true
    img_size: 224
    img_scale: [0.08, 1.0]
    interpolation: 'bilinear'
    # interpolation: 2
    color_jitter: 0.4
    auto_augment: 'rand-m9-mstd0.5-inc1'
    re_prob: 0.25
    re_mode: 'pixel'
    re_count: 1

  text_aug:
    max_seq_len: 77
    multi_label: 0  # we do not use multi-label contrastive
    word_type: 'noun'

model:
  type: MultiLabelContrastive

  img_encoder:
    type: GroupViT
    embed_dim: 768
    num_heads: [8, 8]
    embed_factors: [1, 1]
    depths: [6, 6]
    num_group_tokens: [64, 0]
    num_output_groups: [8]
    drop_rate: 0.0
    drop_path_rate: 0.1
    patch_norm: false
    imgnet_pretrained: 'dino'
    fixed: false
    imgnet_pretrained_checkpoint: '/mnt/petrelfs/xujilan/checkpoints/dino_vitbase16_pretrain.pth'

  text_encoder:
    type: Bert
    context_length: 77
    width: 768
    layers: 6
    vocab_size: 49408
    pretrained: true
    fixed: false

  # TODO confirm: the keys from `contrast_temperature` to `multi_label` are
  # placed at `model` level rather than under `text_encoder`.
  contrast_temperature: 0.07
  proj_num_layers: 2
  output_dim: 256
  multi_label: ${data.text_aug.multi_label}
  # Referenced by `data.train.use_entity` above.
  use_entityloss: true

# TODO confirm: `train`, `checkpoint` and `evaluate` are placed at top level
# rather than under `model`.
train:
  epochs: 30
  base_lr: 6.4e-4
  warmup_lr: 1.6e-5
  min_lr: 1.6e-4

checkpoint:
  save_freq: 1

evaluate:
  eval_freq: 1