# test_voc_context.yml (2.6 KB)
  # Top-level run settings.
  # NOTE(review): `_base_` presumably names a parent config whose values this
  # file overrides - confirm against the config loader.
  _base_: 'default.yml'
  model_name: 'test_context'  # display name in the logger
  output: /mnt/petrelfs/xujilan/exps/cc12m_100/
  print_freq: 100
  # Dataset, loader and augmentation settings.
  # NOTE(review): the nesting below was reconstructed from a flattened copy of
  # this file. `text_aug` must live under `data` because the model section
  # references `${data.text_aug.multi_label}`; the remaining placements follow
  # key order and should be confirmed against the consuming code.
  data:
    with_dc: false

    train:
      root_dir:
        - 's3://GCC/GCC12m/'
      meta_file:
        - '/mnt/cache/share_data/DSK_datasets/cc12m/subset/cc12m_top100_coconouns.json'
      read_from: petrel
      use_dali: true
      batch_size: 256
      input_size: 224
      test_resize: 256
      image_reader:
        type: pil
      sampler:
        type: distributed_epoch
      transforms:
        type: STANDARD
      # NOTE(review): assumed to be siblings of `transforms` - confirm.
      fseek: true
      use_ranked: false

      # --- for entity loss ---
      # Mirrors the model.use_entityloss switch via interpolation.
      use_entity: ${model.use_entityloss}
      mask_type: class
      use_distilbert: true

      # --- for mask loss ---
      # Mirrors the model.use_maskloss switch via interpolation.
      cross_image: ${model.use_maskloss}

    val:
      type: clip
      read_from: petrel
      use_dali: true
      batch_size: 64
      num_workers: 4
      pin_memory: false
      input_size: 224
      test_resize: 256
      root_dir: '/mnt/cache/share/images/val/'
      # You can change this to the imagenet_info relative path; the file is
      # already in GitLab.
      meta_file: 'imagenet_info/val.json'
      image_reader:
        type: pil
      sampler:
        type: distributed
      transforms:
        type: ONECROP
      evaluator:
        type: imagenet
        kwargs:
          topk: [1, 5]
      # NOTE(review): assumed to be a sibling of `evaluator` rather than an
      # evaluator kwarg - confirm.
      label_texts_ensemble: 'prompt1'

    img_aug:
      deit_aug: false
      img_size: 224
      img_scale: [0.4, 1.0]
      interpolation: 'bilinear'
      color_jitter: 0.4
      auto_augment: 'rand-m9-mstd0.5-inc1'
      re_prob: 0.25
      re_mode: 'pixel'
      re_count: 1

    text_aug:
      max_seq_len: 77
      multi_label: 0  # changed to single-label
      word_type: 'noun'
  # Model definition: image encoder, text encoder and loss switches.
  # NOTE(review): nesting reconstructed from a flattened copy of this file.
  # The scalar options after `text_encoder` are direct children of `model`
  # because the data section references `${model.use_entityloss}` and
  # `${model.use_maskloss}`.
  model:
    type: MultiLabelContrastive

    img_encoder:
      type: GroupViT
      embed_dim: 768
      num_heads: [8, 8]
      embed_factors: [1, 1]
      depths: [6, 6]
      num_group_tokens: [64, 0]
      num_output_groups: [8]
      drop_rate: 0.0
      drop_path_rate: 0.1
      patch_norm: false
      imgnet_pretrained: 'dino'
      fixed: false

    text_encoder:
      type: Bert
      context_length: 77
      width: 768
      layers: 6
      vocab_size: 49408
      pretrained: true
      fixed: true

    contrast_temperature: 0.07
    proj_num_layers: 2
    output_dim: 256
    # Kept in sync with the text augmentation setting via interpolation.
    multi_label: ${data.text_aug.multi_label}
    # Read by data.train.use_entity and data.train.cross_image respectively.
    use_entityloss: true
    use_maskloss: true
    cross_threshold: 0.6
  # Optimisation schedule (top-level `train`, distinct from `data.train`).
  train:
    epochs: 50
    base_lr: 1.6e-4
  # Checkpointing.
  # NOTE(review): the unit of save_freq (presumably epochs) is not visible
  # here - confirm.
  checkpoint:
    save_freq: 1
  # Evaluation settings; `seg.cfg` points at the Pascal Context dataset config.
  # NOTE(review): the unit of eval_freq (presumably epochs) is not visible
  # here - confirm.
  evaluate:
    eval_freq: 1
    seg:
      cfg: segmentation/configs/_base_/datasets/pascal_context.py
      # vis: ['input_pred']