
refactor: update dataset paths and model configuration

- Changed the dataset path and the BPE vocabulary path
- Adjusted the num_workers parameter
- Removed some redundant code
- Updated the model-training configuration
Yijun Fu, 1 month ago
Parent commit: fd5498a4f0
3 files changed, 25 additions and 23 deletions
  1. configs/default.yml (+4, -3)
  2. datasets/bases.py (+2, -1)
  3. main_group_vit.py (+19, -19)

+ 4 - 3
configs/default.yml

@@ -1,12 +1,12 @@
 data:
   batch_size: 256
   pin_memory: true
-  num_workers: 6
+  num_workers: 1
   # Thomas said it should be at least about 5-10x your batch size; beyond that,
   # the differences become academic.
   shuffle_buffer: 10000
   seed: ${train.seed}
-  bpe_path: /home/linkslinks/文档/ai/GroupViT/datasets/bpe_simple_vocab_16e6.txt.gz
+  bpe_path: /mnt/vos-s9gjtkm2/reid/groupvit/GroupViT/datasets/bpe_simple_vocab_16e6.txt.gz
   dataset:
     meta:
       gcc3m:
@@ -41,7 +41,8 @@ data:
         prefix: cuhkpedes-train-{000000..000004}.tar
         length: 34054
       cuhkpedes_val:
-        raw_path: /home/linkslinks/dataset/
+        # raw_path: /home/linkslinks/dataset/
+        raw_path: /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/
         name: CUHK-PEDES
         type: img_txt_pair
         path: local_data/cuhkpedes_shards
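
Note: the ${train.seed} interpolation above suggests an OmegaConf-style config. A minimal sketch, assuming configs/default.yml is loaded via OmegaConf, of supplying machine-specific values such as data.bpe_path and data.num_workers as overrides at launch time instead of editing the tracked file (keys and paths taken from the diff above):

# Sketch only: assumes configs/default.yml is consumed via OmegaConf,
# as the ${train.seed} interpolation suggests.
from omegaconf import OmegaConf

cfg = OmegaConf.load('configs/default.yml')

# Machine-specific overrides supplied at launch time rather than edits
# to the tracked config file.
overrides = OmegaConf.from_dotlist([
    'data.num_workers=1',
    'data.bpe_path=/mnt/vos-s9gjtkm2/reid/groupvit/GroupViT/datasets/bpe_simple_vocab_16e6.txt.gz',
])
cfg = OmegaConf.merge(cfg, overrides)
print(cfg.data.num_workers)  # -> 1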

+ 2 - 1
datasets/bases.py

@@ -117,7 +117,8 @@ class TextDataset(Dataset):
         self.captions = captions
         self.text_length = text_length
         self.truncate = truncate
-        self.tokenizer = SimpleTokenizer(bpe_path="/home/linkslinks/文档/ai/GroupViT/datasets/bpe_simple_vocab_16e6.txt.gz")
+        # self.tokenizer = SimpleTokenizer(bpe_path="/home/linkslinks/文档/ai/GroupViT/datasets/bpe_simple_vocab_16e6.txt.gz")
+        self.tokenizer = SimpleTokenizer(bpe_path="/mnt/vos-s9gjtkm2/reid/groupvit/GroupViT/datasets/bpe_simple_vocab_16e6.txt.gz")
 
     def __len__(self):
         return len(self.caption_pids)
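
The change above keeps the BPE path hardcoded inside TextDataset, only swapping one absolute path for another. A minimal sketch, assuming the caller already has the loaded config, of threading data.bpe_path through the constructor instead; the signature and import path below are hypothetical, not the repo's current API:

# Sketch only: hypothetical constructor that takes bpe_path from the config
# (cfg.data.bpe_path) rather than embedding an absolute path in the class.
from torch.utils.data import Dataset
from .tokenizer import SimpleTokenizer  # import path is an assumption

class TextDataset(Dataset):
    def __init__(self, caption_pids, captions, bpe_path, text_length=77, truncate=True):
        self.caption_pids = caption_pids
        self.captions = captions
        self.text_length = text_length
        self.truncate = truncate
        # Path comes from the config instead of a hardcoded literal.
        self.tokenizer = SimpleTokenizer(bpe_path=bpe_path)

    def __len__(self):
        return len(self.caption_pids)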

+ 19 - 19
main_group_vit.py

@@ -121,21 +121,21 @@ def train(cfg):
     logger.info(f'Creating model:{cfg.model.type}/{cfg.model_name}')
     model = build_model(cfg.model)
     
-    # load_checkpoint(cfg, model, None, None)
+    # # load_checkpoint(cfg, model, None, None)
     
-        # Freeze all layers
-    for param in model.parameters():
-        param.requires_grad = False
+    #     # Freeze all layers
+    # for param in model.parameters():
+    #     param.requires_grad = False
         
-    # If you only want to freeze specific layers, you can do it as follows,
-    # e.g., leave only the img_projector layers trainable
-    for param in model.img_projector.parameters():
-        param.requires_grad = True
-
-    # If you only want to freeze specific layers, you can do it as follows,
-    # e.g., leave only the text_projector layers trainable
-    for param in model.text_projector.parameters():
-        param.requires_grad = True
+    # # If you only want to freeze specific layers, you can do it as follows,
+    # # e.g., leave only the img_projector layers trainable
+    # for param in model.img_projector.parameters():
+    #     param.requires_grad = True
+
+    # # If you only want to freeze specific layers, you can do it as follows,
+    # # e.g., leave only the text_projector layers trainable
+    # for param in model.text_projector.parameters():
+    #     param.requires_grad = True
     
     model.cuda()
     logger.info(str(model))
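
For reference, the block commented out above implements a partial fine-tuning scheme: freeze the whole model, then re-enable gradients only for the two projector heads. A minimal sketch of that pattern, with an optimizer line added for illustration (the optimizer choice and learning rate are not part of this commit):

# Sketch only: freeze everything, keep only the projector heads trainable,
# and hand the optimizer just the trainable parameters.
import torch

def freeze_all_but_projectors(model, lr=1e-4):
    for param in model.parameters():
        param.requires_grad = False           # freeze the full model
    for param in model.img_projector.parameters():
        param.requires_grad = True            # train the image projector
    for param in model.text_projector.parameters():
        param.requires_grad = True            # train the text projector
    trainable = [p for p in model.parameters() if p.requires_grad]
    return torch.optim.AdamW(trainable, lr=lr)  # lr value is illustrative
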
@@ -266,7 +266,7 @@ def train_one_epoch(config, model, data_loader, optimizer, epoch, lr_scheduler):
     start = time.time()
     end = time.time()
     for idx, samples in enumerate(data_loader):
-        print('\n\n1\n\n')
+        # print('\n\n1\n\n')
 
         batch_size = config.data.batch_size
 
@@ -312,7 +312,7 @@ def train_one_epoch(config, model, data_loader, optimizer, epoch, lr_scheduler):
             lr_scheduler.step_update(epoch * num_steps + idx)
 
         torch.cuda.synchronize()
-        print('\n\n2\n\n')
+        # print('\n\n2\n\n')
 
         loss_meter.update(loss.item(), batch_size)
         for loss_name in log_vars:
@@ -320,10 +320,10 @@ def train_one_epoch(config, model, data_loader, optimizer, epoch, lr_scheduler):
         norm_meter.update(grad_norm)
         batch_time.update(time.time() - end)
         end = time.time()
-        print('\n\n3\n\n')
+        # print('\n\n3\n\n')
 
         if idx % config.print_freq == 0:
-            print('\n\n4\n\n')
+            # print('\n\n4\n\n')
             lr = optimizer.param_groups[0]['lr']
             memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
             etas = batch_time.avg * (num_steps - idx)
@@ -335,14 +335,14 @@ def train_one_epoch(config, model, data_loader, optimizer, epoch, lr_scheduler):
                         f'{log_vars_str}\t'
                         f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t'
                         f'mem {memory_used:.0f}MB')
-            print('\n\n5\n\n')
+            # print('\n\n5\n\n')
             if wandb is not None:
                 log_stat = {f'iter/train_{n}': m.avg for n, m in log_vars_meters.items()}
                 log_stat['iter/train_total_loss'] = loss_meter.avg
                 log_stat['iter/learning_rate'] = lr
                 wandb.log(log_stat)
 
-    print('\n\n6\n\n')
+    # print('\n\n6\n\n')
     epoch_time = time.time() - start
     logger.info(f'EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}')
     result_dict = dict(total_loss=loss_meter.avg)
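
The per-step print statements are commented out here rather than deleted. A minimal sketch of an alternative, routing the same trace points through Python's logging module at DEBUG level so they can be toggled via the log level instead of by editing the loop (logger setup shown only to make the sketch self-contained):

# Sketch only: debug-level tracing that stands in for the numbered print() calls.
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)  # switch to logging.DEBUG to see the trace

for idx in range(3):  # stand-in for `for idx, samples in enumerate(data_loader)`
    logger.debug('step %d: batch loaded', idx)         # was print('\n\n1\n\n')
    logger.debug('step %d: optimizer step done', idx)  # was print('\n\n2\n\n')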