# train.py
  1. import os
  2. import os.path as op
  3. import torch
  4. import numpy as np
  5. import random
  6. import time
  7. from datasets import build_dataloader
  8. from processor.processor import do_train
  9. from utils.checkpoint import Checkpointer
  10. from utils.iotools import save_train_configs
  11. from utils.logger import setup_logger
  12. from solver import build_optimizer, build_lr_scheduler
  13. from model import build_model
  14. from utils.metrics import Evaluator
  15. from utils.options import get_args
  16. from utils.comm import get_rank, synchronize
  17. def set_seed(seed=0):
  18. torch.manual_seed(seed)
  19. torch.cuda.manual_seed(seed)
  20. torch.cuda.manual_seed_all(seed)
  21. np.random.seed(seed)
  22. random.seed(seed)
  23. torch.backends.cudnn.deterministic = True
  24. torch.backends.cudnn.benchmark = True
  25. if __name__ == '__main__':
  26. args = get_args()
  27. set_seed(1+get_rank())
  28. name = args.name
  29. num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
  30. args.distributed = num_gpus > 1
  31. if args.distributed:
  32. torch.cuda.set_device(args.local_rank)
  33. torch.distributed.init_process_group(backend="nccl", init_method="env://")
  34. synchronize()
  35. device = "cuda"
  36. cur_time = time.strftime("%Y%m%d_%H%M%S", time.localtime())
  37. args.output_dir = op.join(args.output_dir, args.dataset_name, f'{cur_time}_{name}')
  38. logger = setup_logger('IRRA', save_dir=args.output_dir, if_train=args.training, distributed_rank=get_rank())
  39. logger.info("Using {} GPUs".format(num_gpus))
  40. logger.info(str(args).replace(',', '\n'))
  41. save_train_configs(args.output_dir, args)
  42. # get image-text pair datasets dataloader
  43. train_loader, val_img_loader, val_txt_loader, num_classes = build_dataloader(args)
  44. model = build_model(args, num_classes)
  45. logger.info('Total params: %2.fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
  46. model.to(device)
  47. if args.distributed:
  48. model = torch.nn.parallel.DistributedDataParallel(
  49. model,
  50. device_ids=[args.local_rank],
  51. output_device=args.local_rank,
  52. # this should be removed if we update BatchNorm stats
  53. broadcast_buffers=False,
  54. )
  55. optimizer = build_optimizer(args, model)
  56. scheduler = build_lr_scheduler(args, optimizer)
  57. is_master = get_rank() == 0
  58. checkpointer = Checkpointer(model, optimizer, scheduler, args.output_dir, is_master)
  59. evaluator = Evaluator(val_img_loader, val_txt_loader)
  60. start_epoch = 1
  61. if args.resume:
  62. checkpoint = checkpointer.resume(args.resume_ckpt_file)
  63. start_epoch = checkpoint['epoch']
  64. do_train(start_epoch, args, model, train_loader, evaluator, optimizer, scheduler, checkpointer)