Files
SimSwapPlus/train_multigpu.py
T
chenxuanhong 99ed65aaa3 update
2022-03-25 18:52:25 +08:00

250 lines
10 KiB
Python

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################
# File: train.py
# Created Date: Tuesday April 28th 2020
# Author: Chen Xuanhong
# Email: chenxuanhongzju@outlook.com
# Last Modified: Thursday, 24th March 2022 2:14:07 pm
# Modified By: Chen Xuanhong
# Copyright (c) 2020 Shanghai Jiao Tong University
#############################################################
import os
import shutil
import argparse
from torch.backends import cudnn
from utilities.json_config import readConfig, writeConfig
from utilities.reporter import Reporter
from utilities.yaml_config import getConfigYaml
def str2bool(v):
    """Convert a command-line string into a boolean.

    Args:
        v: the raw option string (e.g. ``"True"`` or ``"false"``).

    Returns:
        ``True`` only when ``v`` equals ``"true"`` ignoring case; ``False``
        for every other string.
    """
    # Compare for equality. The previous form, ``in ('true')``, tested for a
    # substring of the *string* 'true' (the parentheses do not make a tuple),
    # so inputs such as '', 't' or 'ru' were wrongly treated as True.
    return v.lower() == 'true'
####################################################################################
# Configure the settings for training / finetune / test
#
####################################################################################
def getParameters():
    """Parse the launcher's command-line options from ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``version``, ``tag``,
        ``phase``, ``gpus``, ``ckpt``, ``experiment_description``,
        ``train_yaml`` and ``logger``.
    """
    # Each entry pairs the option strings with the keyword arguments handed to
    # ``add_argument``; options are registered in the order listed here.
    option_specs = [
        # ---- general settings ----
        (('-v', '--version'),
         dict(type=str, default='cycle_res3',
              help="version name for train, test, finetune")),
        (('-t', '--tag'),
         dict(type=str, default='cycle',
              help="tag for current experiment")),
        (('-p', '--phase'),
         dict(type=str, default="train",
              choices=['train', 'finetune','debug'],
              help="The phase of current project")),
        # One or more GPU ids. Original author's note: a negative id (-1) is
        # meant to select the CPU -- nothing in this parser enforces that.
        (('-c', '--gpus'),
         dict(type=int, nargs='+', default=[0,1,2,3])),
        (('-e', '--ckpt'),
         dict(type=int, default=74,
              help="checkpoint epoch for test phase or finetune phase")),
        # ---- training ----
        (('--experiment_description',),
         dict(type=str,
              default="cycle配合残差decoder,改用starganv2的generator结构")),
        (('--train_yaml',),
         dict(type=str, default="train_cycleloss_res.yaml")),
        # ---- system logger ----
        (('--logger',),
         dict(type=str, default="wandb",
              choices=['tensorboard', 'wandb','none'], help='system logger')),
    ]

    # Disabled options kept from the original file for reference:
    #   logs (rarely need changing):
    #     --dataloader_workers (int, default 6)
    #     --use_tensorboard    (str2bool, default 'True', choices 'True'/'False')
    #     --log_step           (int, default 100)
    #     --sample_step        (int, default 100)
    #   templates for new options (delete once editing is finished):
    #     --str_parameter         (str, default "default")
    #     --str_parameter_choices (str, choices 'choice1'/'choice2'/'choice3')
    #     --int_parameter         (int, default 0)
    #     --float_parameter       (float, default 0.0)
    #     --bool_parameter        (str2bool, default 'True')
    #     --list_str_parameter    (str, nargs='+', default ["element1","element2"])
    #     --list_int_parameter    (int, nargs='+', default [0,1])

    cli = argparse.ArgumentParser()
    for flags, spec in option_specs:
        cli.add_argument(*flags, **spec)
    return cli.parse_args()
# Keys of a previously saved training config that are NOT copied back into
# the run state when the finetune phase reloads that config (see the
# membership test in main()). Each key is listed exactly once.
ignoreKey = [
    "dataloader_workers",
    "log_root_path",
    "project_root",
    "project_summary",
    "project_checkpoints",
    "project_samples",
    "project_scripts",
    "reporter_path",
    "use_specified_data",
    "specified_data_paths",
    "dataset_path",
    "cuda",
    "test_script_name",
    "test_dataloader",
    "test_dataset_path",
    "save_test_result",
    "test_batch_size",
    "node_name",
    "checkpoint_epoch",
    "test_dataset_name",
    "use_my_test_date",
]
####################################################################################
# This function creates the related directories before the
# training / finetune / test starts
# Your_log_root (version name)
# |---summary/...
# |---samples/... (save evaluated images)
# |---checkpoints/...
# |---scripts/...
#
####################################################################################
def createDirs(sys_state):
    """Create the log directory tree for one version and record its paths.

    The following layout is created (existing directories are reused)::

        <log_root_path>/
            <version>/
                summary/
                checkpoints/
                samples/
                scripts/

    Args:
        sys_state: mutable dict that must already contain ``"log_root_path"``
            and ``"version"``. It is updated in place with ``"project_root"``,
            ``"project_summary"``, ``"project_checkpoints"``,
            ``"project_samples"``, ``"project_scripts"`` and
            ``"reporter_path"`` (the latter is only a path; nothing is
            created for it here).

    Raises:
        KeyError: if ``"log_root_path"`` or ``"version"`` is missing.
        FileExistsError: if a target path exists but is not a directory.
    """
    # ``exist_ok=True`` replaces the check-then-create pattern, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(sys_state["log_root_path"], exist_ok=True)

    project_root = os.path.join(sys_state["log_root_path"], sys_state["version"])
    sys_state["project_root"] = project_root
    os.makedirs(project_root, exist_ok=True)

    # One sub-directory per artifact kind; each path is stored under the key
    # "project_<name>".
    for sub_dir in ("summary", "checkpoints", "samples", "scripts"):
        sub_path = os.path.join(project_root, sub_dir)
        sys_state["project_" + sub_dir] = sub_path
        os.makedirs(sub_path, exist_ok=True)

    sys_state["reporter_path"] = os.path.join(
        project_root, sys_state["version"] + "_report")
def main():
    """Launch a training or finetune run from the command line.

    Steps, in order:
      1. Parse the CLI options, enable cuDNN benchmarking and restrict the
         visible GPUs through ``CUDA_VISIBLE_DEVICES``.
      2. Read ``env/env.json`` and use its ``"path"`` section.
      3. Build ``sys_state`` (a plain dict) from the CLI options plus either
         the training yaml (phase ``train``) or the config json saved by a
         previous run (phase ``finetune``), and create the log directories.
      4. Import the trainer module, build ``Trainer(sys_state, reporter)``
         and call ``train()``.

    Raises:
        NotImplementedError: if the selected phase has no preparation branch
            here (currently ``debug``, which the parser accepts).
    """
    # Local import, matching the function-scope import style used below.
    import importlib

    config = getParameters()

    # speed up the program
    cudnn.benchmark = True
    cudnn.enabled = True

    from utilities.logo_class import logo_class
    logo_class.print_group_logo()

    # set the GPU number
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(i) for i in config.gpus)

    # read system environment paths
    env_config = readConfig('env/env.json')["path"]

    # start from all configurations given through argparse
    sys_state = dict(vars(config))

    #=======================Train Phase=========================#
    if config.phase == "train":
        # read training configurations from yaml file
        yaml_path = os.path.join(env_config["train_config_path"], config.train_yaml)
        yaml_config = getConfigYaml(yaml_path)
        sys_state.update(yaml_config.items())

        # create related dirs
        sys_state["log_root_path"] = env_config["train_log_root"]
        createDirs(sys_state)

        # create reporter file
        reporter = Reporter(sys_state["reporter_path"])

        # save the config json
        config_json = os.path.join(sys_state["project_root"], env_config["config_json_name"])
        writeConfig(config_json, sys_state)

        # save the trainer script into [train_logs_root]/[version name]/scripts/
        trainer_file = "trainer_%s.py" % sys_state["train_script_name"]
        shutil.copyfile(os.path.join(env_config["train_scripts_path"], trainer_file),
                        os.path.join(sys_state["project_scripts"], trainer_file))

        # save the yaml file next to it
        shutil.copyfile(yaml_path,
                        os.path.join(sys_state["project_scripts"], config.train_yaml))
        # TODO: copy the remaining critical/dependent scripts to the project dir

    #=====================Finetune Phase=====================#
    elif config.phase == "finetune":
        sys_state["log_root_path"] = env_config["train_log_root"]
        sys_state["project_root"] = os.path.join(sys_state["log_root_path"], sys_state["version"])
        config_json = os.path.join(sys_state["project_root"], env_config["config_json_name"])
        train_config = readConfig(config_json)
        # NOTE(review): every saved key that is not in ignoreKey overwrites
        # the value parsed from the command line. If the saved json contains
        # CLI options such as "phase" or "ckpt", the values given for this
        # run are replaced -- confirm this is intended.
        for key, value in train_config.items():
            if key not in ignoreKey:
                sys_state[key] = value
        createDirs(sys_state)
        reporter = Reporter(sys_state["reporter_path"])

    else:
        # The parser accepts this phase, but nothing here prepares
        # ``reporter`` or "train_script_name" for it. Fail with a clear
        # message instead of an unrelated KeyError/NameError further down.
        raise NotImplementedError(
            "phase %r is not supported by this launcher" % config.phase)

    sys_state["com_base"] = "train_logs.%s.scripts." % sys_state["version"]

    # get the dataset path
    sys_state["dataset_paths"] = {
        data_key: env_config["dataset_paths"][data_key]
        for data_key in env_config["dataset_paths"].keys()
    }

    # Finetune loads the trainer copy stored with the run's scripts (via
    # com_base); train loads it from the train_scripts package.
    if config.phase == "finetune":
        module_name = sys_state["com_base"] + "trainer_" + sys_state["train_script_name"]
    else:
        module_name = "train_scripts.trainer_" + sys_state["train_script_name"]

    # display the training information
    # TODO: print a concise summary (module name, version, dataloader,
    # batch size, GPUs) instead of dumping the whole state.
    print("\n========================================================================\n")
    print(sys_state)
    print("\n========================================================================\n")

    # Load the training script and start to train
    reporter.writeConfig(sys_state)
    # import_module returns the leaf module for a dotted name; the previous
    # ``__import__(name, fromlist=True)`` passed a bool where a sequence of
    # names is expected.
    package = importlib.import_module(module_name)
    trainer_class = getattr(package, 'Trainer')
    trainer = trainer_class(sys_state, reporter)
    trainer.train()
# Script entry point: start the launcher only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()