SimSwapPlus/train_distillation_mgpu.py
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################
# File: train.py
# Created Date: Tuesday April 28th 2020
# Author: Chen Xuanhong
# Email: chenxuanhongzju@outlook.com
# Last Modified: Wednesday, 23rd February 2022 2:30:03 am
# Modified By: Chen Xuanhong
# Copyright (c) 2020 Shanghai Jiao Tong University
#############################################################
import os
import shutil
import argparse
from torch.backends import cudnn
from utilities.json_config import readConfig, writeConfig
from utilities.reporter import Reporter
from utilities.yaml_config import getConfigYaml
def str2bool(v):
    # membership test against a tuple, not a substring check
    return v.lower() in ('true',)
####################################################################################
# To configure the settings of training / finetune / test
#
####################################################################################
def getParameters():
    parser = argparse.ArgumentParser()

    # general settings
    parser.add_argument('-v', '--version', type=str, default='distillation',
                        help="version name for train, test, finetune")
    parser.add_argument('-t', '--tag', type=str, default='distillation',
                        help="tag for the current experiment")
    parser.add_argument('-p', '--phase', type=str, default="train",
                        choices=['train', 'finetune', 'debug'],
                        help="the phase of the current project")
    parser.add_argument('-c', '--gpus', type=int, nargs='+', default=[0, 1])  # if set to -1 (< 0), the program will use the CPU
    parser.add_argument('-e', '--ckpt', type=int, default=74,
                        help="checkpoint epoch for the test or finetune phase")

    # training
    parser.add_argument('--experiment_description', type=str,
                        default="test the distillation code")
    parser.add_argument('--train_yaml', type=str, default="train_distillation.yaml")

    # system logger
    parser.add_argument('--logger', type=str,
                        default="none", choices=['tensorboard', 'wandb', 'none'], help='system logger')

    # # logs (does not need to be changed most of the time)
    # parser.add_argument('--dataloader_workers', type=int, default=6)
    # parser.add_argument('--use_tensorboard', type=str2bool, default='True',
    #                     choices=['True', 'False'], help='enable the tensorboard')
    # parser.add_argument('--log_step', type=int, default=100)
    # parser.add_argument('--sample_step', type=int, default=100)

    # # template (once editing is finished, it should be deleted)
    # parser.add_argument('--str_parameter', type=str, default="default", help='str parameter')
    # parser.add_argument('--str_parameter_choices', type=str,
    #                     default="default", choices=['choice1', 'choice2', 'choice3'], help='str parameter with choices list')
    # parser.add_argument('--int_parameter', type=int, default=0, help='int parameter')
    # parser.add_argument('--float_parameter', type=float, default=0.0, help='float parameter')
    # parser.add_argument('--bool_parameter', type=str2bool, default='True', choices=['True', 'False'], help='bool parameter')
    # parser.add_argument('--list_str_parameter', type=str, nargs='+', default=["element1", "element2"], help='str list parameter')
    # parser.add_argument('--list_int_parameter', type=int, nargs='+', default=[0, 1], help='int list parameter')

    return parser.parse_args()
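
# Example invocation (a sketch: only the flags are defined in getParameters() above,
# the concrete values below are hypothetical):
#
#   python train_distillation_mgpu.py -v distillation -t my_experiment -p train -c 0 1 \
#       --train_yaml train_distillation.yaml --logger tensorboard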
ignoreKey = [
    "dataloader_workers",
    "log_root_path",
    "project_root",
    "project_summary",
    "project_checkpoints",
    "project_samples",
    "project_scripts",
    "reporter_path",
    "use_specified_data",
    "specified_data_paths",
    "dataset_path",
    "cuda",
    "test_script_name",
    "test_dataloader",
    "test_dataset_path",
    "save_test_result",
    "test_batch_size",
    "node_name",
    "checkpoint_epoch",
    "test_dataset_name",
    "use_my_test_date",
]
####################################################################################
# This function creates the related directories before
# training / finetune / test starts:
# Your_log_root (version name)
#   |---summary/...
#   |---samples/...      (save evaluated images)
#   |---checkpoints/...
#   |---scripts/...
#
####################################################################################
def createDirs(sys_state):
    # the base dir
    if not os.path.exists(sys_state["log_root_path"]):
        os.makedirs(sys_state["log_root_path"])

    # create dirs
    sys_state["project_root"] = os.path.join(sys_state["log_root_path"],
                                             sys_state["version"])

    project_root = sys_state["project_root"]
    if not os.path.exists(project_root):
        os.makedirs(project_root)

    sys_state["project_summary"] = os.path.join(project_root, "summary")
    if not os.path.exists(sys_state["project_summary"]):
        os.makedirs(sys_state["project_summary"])

    sys_state["project_checkpoints"] = os.path.join(project_root, "checkpoints")
    if not os.path.exists(sys_state["project_checkpoints"]):
        os.makedirs(sys_state["project_checkpoints"])

    sys_state["project_samples"] = os.path.join(project_root, "samples")
    if not os.path.exists(sys_state["project_samples"]):
        os.makedirs(sys_state["project_samples"])

    sys_state["project_scripts"] = os.path.join(project_root, "scripts")
    if not os.path.exists(sys_state["project_scripts"]):
        os.makedirs(sys_state["project_scripts"])

    sys_state["reporter_path"] = os.path.join(project_root, sys_state["version"] + "_report")
def fetch_teacher_files(sys_state, env_config):
    version = sys_state["teacher_model"]["version"]
    if not os.path.exists(sys_state["log_root_path"]):
        os.makedirs(sys_state["log_root_path"])

    # create dirs
    sys_state["teacher_model"]["project_root"] = os.path.join(sys_state["log_root_path"], version)

    project_root = sys_state["teacher_model"]["project_root"]
    if not os.path.exists(project_root):
        os.makedirs(project_root)

    sys_state["teacher_model"]["project_checkpoints"] = os.path.join(project_root, "checkpoints")
    if not os.path.exists(sys_state["teacher_model"]["project_checkpoints"]):
        os.makedirs(sys_state["teacher_model"]["project_checkpoints"])

    sys_state["teacher_model"]["project_scripts"] = os.path.join(project_root, "scripts")
    if not os.path.exists(sys_state["teacher_model"]["project_scripts"]):
        os.makedirs(sys_state["teacher_model"]["project_scripts"])

    if sys_state["teacher_model"]["node_ip"] != "localhost":
        from utilities.sshupload import fileUploaderClass
        machine_config = env_config["machine_config"]
        machine_config = readConfig(machine_config)
        nodeinf = None
        for item in machine_config:
            if item["ip"] == sys_state["teacher_model"]["node_ip"]:
                nodeinf = item
                break
        if not nodeinf:
            raise Exception("Configuration of node %s is unavailable!" % sys_state["teacher_model"]["node_ip"])
        print("ready to fetch related files from server: %s ......" % nodeinf["ip"])
        uploader = fileUploaderClass(nodeinf["ip"], nodeinf["user"], nodeinf["passwd"])

        remotebase = os.path.join(nodeinf['path'], "train_logs", version).replace('\\', '/')

        # Get the config.json
        print("ready to get the teacher's config.json...")
        remoteFile = os.path.join(remotebase, env_config["config_json_name"]).replace('\\', '/')
        localFile = os.path.join(project_root, env_config["config_json_name"])

        ssh_state = uploader.sshScpGet(remoteFile, localFile)
        if not ssh_state:
            raise Exception("Getting file %s failed! config.json does not exist!" % remoteFile)
        print("successfully got the teacher's config.json from server %s" % nodeinf['ip'])

        # Get scripts
        remoteDir = os.path.join(remotebase, "scripts").replace('\\', '/')
        localDir = os.path.join(sys_state["teacher_model"]["project_scripts"])

        ssh_state = uploader.sshScpGetDir(remoteDir, localDir)
        if not ssh_state:
            raise Exception("Getting directory %s failed! Program exits!" % remoteDir)
        print("Got the teacher's scripts successfully!")

    # Read model_config.json
    config_json = os.path.join(project_root, env_config["config_json_name"])
    json_obj = readConfig(config_json)
    for key, value in json_obj.items():
        if key not in ignoreKey:
            sys_state["teacher_model"][key] = value

    # Get checkpoints
    if sys_state["teacher_model"]["node_ip"] != "localhost":
        ckpt_name = "step%d_%s.pth" % (sys_state["teacher_model"]["checkpoint_step"],
                                       sys_state["teacher_model"]["checkpoint_names"]["generator_name"])
        localFile = os.path.join(sys_state["teacher_model"]["project_checkpoints"], ckpt_name)
        if not os.path.exists(localFile):
            remoteFile = os.path.join(remotebase, "checkpoints", ckpt_name).replace('\\', '/')
            ssh_state = uploader.sshScpGet(remoteFile, localFile, True)
            if not ssh_state:
                raise Exception("Getting file %s failed! Checkpoint file does not exist!" % remoteFile)
            print("Got the teacher's checkpoint %s successfully!" % ckpt_name)
        else:
            print("%s already exists!" % ckpt_name)
def main():
    config = getParameters()

    # speed up the program
    cudnn.benchmark = True
    cudnn.enabled = True

    from utilities.logo_class import logo_class
    logo_class.print_group_logo()

    sys_state = {}

    # set the GPU number
    gpus = [str(i) for i in config.gpus]
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpus)

    # read system environment paths
    env_config = readConfig('env/env.json')
    env_config = env_config["path"]

    # obtain all configurations from argparse
    config_dic = vars(config)
    for config_key in config_dic.keys():
        sys_state[config_key] = config_dic[config_key]

    #=======================Train Phase=========================#
    if config.phase == "train":
        # read training configurations from the yaml file
        yaml_config = getConfigYaml(os.path.join(env_config["train_config_path"], config.train_yaml))
        for item in yaml_config.items():
            sys_state[item[0]] = item[1]

        # create related dirs
        sys_state["log_root_path"] = env_config["train_log_root"]
        createDirs(sys_state)

        # create reporter file
        reporter = Reporter(sys_state["reporter_path"])

        # save the config json
        config_json = os.path.join(sys_state["project_root"], env_config["config_json_name"])
        writeConfig(config_json, sys_state)

        # save the dependent scripts
        # TODO: copy the scripts to the project dir
        # save the trainer script into [train_logs_root]\[version name]\scripts\
        file1 = os.path.join(env_config["train_scripts_path"],
                             "trainer_%s.py" % sys_state["train_script_name"])
        tgtfile1 = os.path.join(sys_state["project_scripts"],
                                "trainer_%s.py" % sys_state["train_script_name"])
        shutil.copyfile(file1, tgtfile1)

        # save the yaml file
        file1 = os.path.join(env_config["train_config_path"], config.train_yaml)
        tgtfile1 = os.path.join(sys_state["project_scripts"], config.train_yaml)
        shutil.copyfile(file1, tgtfile1)
        # TODO: replace these lines, which save the critical scripts

    #=====================Finetune Phase=====================#
    elif config.phase == "finetune":
        sys_state["log_root_path"] = env_config["train_log_root"]
        sys_state["project_root"] = os.path.join(sys_state["log_root_path"], sys_state["version"])

        config_json = os.path.join(sys_state["project_root"], env_config["config_json_name"])
        train_config = readConfig(config_json)
        for item in train_config.items():
            if item[0] not in ignoreKey:
                sys_state[item[0]] = item[1]

        createDirs(sys_state)
        reporter = Reporter(sys_state["reporter_path"])
        sys_state["com_base"] = "train_logs.%s.scripts." % sys_state["version"]

    fetch_teacher_files(sys_state, env_config)

    # get the dataset path
    sys_state["dataset_paths"] = {}
    for data_key in env_config["dataset_paths"].keys():
        sys_state["dataset_paths"][data_key] = env_config["dataset_paths"][data_key]

    # display the training information
    moduleName = "train_scripts.trainer_" + sys_state["train_script_name"]
    if config.phase == "finetune":
        moduleName = sys_state["com_base"] + "trainer_" + sys_state["train_script_name"]

    # print some important information
    # TODO
    # print("Start to run training script: {}".format(moduleName))
    # print("Training version: %s" % sys_state["version"])
    # print("Dataloader Name: %s" % sys_state["dataloader"])
    # # print("Image Size: %d" % sys_state["imsize"])
    # print("Batch size: %d" % (sys_state["batch_size"]))
    # print("GPUs:", gpus)

    print("\n========================================================================\n")
    print(sys_state)
    for data_key in sys_state.keys():
        print("[%s]---[%s]" % (data_key, sys_state[data_key]))
    print("\n========================================================================\n")

    # Load the training script and start to train
    reporter.writeConfig(sys_state)
    package = __import__(moduleName, fromlist=True)
    trainerClass = getattr(package, 'Trainer')
    trainer = trainerClass(sys_state, reporter)
    trainer.train()
if __name__ == '__main__':
    main()
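
# Sketch of the env/env.json structure that main() reads. Only the keys accessed in
# this script are listed, and the example values are hypothetical (not taken from the
# repository):
#
#   {
#       "path": {
#           "train_config_path":  "./train_configs",
#           "train_scripts_path": "./train_scripts",
#           "train_log_root":     "./train_logs",
#           "config_json_name":   "config.json",
#           "machine_config":     "./env/machine_config.json",
#           "dataset_paths":      {"vggface2": "/path/to/vggface2"}
#       }
#   }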