from __future__ import print_function, division

import copy
import os
import platform
import shutil
import time
import zipfile

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, models, transforms

cudnn.benchmark = True

data_phase = ['train', 'val']
database_dir = './data'
output_dir = './runs'        # root directory for saved models and log backups
newest_log = './newest_log'  # directory holding the latest TensorBoard log
log_port = 6667              # TensorBoard log port
writer: SummaryWriter

# State used by the asynchronous request handler to track running tasks
task_list = ['train', 'data_process']
running_task = set()
exit_flag = False


def print_progress_bar(iteration, total, prefix='', suffix='', decimals=1,
                       length=100, fill='█', print_end="\r"):
    # Percentage completed
    percent_complete = f"{(100 * (iteration / float(total))):.{decimals}f}"
    # Length of the filled portion of the bar
    filled_length = int(length * iteration // total)
    # Build and print the bar string
    bar = fill * filled_length + '-' * (length - filled_length)
    print(f'\r{prefix} |{bar}| {percent_complete}% {suffix}', end=print_end)
    # Print a newline once finished
    if iteration == total:
        print()


# Back up the latest TensorBoard log next to the saved model
def move_log(model_save_dir):
    log_name = os.listdir(newest_log)[0]
    log_path = os.path.join(newest_log, log_name)
    save_path = os.path.join(model_save_dir, log_name)
    shutil.copy(log_path, save_path)
    print('log backed up')


def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')
        print('-' * 10)

        # Each epoch has a training and a validation phase
        for phase in data_phase:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0
            num_batches = len(dataloaders[phase])
            print_progress_bar(0, num_batches, prefix='Progress:', suffix='done', length=50)

            # Iterate over data.
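            # Note (added commentary): model.eval() only changes layer
            # behaviour (BatchNorm uses running statistics, Dropout is
            # disabled); it does not turn off autograd. Gradient tracking
            # for the 'val' phase is disabled by torch.set_grad_enabled(False)
            # below, which saves memory and compute during validation.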
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # forward; track gradients only in the training phase
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

                # Update the progress bar
                print_progress_bar(i + 1, num_batches, prefix='Progress:', suffix='done', length=50)

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            writer.add_scalar(phase + " loss", epoch_loss, epoch + 1)
            writer.add_scalar(phase + " accuracy", epoch_acc, epoch + 1)

            print(f'{phase} Loss: {epoch_loss:.6f} Acc: {epoch_acc:.6f}')

            # deep copy the model if it is the best seen on validation so far
            if phase == 'val' and epoch_acc >= best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                print('copy best model')

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # Load the weights that performed best on the validation set
    model.load_state_dict(best_model_wts)
    return model


def train(epoch=30, save_path='resnet.pt', model_path=None, freeze=7,
          learn_rate=0.01, momentum=0.9, decay=0.7):
    """
    epoch: number of training epochs
    save_path: where to save the trained model
    model_path: path of a previously trained model to resume from
                (None or '' = start from the ImageNet-pretrained backbone)
    freeze: number of leading child modules to freeze
    learn_rate, momentum: SGD hyperparameters
    decay: gamma of the StepLR scheduler
    """
    # Load a pretrained backbone unless an already-trained model is given
    if model_path is None or model_path == '':
        # pretrained=True is deprecated in newer torchvision; there, use
        # weights=models.ResNet50_Weights.DEFAULT instead
        model = models.resnet50(pretrained=True)
        # Replace the final layer with a fresh classification head
        num_features = model.fc.in_features
        model.fc = nn.Linear(num_features, class_num)
    else:
        model = torch.load(model_path)
        old_cls_num = model.fc.out_features
        if class_num == old_cls_num:
            print('classification head matches, training as-is')
        else:
            # Replace the final layer
            num_features = model.fc.in_features
            model.fc = nn.Linear(num_features, class_num)
            print(f"replacing classification head: {old_cls_num} --> {class_num}")

    model = model.to(device)

    # Freeze the first `freeze` child modules before building the optimizer
    for i, c in enumerate(model.children()):
        if i == freeze:
            break
        for param in c.parameters():
            param.requires_grad = False
    num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'trainable parameters: {num_trainable}')

    criterion = nn.CrossEntropyLoss()
    # SGD over the unfrozen parameters only, using the hyperparameters passed in
    optimizer_ft = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                             lr=learn_rate, momentum=momentum)
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=decay)

    model = train_model(model, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=epoch)
    torch.save(model, save_path)


def load_param(epoch, data_dir, model_path, freeze, learn_rate, momentum, decay):
    global device, class_num, dataloaders, dataset_sizes, writer
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    data_transforms = {
        'train': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.RandAugment(),
            transforms.RandomPerspective(distortion_scale=0.5, p=0.5),
            transforms.ColorJitter(brightness=0.2, contrast=0.2),
            transforms.RandomApply([transforms.GaussianBlur(5)], p=0.3),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }

    image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x])
                      for x in data_phase}
    dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=32, shuffle=True,
                                                  num_workers=16, pin_memory=True)
                   for x in data_phase}
    dataset_sizes = {x: len(image_datasets[x]) for x in data_phase}
    class_names = image_datasets['train'].classes
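    # ImageFolder assigns label indices by sorting the class folder names
    # alphabetically, so output index i of the trained model corresponds to
    # class_names[i]; persisting class_names alongside the checkpoint makes
    # it easy to map predictions back to labels at inference time.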
    class_num = len(class_names)
    print('class:', class_num)
    print(f"input parameters: epochs: {epoch}, model path: {model_path}")

    # Each run gets a numbered subdirectory under output_dir
    model_id = len(os.listdir(output_dir)) + 1
    model_save_dir = os.path.join(output_dir, str(model_id))
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)

    # Delete the previous log so newest_log only ever holds one file
    if len(os.listdir(newest_log)) > 0:
        os.remove(os.path.join(newest_log, os.listdir(newest_log)[0]))
    writer = SummaryWriter(newest_log)
    writer.add_text('model', "model id: " + str(model_id))

    save_path = os.path.join(model_save_dir, 'resnet50_out' + str(class_num) + '.pt')
    train(epoch=epoch, save_path=save_path, model_path=model_path, freeze=freeze,
          learn_rate=learn_rate, momentum=momentum, decay=decay)

    writer.flush()
    writer.close()
    move_log(model_save_dir)
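

# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original script). The
# module-level task_list / running_task / exit_flag state suggests this file
# is normally driven by an async task server that is not shown here. This
# entry point assumes `./data` contains `train/` and `val/` subfolders in
# ImageFolder layout (one subdirectory per class); the __main__ guard also
# keeps the num_workers=16 DataLoaders safe on platforms that spawn workers.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(newest_log, exist_ok=True)
    load_param(epoch=30, data_dir=database_dir, model_path=None,
               freeze=7, learn_rate=0.01, momentum=0.9, decay=0.7)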