参考资料
pytorch框架基础
PyCharm 页面卡住的解决方案
使用ps命令结合grep来查找PyCharm相关的进程
# List every running process and keep only the lines that mention "pycharm".
ps aux | grep pycharm
# Force-kill the frozen process; replace [PID] with its process ID from the output above.
kill -9 [PID]
如何定位卡住的进程：据初步观察，卡住的进程在 ps 输出中打印的信息最长；此外，该进程打印信息的结尾处会出现工程名称。
TensorBoard
记录loss
from torch.utils.tensorboard import SummaryWriter

# Create the TensorBoard event writer. Per the SummaryWriter documentation,
# `comment` is appended to the default run-directory name and
# `filename_suffix` is appended to the event file name.
writer = SummaryWriter(comment='test_your_comment', filename_suffix="_test_your_filename_suffix")

# Log the validation and training losses under the same main tag "Loss".
# `np`, `valid_curve`, `loss` and `iter_count` are expected to come from
# the surrounding training loop.
writer.add_scalars("Loss", {"Valid": np.mean(valid_curve)}, iter_count)
writer.add_scalars("Loss", {"Train": loss.item()}, iter_count)
记录feature_map
# Tile the feature maps in `fmap_1` into a single image grid (8 per row),
# normalising each map independently, then log that grid as one image.
fmap_1_grid = vutils.make_grid(
    fmap_1,
    nrow=8,
    normalize=True,
    scale_each=True,
)
writer.add_image(
    'feature map in conv1',
    fmap_1_grid,
    global_step=322,
)
# Close the writer once logging is finished.
writer.close()
记录kernel
# Visualise the convolution kernels of the first Conv2d layers of AlexNet.
# Set `flag` to 0 to skip this section.
flag = 1
if flag:
    writer = SummaryWriter(comment='test_your_comment', filename_suffix="_test_your_filename_suffix")

    alexnet = models.alexnet(pretrained=True)

    kernel_num = -1  # index of the Conv2d layer currently being visited
    vis_max = 1      # highest Conv2d index to visualise (layers 0..vis_max)

    for sub_module in alexnet.modules():
        if isinstance(sub_module, nn.Conv2d):
            kernel_num += 1
            if kernel_num > vis_max:
                break

            kernels = sub_module.weight
            # Conv2d weights are laid out as (out_channels, in_channels, kH, kW).
            c_out, c_int, k_h, k_w = tuple(kernels.shape)

            # One grid per output channel, showing each input-channel slice.
            for o_idx in range(c_out):
                # make_grid expects BCHW, so add a channel dimension here.
                kernel_idx = kernels[o_idx, :, :, :].unsqueeze(1)
                kernel_grid = vutils.make_grid(kernel_idx, normalize=True, scale_each=True, nrow=c_int)
                writer.add_image('{}_Convlayer_split_in_channel'.format(kernel_num), kernel_grid, global_step=o_idx)

            # Regroup all weights into 3-channel images: (N, 3, h, w).
            kernel_all = kernels.view(-1, 3, k_h, k_w)
            # make_grid returns a single (c, h, w) image.
            kernel_grid = vutils.make_grid(kernel_all, normalize=True, scale_each=True, nrow=8)
            writer.add_image('{}_all'.format(kernel_num), kernel_grid, global_step=322)

            print("{}_convlayer shape:{}".format(kernel_num, tuple(kernels.shape)))

    writer.close()
启动tensorboard
tensorboard --logdir=logs --port=6006 # --logdir: path to the log directory; --port: port to serve on
GPU的使用
多GPU训练时,利用环境变量指定可见GPU
# Choose which GPUs this process may see, via CUDA_VISIBLE_DEVICES.
gpu_list = [0, 1]
gpu_list_str = ','.join(str(gpu_id) for gpu_id in gpu_list)
# setdefault only writes the variable when it is not already present, so a
# value already exported in the environment takes precedence over gpu_list.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
正则化weight_decay
# SGD optimiser with momentum; weight_decay=1e-2 enables the weight-decay
# regularisation term. `net_weight_decay` and `lr_init` are expected to be
# defined by the surrounding script.
optim_wdecay = torch.optim.SGD(
    net_weight_decay.parameters(),
    lr=lr_init,
    momentum=0.9,
    weight_decay=1e-2,
)