Main problems encountered before the winter break
The main problem encountered before the winter break was that, when processing the tensor in the last .pt file, the matrix sizes did not match and the matrices could not be multiplied.
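A minimal reproduction of this kind of mismatch (the 1536-dimensional probe matches the training script below; the 768-dimensional embedding is only an assumed example, not the actual file contents):

import torch
from torch import nn

probe = nn.Linear(1536, 2)        # the linear probe expects 1536-dimensional inputs

# An embedding whose last dimension does not match (768 here is only an assumed example)
bad_embed = torch.randn(1, 768)
try:
    probe(bad_embed)
except RuntimeError as err:
    # PyTorch reports that mat1 and mat2 shapes cannot be multiplied
    print(err)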
.pt file processing code
import torch
import os

def process_and_save_embeddings(input_dir, output_dir, target_dim=1536):
    """Process the embeddings stored in .pt files, convert them to the target_dim format,
    and save them to a new directory."""
    os.makedirs(output_dir, exist_ok=True)
    for file_name in os.listdir(input_dir):
        if file_name.endswith('.pt'):
            input_path = os.path.join(input_dir, file_name)
            embeddings = torch.load(input_path)  # load the embeddings

            # Merge multi-layer embeddings as needed
            if isinstance(embeddings, dict):
                # Use the last-layer embedding if present, otherwise merge all layers
                if 'last_layer_embed' in embeddings:
                    processed_embedding = embeddings['last_layer_embed']
                else:
                    # Concatenate all layers into one large tensor
                    processed_embedding = torch.cat(
                        [embeddings[key] for key in embeddings.keys()], dim=-1)
            elif isinstance(embeddings, torch.Tensor):
                processed_embedding = embeddings
            else:
                raise ValueError(f"Unsupported embedding format in {file_name}")

            # Adjust the embedding dimension to target_dim
            processed_embedding = adjust_embedding_dim(processed_embedding, target_dim)

            # Save the processed embedding
            output_path = os.path.join(output_dir, file_name)
            torch.save(processed_embedding, output_path)
            print(f"Processed and saved: {output_path}")

def adjust_embedding_dim(embedding, target_dim):
    """Adjust the embedding to target_dim.
    If the embedding is narrower than target_dim, extend it by repetition;
    if it is wider, truncate it."""
    current_dim = embedding.shape[-1]
    if current_dim == target_dim:
        return embedding
    elif current_dim < target_dim:
        # Extend by repeating, then append the remainder
        repeat_times = target_dim // current_dim
        remainder = target_dim % current_dim
        extended_embedding = embedding.repeat(1, repeat_times)
        if remainder > 0:
            extended_embedding = torch.cat(
                [extended_embedding, embedding[:, :remainder]], dim=-1)
        return extended_embedding
    else:
        # Truncate
        return embedding[:, :target_dim]

# Example usage
input_dir = "mayoo/1"        # directory with the original embeddings
output_dir = "mayoolast/1"   # directory for the processed embeddings
target_dim = 1536            # input dimension expected by the model
process_and_save_embeddings(input_dir, output_dir, target_dim)
Using the method above, I fused the last two layers stored in each .pt file into a single 1536-length vector, which preserves as much of the high-dimensional feature information as possible, and then used these vectors for model training. At this point a new problem appeared: although the embedding had already been compressed into a single row, it still carried an extra empty row, as described below.
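A small sketch of the fusion step and the shape it leaves behind (the per-layer width of 768 and the dictionary keys are illustrative assumptions, not the actual file contents):

import torch

# Two per-layer embeddings of assumed width 768 each (keys and width are illustrative)
layers = {'layer_11': torch.randn(1, 768), 'layer_12': torch.randn(1, 768)}

# Concatenate the last two layers along the feature axis to get 1536 features
fused = torch.cat([layers['layer_11'], layers['layer_12']], dim=-1)
print(fused.shape)   # torch.Size([1, 1536]) -- the leading size-1 dimension is the extra "empty row"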
In the train function, target has shape [batch_size], while CrossEntropyLoss expects the target tensor to contain class indices (that is, each sample's class label rather than a one-hot encoding). If target is passed in one-hot form, the following error is raised:
RuntimeError: Expected target size [batch_size, num_classes], got [batch_size]
We need to make sure that target holds class indices rather than one-hot vectors. The concrete change is: in the __getitem__ method of the EmbeddingDataset class, ensure that target is a class index (i.e., an integer label, not a one-hot encoding).
The modified code is as follows:
class EmbeddingDataset(Dataset):
    def __getitem__(self, index):
        sample, target = self.samples[index], self.labels[index]
        embed = self.embeds[sample]
        if self.z_score:
            embed = (embed - embed.mean()) / embed.std()
        target = self.label_dict[target]                   # map the label to a class index
        target = torch.tensor(target, dtype=torch.long)    # make sure target is an integer tensor
        embed = torch.tensor(embed, dtype=torch.float32)   # make sure embed is a float tensor
        return embed, target
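As a quick sanity check that this class-index format is what CrossEntropyLoss accepts (batch size and class count below are arbitrary illustration values):

import torch
from torch import nn

criterion = nn.CrossEntropyLoss()
logits = torch.randn(4, 2)               # [batch_size, num_classes]
targets = torch.tensor([0, 1, 1, 0])     # class indices with dtype torch.long, one per sample
loss = criterion(logits, targets)        # accepted: returns a scalar loss
print(loss.item())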
Next, this [1, 1536] tensor needs to be turned into a plain 1536-length vector, because it carries an extra empty row. The changes are:

- In the load_embeddings_from_zip method of the Processor class: after loading each .pt file, call squeeze() to convert the tensor of shape [1, 1536] into shape [1536], as illustrated in the short snippet after this list.
- In the __getitem__ method of the EmbeddingDataset class: make sure the tensor retrieved from self.embeds has shape [1536].
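A minimal illustration of the squeeze() step described in the list above:

import torch

embed = torch.randn(1, 1536)   # shape as loaded from the .pt file
embed = embed.squeeze(0)       # drop the leading singleton dimension
print(embed.shape)             # torch.Size([1536])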
The full modified code
import os
import io
import argparse
import zipfile
import pandas as pd
import torch
import itertools
import numpy as np
from torch import nn
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import torch.utils.tensorboard as tensorboard
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_recall_fscore_support

def seed_torch(device, seed=11):
    import random
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if device.type == 'cuda':
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

argparser = argparse.ArgumentParser(description='Linear Probe')
argparser.add_argument('--dataset_csv', type=str, default='', help='The csv file indicating input samples and labels')
argparser.add_argument('--input_path', type=str, default='', help='The input embedding files')
argparser.add_argument('--embed_dim', type=int, default=1536, help='The dimension of the embeddings')
argparser.add_argument('--batch_size', type=int, default=512, help='Batch size')
argparser.add_argument('--train_iters', type=int, default=12500, help='Number of epochs')
argparser.add_argument('--lr', type=float, default=0.01, help='Learning rate')
argparser.add_argument('--min_lr', type=float, default=0.0, help='Minimum learning rate')
argparser.add_argument('--optim', type=str, default='sgd', help='Optimizer')
argparser.add_argument('--momentum', type=float, default=0.0, help='Momentum')
argparser.add_argument('--weight_decay', type=float, default=0.0, help='Weight decay')
argparser.add_argument('--eval_interval', type=int, default=10000, help='Evaluation interval')
argparser.add_argument('--model_select', type=str, default='best', help='Model selection')
argparser.add_argument('--num_workers', type=int, default=2, help='Number of workers')
argparser.add_argument('--seed', type=int, default=420, help='Random seed')
argparser.add_argument('--z_score', action='store_true', default=False, help='Whether use z-score normalization')
argparser.add_argument('--output_dir', type=str, default='outputs', help='Output directory')

def to_onehot(labels: np.ndarray, num_classes: int) -> np.ndarray:
    '''Convert the labels to one-hot encoding'''
    onehot = np.zeros((labels.shape[0], num_classes))
    onehot[np.arange(labels.shape[0]), labels] = 1
    return onehot

def train(model, train_loader, val_loader, test_loader, train_iters, lr, min_lr, optim,
          weight_decay, output_dir, eval_interval, momentum, **kwargs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    tensorboard_dir = os.path.join(output_dir, 'tensorboard')
    os.makedirs(tensorboard_dir, exist_ok=True)
    writer = tensorboard.SummaryWriter(tensorboard_dir)
    if optim == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
    elif optim == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        raise ValueError('Invalid optimizer')
    print('Set the optimizer as {}'.format(optim))
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=train_iters, eta_min=min_lr)
    criterion = nn.CrossEntropyLoss()
    infinite_train_loader = itertools.cycle(train_loader)
    best_f1 = 0
    print('Start training')
    for i, (embed, target) in enumerate(infinite_train_loader):
        if i >= train_iters:
            break
        # print(embed)
        embed, target = embed.to(device), target.to(device)
        output = model(embed)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        if (i + 1) % 10 == 0:
            lr = optimizer.param_groups[0]['lr']
            print(f'Iteration [{i}/{train_iters}]\tLoss: {loss.item()}\tLR: {lr}')
            writer.add_scalar('Train Loss', loss.item(), i)
            writer.add_scalar('Learning Rate', lr, i)
        if (i + 1) % eval_interval == 0 or (i + 1) == train_iters:
            print('Start evaluating ...')
            accuracy, f1, precision, recall, auroc, auprc = evaluate(model, criterion, val_loader, device)
            print(f'Val [{i}/{train_iters}] Accuracy: {accuracy} f1: {f1} Precision: {precision} Recall: {recall} AUROC: {auroc} AUPRC: {auprc}')
            writer.add_scalar('Val Accuracy', accuracy, i)
            writer.add_scalar('Val f1', f1, i)
            writer.add_scalar('Val AUROC', auroc, i)
            writer.add_scalar('Val AUPRC', auprc, i)
            writer.add_scalar('Val Precision', precision, i)
            writer.add_scalar('Val Recall', recall, i)
            writer.add_scalar('Best f1', best_f1, i)
            if f1 > best_f1:
                print('Best f1 increase from {} to {}'.format(best_f1, f1))
                best_f1 = f1
                torch.save(model.state_dict(), f'{output_dir}/best_model.pth')
            torch.save(model.state_dict(), f'{output_dir}/model.pth')
    if kwargs.get('model_select') == 'best':
        val_f1 = best_f1
        model.load_state_dict(torch.load(f'{output_dir}/best_model.pth'))
    else:
        val_f1 = f1
        model.load_state_dict(torch.load(f'{output_dir}/model.pth'))
    accuracy, f1, precision, recall, auroc, auprc = evaluate(model, criterion, test_loader, device)
    print(f'Test Accuracy: {accuracy} f1: {f1} Precision: {precision} Recall: {recall} AUROC: {auroc} AUPRC: {auprc}')
    writer.add_scalar('Test Accuracy', accuracy, i)
    writer.add_scalar('Test f1', f1, i)
    writer.add_scalar('Test AUROC', auroc, i)
    writer.add_scalar('Test AUPRC', auprc, i)
    writer.add_scalar('Test Precision', precision, i)
    writer.add_scalar('Test Recall', recall, i)
    f = open(f'{output_dir}/results.txt', 'w')
    f.write(f'Val f1: {val_f1}\n')
    f.write(f'Test f1: {f1} Test AUROC: {auroc} Test AUPRC: {auprc}\n')
    f.close()

def evaluate(model, criterion, val_loader, device):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        pred_gather, target_gather = [], []
        for _, (embed, target) in enumerate(val_loader):
            embed, target = embed.to(device), target.to(device)
            output = model(embed)
            loss = criterion(output, target)
            total_loss += loss.item()
            pred_gather.append(output.cpu().numpy())
            target_gather.append(target.cpu().numpy())
    pred_gather = np.concatenate(pred_gather)
    target_gather = np.concatenate(target_gather)
    accuracy = (pred_gather.argmax(1) == target_gather).mean()
    f1 = f1_score(target_gather, pred_gather.argmax(1), average='weighted')
    precision, recall, _, _ = precision_recall_fscore_support(target_gather, pred_gather.argmax(1), average='macro')
    auroc = roc_auc_score(to_onehot(target_gather, pred_gather.shape[1]), pred_gather, average='macro')
    auprc = average_precision_score(to_onehot(target_gather, pred_gather.shape[1]), pred_gather, average='macro')
    return accuracy, f1, precision, recall, auroc, auprc

def main():
    args = argparser.parse_args()
    print(args)
    seed_torch(torch.device('cuda'), args.seed)
    processor = Processor()
    splits = ['train', 'val', 'test']
    train_dataset, val_dataset, test_dataset = [
        EmbeddingDataset(args.dataset_csv, args.input_path,
                         split=split, z_score=args.z_score, processor=processor)
        for split in splits]
    args.num_classes = len(train_dataset.label_dict)
    print(f'Train: {len(train_dataset)}\tVal: {len(val_dataset)}\tTest: {len(test_dataset)}')
    train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset, replacement=True)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers,
                              sampler=train_sampler, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False,
                            num_workers=args.num_workers, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False,
                             num_workers=args.num_workers, pin_memory=True)
    model = LinearProbe(args.embed_dim, args.num_classes)
    train(model, train_loader, val_loader, test_loader, **vars(args))

class LinearProbe(nn.Module):
    def __init__(self, embed_dim: int = 1536, num_classes: int = 2):
        super(LinearProbe, self).__init__()
        self.fc = nn.Linear(embed_dim, num_classes)  # use the constructor arguments instead of hard-coded sizes

    def forward(self, x):
        return self.fc(x)

class EmbeddingDataset(Dataset):
    def __init__(self, dataset_csv: str, zip_path: str, split: str = 'train', z_score=False, processor=None):
        df = pd.read_csv(dataset_csv)
        split_df = df[df['split'] == split]
        self.samples = split_df['input'].tolist()
        self.labels = split_df['label'].tolist()
        self.processor = processor
        self.embeds = processor.load_embeddings_from_zip(zip_path, split)
        label_set = list(set(self.labels))
        label_set.sort()
        self.label_dict = {label: i for i, label in enumerate(label_set)}
        self.z_score = z_score

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        sample, target = self.samples[index], self.labels[index]
        embed = self.embeds[sample]
        if embed.dim() == 2 and embed.shape[0] == 1:
            embed = embed.squeeze(0)
        if self.z_score:
            embed = (embed - embed.mean()) / embed.std()
        target = self.label_dict[target]
        # target = torch.tensor(target)
        target = torch.tensor(target, dtype=torch.long)    # make sure target is an integer tensor
        embed = torch.tensor(embed, dtype=torch.float32)   # make sure embed is a float tensor
        return embed, target

class Processor:
    def get_sample_name(self, path):
        return os.path.basename(path).replace('.pt', '')

    def load_embeddings_from_zip(self, zip_path, split):
        loaded_tensors = {}
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            print(len(zip_ref.infolist()))
            for file_info in tqdm(zip_ref.infolist()):
                if file_info.filename.endswith('.pt') and split in file_info.filename:
                    file_bytes = zip_ref.read(file_info.filename)
                    byte_stream = io.BytesIO(file_bytes)
                    tensor = torch.load(byte_stream)
                    if tensor.dim() == 2 and tensor.shape[0] == 1:
                        tensor = tensor.squeeze(0)  # remove dimension 0
                    sample_name = self.get_sample_name(file_info.filename)
                    loaded_tensors[sample_name] = tensor
        return loaded_tensors

if __name__ == '__main__':
    main()
Training with this setup yields extremely low accuracy, so the hyperparameters still need to be tuned.
#!/bin/bash
#PBS -N linear_probe
#PBS -o linear_probe_$PBS_JOBID.log
#PBS -e linear_probe_$PBS_JOBID.err
#PBS -l nodes=1:ppn=12
#PBS -q gpu

cd $PBS_O_WORKDIR

module add gcc/11.2.0
source /home/data/software/python/3.12.7/gigapath2/bin/activate

# Default value for INPUTPATH if not provided as an argument
INPUTPATH=${1:-/public/liujx/prov-gigapath2/mayoolast.zip}
DATASETCSV=/public/liujx/mayo.csv
OUTPUT=outputs3

echo "Running linear probe with input path: $INPUTPATH"
python linear_probe/main.py --input_path $INPUTPATH \
    --dataset_csv $DATASETCSV \
    --output $OUTPUT \
    --batch_size 256 \
    --embed_dim 1536 \
    --num_workers 2 \
    --lr 0.001 \
    --min_lr 1e-5 \
    --train_iters 10000 \
    --eval_interval 500 \
    --optim adam \
    --weight_decay 0.001
The final results obtained: