1 数据划分
标注好后的数据集分为两个文件夹:一个文件夹中放置图片,另一个文件夹中放置对应的 txt 标注文件。需要将数据集按照一定的比例划分为训练集、验证集和测试集(train、val、test)。下述代码按照 8:1:1 的比例进行划分;若想根据自己的需要划分数据集,修改下述代码中 split_dataset 函数的 split_ratio 参数即可(例如 split_ratio=(0.7, 0.2, 0.1),三个数依次对应 train、val、test)。
代码如下:
import os
import random
from shutil import copyfile


def split_dataset(image_folder, txt_folder, output_folder,
                  split_ratio=(0.8, 0.1, 0.1), seed=None):
    """Split an image/label dataset into train, val and test subsets.

    Images are shuffled and copied to ``<output_folder>/<subset>/images``;
    the label file sharing an image's base name (``<name>.txt``) is copied
    to ``<output_folder>/<subset>/txt`` when it exists in ``txt_folder``.
    Source files are copied, never moved or modified.

    Args:
        image_folder: Directory containing the source images.
        txt_folder: Directory containing the ``.txt`` label files.
        output_folder: Directory under which the ``train``, ``val`` and
            ``test`` sub-folders are created.
        split_ratio: ``(train, val, test)`` fractions. Only the first two
            values are used to size the subsets; every remaining image goes
            to ``test``.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.
    """
    subsets = ('train', 'val', 'test')

    # Ensure output folders exist (safe to call on an existing layout).
    for dataset in subsets:
        for kind in ('images', 'txt'):
            os.makedirs(os.path.join(output_folder, dataset, kind), exist_ok=True)

    # Match extensions case-insensitively so files such as "IMG_01.JPG" are
    # not silently dropped. Sorting first makes a seeded shuffle independent
    # of the arbitrary order returned by os.listdir.
    image_files = sorted(
        f for f in os.listdir(image_folder)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    if seed is None:
        random.shuffle(image_files)
    else:
        random.Random(seed).shuffle(image_files)

    num_images = len(image_files)
    num_train = int(split_ratio[0] * num_images)
    num_val = int(split_ratio[1] * num_images)

    # The test subset takes the remainder, so rounding never loses an image.
    partitions = (
        image_files[:num_train],
        image_files[num_train:num_train + num_val],
        image_files[num_train + num_val:],
    )

    # Copy each image, and its label file when present, to its subset.
    for dataset, images_list in zip(subsets, partitions):
        for image_file in images_list:
            copyfile(
                os.path.join(image_folder, image_file),
                os.path.join(output_folder, dataset, 'images', image_file),
            )

            txt_file = os.path.splitext(image_file)[0] + '.txt'
            txt_path = os.path.join(txt_folder, txt_file)
            # Images without a label file are still copied.
            if os.path.exists(txt_path):
                copyfile(txt_path, os.path.join(output_folder, dataset, 'txt', txt_file))


if __name__ == "__main__":
    # Raw strings keep the Windows backslashes literal instead of treating
    # sequences such as "\V" or "\J" as (invalid) escape sequences.
    image_folder_path = r"D:\PycharmProjects\VOCdevkit\VOC2007\JPEGImages"
    txt_folder_path = r"D:\PycharmProjects\VOCdevkit\VOC2007\YOLOLabels"
    output_dataset_path = r"D:\PycharmProjects\VOCdevkit"
    split_dataset(image_folder_path, txt_folder_path, output_dataset_path)
上面代码只需要改三个地方:
# Raw strings (r"...") keep the Windows backslashes literal.
image_folder_path = r"D:\PycharmProjects\VOCdevkit\VOC2007\JPEGImages"
txt_folder_path = r"D:\PycharmProjects\VOCdevkit\VOC2007\YOLOLabels"
output_dataset_path = r"D:\PycharmProjects\VOCdevkit"
image_folder_path为你保存的图片的文件夹的路径
txt_folder_path 为你保存的txt文件夹的路径
output_dataset_path 为你保存的数据集的文件夹的路径,代码会在该路径下自动生成 train、val、test 三个子文件夹,分别存储训练集、验证集和测试集。
若出现错误可以参考这个网站