【Python】数据集随机划分为训练集、测试集、验证集(图像和标签)
如下图红框处为源数据集的图片和标签样本文件夹路径,替换为你的即可
红框下面的两个路径也替换为你自己的路径即可,程序会自动创建这两个文件夹,并在这两个文件夹里分别自动生成 train、val、test 子文件夹,用于存放随机划分后的训练集、验证集、测试集的图像和标签

如下完整程序:
import random
import shutil
import os
import logging
# Configure logging: INFO level and above, formatted as "time - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def list_files_in_directory(directory):
    """Return the names of the regular files directly inside ``directory``.

    Sub-directories are skipped and the listing is not recursive.

    Args:
        directory: path of the directory to list.

    Returns:
        A new list of file names (not full paths), sorted alphabetically.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. the directory is missing).
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # deterministic so that a seeded shuffle of it is reproducible.
    return sorted(
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )
def create_directory(directory):
    """Create ``directory`` and any missing parents; do nothing if it already exists.

    Args:
        directory: path of the directory to create.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    # exist_ok=True replaces a separate os.path.exists() check, which could race
    # with another process creating the directory between the check and makedirs.
    os.makedirs(directory, exist_ok=True)
def copy_files(file_list, source_label_path, source_img_path, target_label_path, target_img_path, sub_dir,
               img_ext='.jpg'):
    """Copy each label file and its matching image into the ``sub_dir`` split folders.

    The image paired with a label has the same stem as the label file and the
    extension ``img_ext``.

    Args:
        file_list: label file names (relative to ``source_label_path``) to copy.
        source_label_path: directory holding the source label files.
        source_img_path: directory holding the source images.
        target_label_path: root directory for copied labels.
        target_img_path: root directory for copied images.
        sub_dir: sub-folder name created under both target roots.
        img_ext: image extension including the leading dot (default ``'.jpg'``).

    Copy failures are logged and the affected sample is skipped; they are not
    re-raised.
    """
    target_label_dir = os.path.join(target_label_path, sub_dir)
    target_img_dir = os.path.join(target_img_path, sub_dir)
    create_directory(target_label_dir)
    create_directory(target_img_dir)
    for label_name in file_list:
        # splitext removes only the last extension, so stems that contain dots
        # (e.g. "img.v2.txt") still map to the right image name.
        stem = os.path.splitext(label_name)[0]
        img_file_name = f'{stem}{img_ext}'
        try:
            # Copy the image first: if it is missing, the label is not copied
            # either, so no label without an image ends up in the target.
            shutil.copyfile(os.path.join(source_img_path, img_file_name), os.path.join(target_img_dir, img_file_name))
            shutil.copyfile(os.path.join(source_label_path, label_name), os.path.join(target_label_dir, label_name))
        except FileNotFoundError as e:
            logging.error(f'文件未找到: {e}')
        except PermissionError as e:
            logging.error(f'权限不足: {e}')
        except OSError as e:
            # shutil.copyfile reports its remaining failures as OSError subclasses.
            logging.error(f'复制文件时出现错误: {e}')
def split_dataset(labels_file_path, img_file_path, target_labels_file_path, target_img_path, train_ratio, val_ratio,
                  test_ratio, seed=None):
    """Randomly split a labelled dataset into train/val/test and copy the files.

    The label files found in ``labels_file_path`` are shuffled and cut into three
    consecutive slices; each slice is copied, together with the matching images,
    into the ``train``, ``val`` and ``test`` sub-folders of the target roots.

    Args:
        labels_file_path: directory holding the source label files.
        img_file_path: directory holding the source images.
        target_labels_file_path: root directory for the copied labels.
        target_img_path: root directory for the copied images.
        train_ratio: fraction of samples for the training set.
        val_ratio: fraction of samples for the validation set.
        test_ratio: fraction of samples for the test set.
        seed: optional seed for a reproducible split; when ``None`` the
            module-level ``random`` state is used.

    Raises:
        ValueError: if a ratio is negative or the ratios do not sum to 1.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError('训练集、验证集和测试集比例不能为负数')
    # Compare with a tolerance: an exact float test rejects valid inputs such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    if abs(sum(ratios) - 1) > 1e-9:
        raise ValueError('训练集、验证集和测试集比例之和必须为1')

    label_file_list = list_files_in_directory(labels_file_path)
    # Sort before shuffling so the outcome of a seeded shuffle does not depend
    # on the order in which the directory listing was returned.
    label_file_list.sort()
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(label_file_list)

    total = len(label_file_list)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)
    # Whatever is left after the truncated train/val sizes goes to the test set.
    splits = (
        ('train', label_file_list[:train_end]),
        ('val', label_file_list[train_end:val_end]),
        ('test', label_file_list[val_end:]),
    )
    for sub_dir, names in splits:
        copy_files(names, labels_file_path, img_file_path, target_labels_file_path, target_img_path, sub_dir)
    logging.info('数据集划分和文件复制完成')
if __name__ == "__main__":
    # Source folders holding the label files and the images; replace with your own.
    labels_file_path = r'D:\lenovo\Archie\datasets\labels'
    img_file_path = r'D:\lenovo\Archie\datasets\images'
    # Target roots; train/val/test sub-folders are created under each of them.
    target_labels_file_path = r'D:\lenovo\Archie\data\labels'
    target_img_path = r'D:\lenovo\Archie\data\imgs'
    # Split ratios; they must sum to 1.
    train_ratio = 0.8
    val_ratio = 0.1
    test_ratio = 0.1
    split_dataset(labels_file_path, img_file_path, target_labels_file_path, target_img_path, train_ratio, val_ratio,
                  test_ratio)
这是一个用 Python 编写的程序,主要功能是将一个数据集按照指定的比例划分为训练集、验证集和测试集,并将相应的文件复制到目标目录中。以下是对该程序的详细解读:
导入必要的库:
import random
import shutil
import os
import logging
random:用于生成随机数,在打乱文件列表时会用到。shutil:提供了高级的文件操作功能,如复制文件和目录。os:提供了与操作系统交互的功能,如文件和目录操作。logging:用于记录程序运行过程中的日志信息。配置日志:
# Configure logging: INFO level and above, formatted as "time - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.basicConfig:配置日志的基本设置。level=logging.INFO:设置日志级别为 INFO,这意味着程序会记录 INFO 及以上级别的日志信息(如 INFO、WARNING、ERROR 等)。format='%(asctime)s - %(levelname)s - %(message)s':设置日志的格式,包括时间、日志级别和日志信息。定义函数list_files_in_directory:
def list_files_in_directory(directory):
    """Return the names of the regular files directly inside ``directory``.

    Sub-directories are skipped and the listing is not recursive.

    Args:
        directory: path of the directory to list.

    Returns:
        A new list of file names (not full paths), sorted alphabetically.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. the directory is missing).
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # deterministic so that a seeded shuffle of it is reproducible.
    return sorted(
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )
该函数使用os.listdir列出指定目录下的所有条目,并用os.path.isfile判断每个条目是否为文件,将所有文件的文件名添加到列表中并返回。
定义函数create_directory:
def create_directory(directory):
    """Create ``directory`` and any missing parents; do nothing if it already exists.

    Args:
        directory: path of the directory to create.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    # exist_ok=True replaces a separate os.path.exists() check, which could race
    # with another process creating the directory between the check and makedirs.
    os.makedirs(directory, exist_ok=True)
os.path.exists检查目录是否存在,如果不存在,则使用os.makedirs创建目录。定义函数copy_files:
def copy_files(file_list, source_label_path, source_img_path, target_label_path, target_img_path, sub_dir,
               img_ext='.jpg'):
    """Copy each label file and its matching image into the ``sub_dir`` split folders.

    The image paired with a label has the same stem as the label file and the
    extension ``img_ext``.

    Args:
        file_list: label file names (relative to ``source_label_path``) to copy.
        source_label_path: directory holding the source label files.
        source_img_path: directory holding the source images.
        target_label_path: root directory for copied labels.
        target_img_path: root directory for copied images.
        sub_dir: sub-folder name created under both target roots.
        img_ext: image extension including the leading dot (default ``'.jpg'``).

    Copy failures are logged and the affected sample is skipped; they are not
    re-raised.
    """
    target_label_dir = os.path.join(target_label_path, sub_dir)
    target_img_dir = os.path.join(target_img_path, sub_dir)
    create_directory(target_label_dir)
    create_directory(target_img_dir)
    for label_name in file_list:
        # splitext removes only the last extension, so stems that contain dots
        # (e.g. "img.v2.txt") still map to the right image name.
        stem = os.path.splitext(label_name)[0]
        img_file_name = f'{stem}{img_ext}'
        try:
            # Copy the image first: if it is missing, the label is not copied
            # either, so no label without an image ends up in the target.
            shutil.copyfile(os.path.join(source_img_path, img_file_name), os.path.join(target_img_dir, img_file_name))
            shutil.copyfile(os.path.join(source_label_path, label_name), os.path.join(target_label_dir, label_name))
        except FileNotFoundError as e:
            logging.error(f'文件未找到: {e}')
        except PermissionError as e:
            logging.error(f'权限不足: {e}')
        except OSError as e:
            # shutil.copyfile reports its remaining failures as OSError subclasses.
            logging.error(f'复制文件时出现错误: {e}')
该函数先使用os.path.join构建目标标签目录和目标图像目录的路径,再调用create_directory函数创建目标目录(如果不存在)。随后遍历文件列表,根据标签文件名得到对应的图像文件名(同名的.jpg文件),并使用shutil.copyfile将标签文件和图像文件从源路径复制到目标路径。try - except块捕获可能的异常,如文件未找到、权限不足等,并记录相应的错误日志。
定义函数split_dataset:
def split_dataset(labels_file_path, img_file_path, target_labels_file_path, target_img_path, train_ratio, val_ratio,
                  test_ratio, seed=None):
    """Randomly split a labelled dataset into train/val/test and copy the files.

    The label files found in ``labels_file_path`` are shuffled and cut into three
    consecutive slices; each slice is copied, together with the matching images,
    into the ``train``, ``val`` and ``test`` sub-folders of the target roots.

    Args:
        labels_file_path: directory holding the source label files.
        img_file_path: directory holding the source images.
        target_labels_file_path: root directory for the copied labels.
        target_img_path: root directory for the copied images.
        train_ratio: fraction of samples for the training set.
        val_ratio: fraction of samples for the validation set.
        test_ratio: fraction of samples for the test set.
        seed: optional seed for a reproducible split; when ``None`` the
            module-level ``random`` state is used.

    Raises:
        ValueError: if a ratio is negative or the ratios do not sum to 1.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError('训练集、验证集和测试集比例不能为负数')
    # Compare with a tolerance: an exact float test rejects valid inputs such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    if abs(sum(ratios) - 1) > 1e-9:
        raise ValueError('训练集、验证集和测试集比例之和必须为1')

    label_file_list = list_files_in_directory(labels_file_path)
    # Sort before shuffling so the outcome of a seeded shuffle does not depend
    # on the order in which the directory listing was returned.
    label_file_list.sort()
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(label_file_list)

    total = len(label_file_list)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)
    # Whatever is left after the truncated train/val sizes goes to the test set.
    splits = (
        ('train', label_file_list[:train_end]),
        ('val', label_file_list[train_end:val_end]),
        ('test', label_file_list[val_end:]),
    )
    for sub_dir, names in splits:
        copy_files(names, labels_file_path, img_file_path, target_labels_file_path, target_img_path, sub_dir)
    logging.info('数据集划分和文件复制完成')
该函数首先检查三个比例之和是否为1,若不为1则抛出ValueError异常。然后调用list_files_in_directory获取源标签文件路径下的所有标签文件列表,并使用random.shuffle打乱文件列表的顺序。接着按比例计算训练集和验证集的文件数量,其余文件归入测试集,并据此将列表切分为三个子列表。最后调用copy_files函数将三个子列表中的文件分别复制到对应的目标目录中(train、val、test)。
主程序部分:
if __name__ == "__main__":
    # Source folders holding the label files and the images; replace with your own.
    labels_file_path = r'D:\lenovo\Archie\datasets\labels'
    img_file_path = r'D:\lenovo\Archie\datasets\images'
    # Target roots; train/val/test sub-folders are created under each of them.
    target_labels_file_path = r'D:\lenovo\Archie\data\labels'
    target_img_path = r'D:\lenovo\Archie\data\imgs'
    # Split ratios; they must sum to 1.
    train_ratio = 0.8
    val_ratio = 0.1
    test_ratio = 0.1
    split_dataset(labels_file_path, img_file_path, target_labels_file_path, target_img_path, train_ratio, val_ratio,
                  test_ratio)
主程序中设置好源路径、目标路径和划分比例后,调用split_dataset函数进行数据集划分和文件复制。
这个程序的主要目的是将一个数据集按照指定的比例划分为训练集、验证集和测试集,并将相应的文件复制到目标目录中,同时记录操作过程中的日志信息。
作者:阿齐Archie