How to split folder of images into test/training/validation sets with stratified sampling?

问题

I have a very large folder of images, as well as a CSV file containing the class labels for each of those images. Because it's all in one giant folder, I'd like to split them up into training/test/validation sets; maybe create three new folders and move images into each based on a Python script of some kind. I'd like to do stratified sampling so I can keep the % of classes the same across all three sets.

What would be the approach to go about making a script that can do this?

回答1:

I ran into a similar problem myself. All my images were stored in two folders, "Project/Data2/DPN+" and "Project/Data2/DPN-". It was a binary classification problem with the two classes "DPN+" and "DPN-", and both class folders contained .png files. My objective was to distribute the dataset into training, validation and testing folders. Each of these new folders has two sub-folders - "DPN+" and "DPN-" - indicating the class. For the partition, I used a 70:15:15 distribution. I am a beginner in Python, so please let me know if I made any mistakes.

Following is my code:

import os
import numpy as np
import shutil

# Create the train / val / test folders, each with one sub-folder per class.
# exist_ok=True makes this step safe to re-run: without it, os.makedirs
# raises FileExistsError as soon as one of the folders already exists.
root_dir = 'Data2'
posCls = '/DPN+'
negCls = '/DPN-'

for split_dir in ('/train', '/val', '/test'):
    for cls_dir in (posCls, negCls):
        os.makedirs(root_dir + split_dir + cls_dir, exist_ok=True)

# Partition the files of one class after shuffling them.
currentCls = posCls
src = "Data2"+currentCls  # folder the images are copied from

allFileNames = os.listdir(src)
np.random.shuffle(allFileNames)

# Cut points at 70% and 85% of the list give a 70:15:15 split.
fileCount = len(allFileNames)
cutPoints = [int(fileCount*0.7), int(fileCount*0.85)]
trainPart, valPart, testPart = np.split(np.array(allFileNames), cutPoints)

# Prefix every file name with the source folder to get a usable path.
train_FileNames = ['/'.join((src, name)) for name in trainPart.tolist()]
val_FileNames = ['/'.join((src, name)) for name in valPart.tolist()]
test_FileNames = ['/'.join((src, name)) for name in testPart.tolist()]

# Report how many files ended up in each partition.
for label, amount in (('Total images: ', fileCount),
                      ('Training: ', len(train_FileNames)),
                      ('Validation: ', len(val_FileNames)),
                      ('Testing: ', len(test_FileNames))):
    print(label, amount)

# Copy each partition into the matching split folder of the current class.
for splitName, splitFiles in (('train', train_FileNames),
                              ('val', val_FileNames),
                              ('test', test_FileNames)):
    destination = "Data2/" + splitName + currentCls
    for filePath in splitFiles:
        shutil.copy(filePath, destination)

回答2:

This takes Abdul Mukit's answer above and expands it a bit to support more than two classes, looping through each class to create its train/val/test folders.

import os
import numpy as np
import shutil
import random

# Dataset location and split configuration.
root_dir = '4_classes'
# Every entry starts with '/' so that it can be appended directly to a folder
# path (root_dir + cls); a bare 'class2' would yield '4_classesclass2'.
classes_dir = ['/class1', '/class2', '/class3', '/class4']

# Fractions of each class that go to validation and test; the remainder is
# used for training.
val_ratio = 0.15
test_ratio = 0.05

for cls in classes_dir:
    # Accept class entries with or without a leading '/': plain concatenation
    # of a bare 'class2' would otherwise yield '4_classesclass2'.
    cls_name = cls.strip('/')
    src = root_dir + '/' + cls_name  # Folder to copy images from
    train_dir = root_dir + '/train/' + cls_name
    val_dir = root_dir + '/val/' + cls_name
    test_dir = root_dir + '/test/' + cls_name

    # exist_ok=True keeps the script re-runnable instead of failing with
    # FileExistsError when the folders are already there.
    for target_dir in (train_dir, val_dir, test_dir):
        os.makedirs(target_dir, exist_ok=True)

    # Creating partitions of the data after shuffling
    allFileNames = os.listdir(src)
    np.random.shuffle(allFileNames)

    total = len(allFileNames)
    # The train share is 1 - (val_ratio + test_ratio). The parentheses matter:
    # `1 - val_ratio + test_ratio` would add test_ratio to the train share
    # instead of removing it, shrinking the validation set.
    train_end = int(total * (1 - (val_ratio + test_ratio)))
    val_end = int(total * (1 - test_ratio))

    train_FileNames = [src + '/' + name for name in allFileNames[:train_end]]
    val_FileNames = [src + '/' + name for name in allFileNames[train_end:val_end]]
    test_FileNames = [src + '/' + name for name in allFileNames[val_end:]]

    print('Total images: ', total)
    print('Training: ', len(train_FileNames))
    print('Validation: ', len(val_FileNames))
    print('Testing: ', len(test_FileNames))

    # Copy-pasting images
    for name in train_FileNames:
        shutil.copy(name, train_dir)

    for name in val_FileNames:
        shutil.copy(name, val_dir)

    for name in test_FileNames:
        shutil.copy(name, test_dir)

回答3:

Use the Python library split-folders.

pip install split-folders

Let all the images be stored in Data folder. Then apply as follows:

import split_folders
# Split the images under 'Data' into train / val / test folders created
# inside "output"; `ratio` is given in the order (train, val, test).
split_folders.ratio(
    'Data',
    output="output",
    seed=1337,
    ratio=(.8, 0.1, 0.1),
)

On running the above code snippet, it will create 3 folders in the output directory:

train
val
test

The number of images in each folder can be varied using the values in the ratio argument (train:val:test).

回答4:

I had a similar task. My images and the corresponding annotations in XML format were stored in one folder. I made train and test folders, but I used the original folder as the validation folder after splitting the files (see the script).

Here is my script to split files into test/training/validation sets:

import os
from random import choice
import shutil

# Lists that collect the image and annotation file names.
imgs, xmls = [], []

# Directory names.
crsPath = 'img'  # source dir where images and annotations are stored
trainPath = 'train'
valPath = 'val'
testPath = 'test'

# Split ratios. The validation share is implicit: it is whatever remains in
# the source dir after the train and test files have been moved out.
train_ratio, test_ratio = 0.8, 0.1


# Sort the files that sit directly in the source dir into the two lists.
# os.listdir (rather than os.walk) is used on purpose: the move step joins
# each bare file name with crsPath, so a file found in a sub-directory could
# never be moved.
for filename in os.listdir(crsPath):
    if not os.path.isfile(os.path.join(crsPath, filename)):
        continue  # skip sub-directories
    if filename.endswith('.xml'):
        xmls.append(filename)
    else:
        imgs.append(filename)

# Total count of images, taken from the collected list so that it is an
# integer and does not rely on the dir holding exactly two entries per image.
totalImgCount = len(imgs)


# Number of images to move into each split; both counts are based on the
# full image list, so they are computed before anything is moved.
countForTrain = int(len(imgs)*train_ratio)
countForTest = int(len(imgs)*test_ratio)


def _moveRandomPairs(count, destPath):
    """Move `count` random images and their annotations into `destPath`.

    Each image is picked at random from `imgs` and moved from `crsPath`
    together with the `.xml` file that shares its base name. Moved names are
    removed from `imgs` / `xmls` so they cannot be picked again.
    """
    # Make sure the destination exists; shutil.move fails otherwise.
    os.makedirs(destPath, exist_ok=True)

    for _ in range(count):
        fileImg = choice(imgs)  # name of a random image still in the origin dir
        # splitext copes with any extension length ('.jpeg', '.png', ...);
        # slicing off the last four characters only works for 3-letter ones.
        fileXml = os.path.splitext(fileImg)[0] + '.xml'

        # move both files into the destination dir
        shutil.move(os.path.join(crsPath, fileImg), os.path.join(destPath, fileImg))
        shutil.move(os.path.join(crsPath, fileXml), os.path.join(destPath, fileXml))

        # remove files from arrays
        imgs.remove(fileImg)
        xmls.remove(fileXml)


# fill the train dir first, then the test dir
_moveRandomPairs(countForTrain, trainPath)
_moveRandomPairs(countForTest, testPath)

# Whatever is left in the origin dir is the validation set, so the origin dir
# itself is renamed to the validation dir.
os.rename(crsPath, valPath)

# Summary after splitting; each image has one annotation file next to it,
# hence the division by two.
print('Total images: ', totalImgCount)
for label, folder in (('Images in train dir:', trainPath),
                      ('Images in test dir:', testPath),
                      ('Images in validation dir:', valPath)):
    print(label, len(os.listdir(folder))/2)

来源：https://stackoverflow.com/questions/53074712/how-to-split-folder-of-images-into-test-training-validation-sets-with-stratified

标签

python

python-3.x