-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_dataset.py
89 lines (67 loc) · 3.18 KB
/
build_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""Split the SIGNS dataset into train/val/test and resize images to 64x64.
The SIGNS dataset comes in the following format:
train_signs/
0_IMG_5864.jpg
...
test_signs/
0_IMG_5942.jpg
...
Original images have size (3024, 3024).
Resizing to (64, 64) reduces the dataset size from 1.16 GB to 4.7 MB, and loading smaller images
makes training faster.
We already have a test set created, so we only need to split "train_signs" into train and val sets.
Because we don't have many images and we want the statistics on the val set to be as
representative as possible, we'll take 20% of "train_signs" as the val set.
"""
import argparse
import random
import os
from PIL import Image
from tqdm import tqdm
# Side length, in pixels, of the square images written by this script
SIZE = 64

# Command-line options: where the raw dataset lives and where the resized copy goes
parser = argparse.ArgumentParser()
parser.add_argument(
    '--data_dir',
    default='data/SIGNS',
    help="Directory with the SIGNS dataset",
)
parser.add_argument(
    '--output_dir',
    default='data/64x64_SIGNS',
    help="Where to write the new data",
)
def resize_and_save(filename, output_dir, size=SIZE):
    """Resize the image contained in `filename` and save it to the `output_dir`.

    The resized copy is written under the same base name as the input file.

    Args:
        filename: path of the image file to read.
        output_dir: directory in which the resized copy is written; it is not
            created here, so it must already exist.
        size: side length, in pixels, of the square output image.
    """
    # Open through a context manager so the underlying file handle is released
    # even if resizing or saving raises.
    with Image.open(filename) as image:
        # Request bilinear interpolation explicitly rather than relying on the library default
        resized = image.resize((size, size), Image.BILINEAR)
        # os.path.basename understands the platform's path separators, whereas
        # splitting on '/' only works for POSIX-style paths.
        resized.save(os.path.join(output_dir, os.path.basename(filename)))
if __name__ == '__main__':
    # Script entry point: list the raw images, split "train_signs" into train/val,
    # then write resized copies of train, val and test under the output directory.
    args = parser.parse_args()

    # Validate explicitly: an `assert` would be stripped when Python runs with -O
    if not os.path.isdir(args.data_dir):
        parser.error("Couldn't find the dataset at {}".format(args.data_dir))

    # Define the data directories
    train_data_dir = os.path.join(args.data_dir, 'train_signs')
    test_data_dir = os.path.join(args.data_dir, 'test_signs')

    # Get the .jpg filenames in each directory (train and test).
    # The train/val candidates are sorted so that the shuffle below starts from
    # the same order regardless of the order in which os.listdir returns entries.
    train_val_filenames = sorted(
        os.path.join(train_data_dir, f)
        for f in os.listdir(train_data_dir)
        if f.endswith('.jpg')
    )
    test_filenames = [
        os.path.join(test_data_dir, f)
        for f in os.listdir(test_data_dir)
        if f.endswith('.jpg')
    ]

    # Split the images in 'train_signs' into 80% train and 20% val
    # Make sure to always shuffle with a fixed seed so that the split is reproducible
    random.seed(230)
    random.shuffle(train_val_filenames)

    # Index of the first val image; kept under its own name so it is not
    # confused with the `split` loop variable used further down.
    num_train = int(0.8 * len(train_val_filenames))

    filenames = {
        'train': train_val_filenames[:num_train],
        'val': train_val_filenames[num_train:],
        'test': test_filenames,
    }

    if os.path.exists(args.output_dir):
        print("Warning: output dir {} already exists".format(args.output_dir))
    else:
        # makedirs also creates missing parent directories of a nested --output_dir
        os.makedirs(args.output_dir)

    # Preprocess train, val and test
    for split in ['train', 'val', 'test']:
        output_dir_split = os.path.join(args.output_dir, '{}_signs'.format(split))
        if os.path.exists(output_dir_split):
            print("Warning: dir {} already exists".format(output_dir_split))
        else:
            os.makedirs(output_dir_split)

        print("Processing {} data, saving preprocessed data to {}".format(split, output_dir_split))
        for filename in tqdm(filenames[split]):
            resize_and_save(filename, output_dir_split, size=SIZE)

    print("Done building dataset")