Apples2Oranges/preprocess.py at master · adeemm/Apples2Oranges · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import cv2
import numpy as np


# side length (in pixels) of the square every image is resized to
IMG_SIZE = 50
# directory containing this file, with symlinks resolved
CURRENT_DIR = os.path.split(os.path.realpath(__file__))[0]
# folder holding the training images, one sub-folder per category
TRAINING_DIR = os.path.join(CURRENT_DIR, "training_data")


# map an image path to the one hot encoding of its category
def encode_label(img):
    """Return the one hot label for the image stored at path *img*.

    The category is the name of the directory that directly contains the
    file: ``"apple"`` maps to ``[0, 1]`` and ``"orange"`` to ``[1, 0]``.
    Any other directory name yields ``None``.
    """
    category = os.path.basename(os.path.dirname(img))

    # the table is rebuilt on every call so each caller gets its own list
    encodings = {
        "apple": [0, 1],
        "orange": [1, 0],
    }
    return encodings.get(category)


# scale pixel values by 1/255, in place
def normalize(img):
    """Divide every element of *img* by 255.0 and return *img*.

    *img* is modified in place: each element is replaced by the result of
    the division, and the same object is handed back. In this file it is
    called with a list of flattened image arrays.

    NOTE: if *img* is an integer-dtype ndarray rather than a list, the
    quotient is cast back to that dtype on assignment.
    """
    for position, values in enumerate(img):
        img[position] = values / 255.0

    return img


# store pre-processed images and their respective labels
def generate_training_data():
    """Load every labelled image under TRAINING_DIR as feature/label arrays.

    Each sub-directory of TRAINING_DIR is treated as one category. Every
    image in it is read with OpenCV, resized to IMG_SIZE x IMG_SIZE,
    flattened and scaled by ``normalize``.

    Entries that cannot be used are skipped instead of aborting the run:
    plain files directly inside TRAINING_DIR, files OpenCV cannot decode
    (e.g. ``.DS_Store`` or ``Thumbs.db``), and images whose category
    ``encode_label`` does not recognise.

    Returns:
        A ``(features, labels)`` tuple of numpy arrays with shapes
        ``(n, IMG_SIZE * IMG_SIZE * 3, 1)`` and ``(n, 2, 1)``, where ``n``
        is the number of images that were loaded.
    """
    features = []
    labels = []

    # os.listdir order is arbitrary; sorting keeps sample order reproducible
    for category in sorted(os.listdir(TRAINING_DIR)):
        category_dir = os.path.join(TRAINING_DIR, category)
        if not os.path.isdir(category_dir):
            # stray file next to the category folders
            continue

        for filename in sorted(os.listdir(category_dir)):
            path = os.path.join(category_dir, filename)

            label = encode_label(path)
            if label is None:
                # unknown category: a None label would break the reshape below
                continue

            image = cv2.imread(path)
            if image is None:
                # cv2.imread signals an unreadable / non-image file with None
                continue

            image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
            features.append(np.array(image).flatten())
            labels.append(label)

    normalize(features)
    # 3 colour channels per pixel, as produced by cv2.imread's default mode
    features = np.array(features).reshape(-1, IMG_SIZE * IMG_SIZE * 3, 1)
    labels = np.array(labels).reshape(-1, 2, 1)
    return features, labels


# randomly partition a dataset into training and validation subsets
def split_dataset(x, y, ratio):
    """Shuffle the samples and split them into training and validation sets.

    Args:
        x: feature array, indexed by sample along its first axis.
        y: label array, indexed with the same sample positions as *x*.
        ratio: fraction of the samples placed in the training set; the
            count is ``int(ratio * x.shape[0])``.

    Returns:
        ``(x_training, x_validation, y_training, y_validation)``. The same
        random permutation is applied to *x* and *y*, so features and labels
        stay paired. The size of each subset is printed.
    """
    sample_count = x.shape[0]
    cutoff = int(ratio * sample_count)
    shuffled = np.random.permutation(sample_count)

    training_index = shuffled[:cutoff]
    validation_index = shuffled[cutoff:]

    x_training = x[training_index, :]
    x_validation = x[validation_index, :]
    y_training = y[training_index, :]
    y_validation = y[validation_index, :]

    print("Training dataset size: ", x_training.shape[0])
    print("Validation dataset size: ", x_validation.shape[0])
    return x_training, x_validation, y_training, y_validation