python: Reading RGB image for deep learning

Question

In a deep learning tutorial website, I found this code to read images as below.

cv2.resize(cv2.imread(folder + name, 1), (100, 200)).reshape(3, 100, 200)

This code changes the shape of the image from (100,200,3) to (3,100,200). I tried to see how these functions change the shape of the array, and I got a strange output.
Suppose we have a 2x4 RGB image (d) as below.

d = array([[[ 1,  2,  3],[ 4,  5,  6],[ 7,  8,  9], [10, 11, 12]],[[13, 14, 15],[16, 17, 18], [19, 20, 21],[22, 23, 24]]])
d.shape: (2, 4, 3)

After applying reshape, it shows

d.reshape(3,2,4) 
array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8]],
       [[ 9,  10,  11,  12],
        [ 13,  14,  15,  16]],
       [[ 17,  18,  19,  20],
        [ 21,  22,  23,  24]]])

But I do not think this is the proper representation, since we want to represent the image as below.

Therefore, I think we should convert the image as below.

d.reshape(3,2,4) 
array([[[ 1,  4,  7,  10],  
        [ 13,  16,  19,  22]],#R layer
       [[ 2,  5,   8,  11], 
        [ 14,  17,  20,  23]],#G layer
       [[  3,  6,  9,  12], 
        [ 15,  18,  21,  24]]])#B layer

Is my understanding wrong? Please help me out if you have knowledge.

I put entire code below.

    from torch import nn
    from torch.nn import functional as F
    from torch.autograd import Variable
    from sklearn.model_selection import train_test_split
    import numpy as np
    from collections import Counter
    import os
    import cv2
    import torch.optim as optim
    import torch.utils.data


    def read_labels(file):
        """Read a comma-separated ``key,value`` text file into a dictionary.

        Each non-blank line is split on commas; the first field becomes the
        key and the second field (with trailing whitespace such as the
        newline removed) becomes the value.  A header line, if present, is
        treated like any other line, and a repeated key keeps the value of
        its last occurrence.

        Args:
            file: Path of the text file to read.

        Returns:
            dict mapping the first field of every line to its second field.

        Raises:
            IndexError: If a non-blank line contains no comma.
        """
        dic = {}
        with open(file) as f:
            for row in f:
                if not row.strip():
                    # Blank lines (e.g. a trailing newline at the end of the
                    # file) carry no record; skip them instead of crashing.
                    continue
                fields = row.split(",")  # split once, reuse both fields
                dic[fields[0]] = fields[1].rstrip()  # rstrip(): drop "\n"
        return dic

    # Load every file in ../train as a channel-first array and look up its
    # label by the file name without extension.
    image_names = os.listdir("../train")
    label_dic = read_labels("../labels.csv")

    labels = []
    images = []

    for name in image_names:
        img = cv2.imread("../train/" + name, 1)
        if img is None:
            # cv2.imread reports an unreadable file by returning None; fail
            # here with the file name instead of inside cv2.resize.
            raise IOError("could not read image: ../train/" + name)
        # cv2.resize takes dsize as (width, height), so the resized array has
        # shape (200, 100, 3): rows, columns, channels.
        img = cv2.resize(img, (100, 200))
        # Move the channel axis to the front by permuting the axes.
        # reshape() would only re-chunk the flat buffer, mixing channel
        # values of neighbouring pixels instead of separating the channels.
        images.append(img.transpose(2, 0, 1))
        labels.append(label_dic[os.path.splitext(name)[0]])

    # Stack into one array of shape (num_images, 3, 200, 100).
    images = np.asarray(images)


    """
    Assign numbers for each labels
    """

    tmp_labels = labels
    uniq_labels = set(tmp_labels)  # eliminate duplication
    num_breeds = len(uniq_labels)  # number of breeds
    # Number the labels in sorted order. Iteration order of a set of strings
    # is not guaranteed to be the same from one run to the next, which would
    # make the class indices (and any saved model) irreproducible.
    uniqu_labels_index = dict(
        (label, i) for i, label in enumerate(sorted(uniq_labels)))

    # Translate every sample's label into its integer class index.
    labels_num = np.array([uniqu_labels_index[label] for label in labels])

    """
    Data distribution
    """
    # Roughly 70% of the samples are used for training and 20% for testing;
    # whatever is left after both cuts becomes the validation set.
    N = len(images)
    N_train = int(0.7 * N)
    N_test = int(0.2 * N)

    # Stage 1: peel off the training portion.
    X_train, X_tmp, Y_train, Y_tmp = train_test_split(
        images, labels_num, train_size=N_train)
    # Stage 2: divide the leftover samples into validation and test parts.
    X_validation, X_test, Y_validation, Y_test = train_test_split(
        X_tmp, Y_tmp, test_size=N_test)

    """
    Model Definition
    """


    # CNN Model (2 conv layers followed by 2 fully connected layers)
    class CNN(nn.Module):
        """Small convolutional classifier for 3-channel images.

        Two convolutional stages (5x5 convolution, batch norm, ReLU and 2x2
        max-pooling; the first stage also applies Dropout2d) are followed by
        two fully connected layers.  ``forward`` returns log-probabilities
        with one column per class.

        Args:
            num_classes: Number of output classes.  When omitted, the
                module-level ``num_breeds`` is used, as before.
        """

        def __init__(self, num_classes=None):
            super(CNN, self).__init__()
            if num_classes is None:
                num_classes = num_breeds
            self.layer1 = nn.Sequential(
                nn.Conv2d(3, 34, kernel_size=5, padding=2),
                nn.Dropout2d(),
                nn.BatchNorm2d(34),
                nn.ReLU(),
                nn.MaxPool2d(2))
            self.layer2 = nn.Sequential(
                nn.Conv2d(34, 68, kernel_size=5, padding=2),
                nn.BatchNorm2d(68),
                nn.ReLU(),
                nn.MaxPool2d(2))
            self.fc1 = nn.Linear(1700, 300)
            self.fc2 = nn.Linear(300, num_classes)

        def forward(self, x):
            """Return per-class log-probabilities for a batch of images.

            Args:
                x: Float tensor of shape (batch, 3, height, width).

            Returns:
                Tensor of shape (batch, num_classes) holding log-probabilities.
            """
            out = self.layer1(x)
            out = self.layer2(out)
            # fc1 takes 1700 = 68 * 5 * 5 features, i.e. a 5x5 map per
            # channel (what a 20x20 input produces).  Pool whatever spatial
            # size arrives down to 5x5 so that other input sizes fit as
            # well; on a map that is already 5x5 this is a no-op.
            out = F.adaptive_avg_pool2d(out, (5, 5))
            out = out.view(out.size(0), -1)
            # A non-linearity between the two linear layers keeps them from
            # collapsing into a single linear map.
            out = F.relu(self.fc1(out))
            # fc2 produces one score per class; without it the network
            # emitted fc1's 300 columns regardless of the number of classes.
            out = self.fc2(out)
            # Normalise over the class dimension explicitly.
            return F.log_softmax(out, dim=1)

        def accuracy(self, outputs, labels):
            """Return the fraction of rows whose arg-max matches the label.

            Args:
                outputs: Tensor of shape (batch, num_classes) with scores.
                labels: Tensor of shape (batch,) with integer class indices.

            Returns:
                The share of correct predictions as a NumPy float.
            """
            inference = np.argmax(outputs.data.numpy(), axis=1)
            answers = labels.data.numpy()
            return np.mean(np.equal(inference, answers))

    # Keep the instance under its own name so the CNN class is not shadowed.
    model = CNN()

    """
    Training
    """
    batch_size = 100
    learning_rate = 0.01
    # Data Loader (Input Pipeline)
    train = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(Y_train))
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)

    val = torch.utils.data.TensorDataset(torch.from_numpy(X_validation), torch.from_numpy(Y_validation))
    val_loader = torch.utils.data.DataLoader(val, batch_size=len(X_validation), shuffle=True)

    test = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(Y_test))
    test_loader = torch.utils.data.DataLoader(test, batch_size=len(X_test), shuffle=True)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    model.train()  # Dropout2d / BatchNorm in training behaviour
    for epoch in range(250):  # loop over the dataset multiple times
        running_loss = 0.0
        num_batches = 0
        # Batch variables get their own names so the module-level `images`
        # and `labels` (the whole dataset) are not overwritten.
        for batch_images, batch_labels in train_loader:
            batch_images = Variable(batch_images).float()
            batch_labels = Variable(batch_labels).long()
            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(batch_images)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

            # Newer PyTorch exposes the scalar through item(); older releases
            # only support indexing the one-element tensor.
            running_loss += loss.item() if hasattr(loss, "item") else loss.data[0]
            num_batches += 1

        # Accuracy of the last training batch of this epoch.
        accuracy = model.accuracy(outputs, batch_labels)
        print("")
        print("epoch : %s" % epoch)
        # Average over the batches actually seen instead of a fixed 2000.
        print("loss: %s" % (float(running_loss) / max(num_batches, 1)))
        print("accuracy : %s" % accuracy)

    print('Finished Training')

    # Evaluate with Dropout2d disabled and BatchNorm using its running
    # statistics; no optimizer calls are needed when only predicting.
    model.eval()
    num_correct = 0
    num_seen = 0
    for batch_images, batch_labels in test_loader:
        batch_images = Variable(batch_images).float()
        batch_labels = Variable(batch_labels).long()
        outputs = model(batch_images)
        inference = np.argmax(outputs.data.numpy(), axis=1)
        answers = batch_labels.data.numpy()
        correction = np.equal(inference, answers)
        # Accumulate over every test batch, not only the last one.
        num_correct += np.sum(correction)
        num_seen += len(correction)
    print(num_correct / float(num_seen))

Answer 1

The reshape method comes from NumPy.

The shape attribute indicates how many elements your array has along each axis (layer). So, in your example:

d = array([
            [[ 1,  2,  3],[ 4,  5,  6],[ 7,  8,  9],[10, 11, 12]], #1st layer 1st element (4 lists inside with 3 numbers each)
            [[13, 14, 15],[16, 17, 18], [19, 20, 21],[22, 23, 24]] #1st layer 2nd element (4 lists inside with 3 numbers each)
         ])

The first layer has two lists, the second layer has four lists, and the third layer has three numbers.

When you call reshape(3,2,4) you get three lists on the first layer, two lists on the second layer and four numbers on the third layer, keeping the same elements you provided, in the same order.

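It does not change the order of the elements; it only changes the shape. In your example, if you try to view the modified image with imshow, you will see that the reshape command has scrambled the image. To move the channel axis to the front without scrambling the pixels, use d.transpose(2, 0, 1) instead of reshape; it produces exactly the R/G/B layers you expected.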

Try:

# Read the file with flag 1 and show it unchanged in a window titled 'image'.
image = cv2.imread(folder + name, 1)
cv2.imshow('image',image)
cv2.waitKey(0)  # wait for a key press before continuing
cv2.destroyAllWindows()

Then:

# Despite the variable name, the image is only resized here (dsize (100, 200));
# no reshape is applied, so this shows the effect of cv2.resize alone.
reshapedimage =cv2.resize(cv2.imread(folder + name, 1), (100, 200))
cv2.imshow('image',reshapedimage)
cv2.waitKey(0)  # wait for a key press before continuing
cv2.destroyAllWindows()

You will then be able to see what each command is doing to your image.

python: Reading RGB image for deep learning

Question

1 answers

solution1
0 2017-12-01 18:09:39

python: Reading RGB image for deep learning

Question

1 answers

solution1 0 2017-12-01 18:09:39

solution1
0 2017-12-01 18:09:39