
What are the best practices for increasing the input size of a GAN neural network?

I need to expand the input size of my GAN from 32 x 32 images to 128 x 128. How could I achieve this? I didn't write all of this code myself, and it slightly confuses me.

# -*- coding: utf-8 -*-
"""gans.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/18pkC8YJZRV0K0gqgixHupy55Kwhs1QK3
"""

# Commented out IPython magic to ensure Python compatibility.
import matplotlib.pyplot as plt
import numpy as np
import torch

import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn import Module, Sequential, Conv2d, ConvTranspose2d, LeakyReLU, BatchNorm2d, ReLU, Tanh, Sigmoid, BCELoss 

# %matplotlib inline

# plot images in a nxn grid
k = 0

def plot_images(imgs, grid_size = 5):
    """
    imgs: array containing all the numpy images
    grid_size: side length of the square grid, e.g. 5 gives a 5x5 grid
    """
    
    fig = plt.figure(figsize = (8, 8))
    columns = rows = grid_size
    plt.title("Training Images")

    for i in range(1, columns*rows + 1):
        fig.add_subplot(rows, columns, i)  # create the subplot first ...
        plt.axis("off")                    # ... so that axis("off") applies to it
        plt.imshow(imgs[i - 1])            # subplot indices are 1-based, image indices are 0-based
    plt.savefig("Figure" + str(i) + ".jpg")

# load the numpy vector containing image representations
imgs = np.load('32x32imgcompressed.npz')

# to check all the files contained in it
imgs.files

# this is where all our images are saved
imgs['arr_0'].shape

# please ignore the poor quality of the images, since we are working with 32x32 images.
plot_images(imgs['arr_0'], 3)

# Always good to check whether GPU support is available or not

dev = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(dev)

# To check the device name
#print ('Current cuda device name ', torch.cuda.get_device_name())

# Preparing custom dataset class - https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#dataset-class


class ArtDataset(Dataset):
    

    def __init__(self, npz_imgs):
        self.imgs = npz_imgs

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image = self.imgs[idx]

        return image

imgs['arr_0'][0].dtype # it will output float 64 i.e. double

# must convert it to float 32 (which is same as model weights)
np.float32(imgs['arr_0']).dtype



# Preparing dataloader for training

transpose_imgs = np.transpose( 
    np.float32(imgs['arr_0']), # important step: convert double -> float (numpy arrays default to float64)
    (0, 3, 1, 2) # tuple describing how to rearrange the dimensions (NHWC -> NCHW)
    ) 

dset = ArtDataset(transpose_imgs) # passing the npz variable to the constructor class
batch_size = 32
shuffle = True

dataloader = DataLoader(dataset = dset, batch_size = batch_size, shuffle = shuffle)

# Defining the Generator class

class Generator(Module):
    def __init__(self):

        # calling constructor of parent class
        super().__init__()

        self.gen = Sequential(
          ConvTranspose2d(in_channels = 100, out_channels = 512 , kernel_size = 4, stride = 1, padding = 0, bias = False),
          # the output from the above will be b_size ,512, 4,4
          BatchNorm2d(num_features = 512), # From an input of size (b_size, C, H, W), pick num_features = C
          ReLU(inplace = True),

          ConvTranspose2d(in_channels = 512, out_channels = 256 , kernel_size = 4, stride = 2, padding = 1, bias = False),
          # the output from the above will be b_size ,256, 8,8
          BatchNorm2d(num_features = 256),
          ReLU(inplace = True),

          ConvTranspose2d(in_channels = 256, out_channels = 128 , kernel_size = 4, stride = 2, padding = 1, bias = False),
          
          BatchNorm2d(num_features = 128),
          ReLU(inplace = True),

          ConvTranspose2d(in_channels = 128, out_channels = 3 , kernel_size = 4, stride = 2, padding = 1, bias = False),
         
          Tanh()
        
        )

    def forward(self, input):
        return self.gen(input)

netG = Generator().to(device)

print(netG)

 
t = torch.randn(2, 100, 1, 1)
netG(t.to(device)).shape

def init_weights(m):
    if type(m) == ConvTranspose2d:
        nn.init.normal_(m.weight, 0.0, 0.02)
    elif type(m) == BatchNorm2d:
        nn.init.normal_(m.weight, 1.0, 0.02)
        nn.init.constant_(m.bias, 0)

netG.apply(init_weights)

print(netG)

class Discriminator(Module):
    def __init__(self):

        super().__init__()
        self.dis = Sequential(

            # input is (3, 32, 32)
            Conv2d(in_channels = 3, out_channels = 32, kernel_size = 4, stride = 2, padding = 1, bias=False),
            # output from above layer is b_size, 32, 16, 16
            LeakyReLU(0.2, inplace=True),

            Conv2d(in_channels = 32, out_channels = 32*2, kernel_size = 4, stride = 2, padding = 1, bias=False),
            # output from above layer is b_size, 32*2, 8, 8
            BatchNorm2d(32 * 2),
            LeakyReLU(0.2, inplace=True),

            Conv2d(in_channels = 32*2, out_channels = 32*4, kernel_size = 4, stride = 2, padding = 1, bias=False),
            # output from above layer is b_size, 32*4, 4, 4
            BatchNorm2d(32 * 4),
            LeakyReLU(0.2, inplace=True),

            Conv2d(in_channels = 32*4, out_channels = 32*8, kernel_size = 4, stride = 2, padding = 1, bias=False),
            # output from above layer is b_size, 32*8, 2, 2
            # NOTE: spatial size of this layer is 2x2, hence in the final layer, the kernel size must be 2 instead (or smaller than) 4
            BatchNorm2d(32 * 8),
            LeakyReLU(0.2, inplace=True),

            Conv2d(in_channels = 32*8, out_channels = 1, kernel_size = 2, stride = 2, padding = 0, bias=False),
            # output from above layer is b_size, 1, 1, 1
            Sigmoid()
        )
    
    def forward(self, input):
        return self.dis(input)

# As an example, to check the shape of the discriminator output - it should be (b_size, 1, 1, 1)
t = torch.randn(2, 3, 32, 32)

netD = Discriminator().to(device)
netD(t.to(device)).shape

# initializing the weights
netD.apply(init_weights)

print(netD)

# Setting up optimizers for both Generator and Discriminator

opt_D = optim.Adam(netD.parameters(), lr = 0.0002, betas= (0.5, 0.999))
opt_G = optim.Adam(netG.parameters(), lr = 0.0002, betas= (0.5, 0.999))

# Setting up the loss function - BCELoss (to check how far the predicted value is from real value)

loss = BCELoss()

# TRAINING GANS
epochs = 1000

# going over the entire dataset `epochs` times
for e in range(epochs):
    
    # pick each batch b of input images: shape of each batch is (32, 3, 32, 32)
    for i, b in enumerate(dataloader):

        ##########################
        ## Update Discriminator ##
        ##########################

        # Loss on real images
        
        # clear the gradient
        opt_D.zero_grad() # set the gradients to 0 at start of each loop because gradients are accumulated on subsequent backward passes
        # compute the D model output
        yhat = netD(b.to(device)).view(-1) # view(-1) reshapes the (b_size, 1, 1, 1) output to a 1-d tensor with b_size values
        # specify target labels or true labels
        target = torch.ones(len(b), dtype=torch.float, device=device)
        # calculate loss
        loss_real = loss(yhat, target)
        # calculate gradients -  or rather accumulation of gradients on loss tensor
        loss_real.backward()

        # Loss on fake images

        # generate batch of fake images using G
        # Step1: creating noise to be fed as input to G
        noise = torch.randn(len(b), 100, 1, 1, device = device)
        # Step 2: feed noise to G to create a fake img (this will be reused when updating G)
        fake_img = netG(noise) 

        # compute D model output on fake images
        yhat = netD(fake_img.detach()).view(-1) # detach() so no gradients flow back into G here
        # specify target labels
        target = torch.zeros(len(b), dtype=torch.float, device=device)
        # calculate loss
        loss_fake = loss(yhat, target)
        # calculate gradients
        loss_fake.backward()

        # total error on D
        loss_disc = loss_real + loss_fake

        # Update weights of D
        opt_D.step()

        ##########################
        #### Update Generator ####
        ##########################

        # clear gradient
        opt_G.zero_grad()
        # pass fake image through D
        yhat = netD(fake_img).view(-1)
        # specify target variables - remember G wants D *to think* these are real images so label is 1
        target = torch.ones(len(b), dtype=torch.float, device=device)
        # calculate loss
        loss_gen = loss(yhat, target)
        # calculate gradients
        loss_gen.backward()
        # update weights on G
        opt_G.step()

        k +=1
        ####################################
        #### Plot some Generator images ####
        ####################################
        print(" Epoch %d and iteration %d " % (e, i))
        # during every epoch, plot images at every 100th iteration.
        if i % 100 == 0:
            # convert the fake images from (b_size, 3, 32, 32) to (b_size, 32, 32, 3) for plotting 
            img_plot = np.transpose(fake_img.detach().cpu(), (0,2,3,1)) # .detach().cpu() is imp for copying fake_img tensor to host memory first
            plot_images(img_plot)
            print("********************")
            


I have tried adjusting the different networks' in_channels, but I only end up with errors. I have tried asking for help elsewhere, but I can't seem to find anyone with expertise in this area. I was told I may have to remake all the networks to adjust to the new image size.

Your generator uses ConvTranspose2d layers and your discriminator uses Conv2d layers. The key point is that the stride argument of these layers increases (for ConvTranspose2d) or decreases (for Conv2d) the spatial size of the image by a factor of stride. The output size is also affected by the padding and kernel_size parameters: for Conv2d the output size is floor((in + 2*padding - kernel_size)/stride) + 1, and for ConvTranspose2d it is (in - 1)*stride - 2*padding + kernel_size. With kernel_size=4 and stride=2, as in your code, you need padding=1 in both cases to exactly halve (Conv2d) or double (ConvTranspose2d) the spatial size.

For example, a 32x32 image through a Conv2d with stride=2 will result in a 16x16 image. As the discriminator uses 5 Conv2d layers, each with stride=2, you end up with a 1x1 image, in other words a single vector (its dimension is the number of channels of the last layer). Likewise, a 32x32 image through a ConvTranspose2d with stride=2 becomes 64x64.
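If you want to check these shape rules quickly, you can push dummy tensors through a couple of throwaway layers (reusing the imports from your code; the channel counts below are arbitrary and only for illustration):

t = torch.randn(1, 3, 32, 32)
down = Conv2d(in_channels=3, out_channels=8, kernel_size=4, stride=2, padding=1)
print(down(t).shape)  # torch.Size([1, 8, 16, 16]) -> spatial size halved

t = torch.randn(1, 8, 32, 32)
up = ConvTranspose2d(in_channels=8, out_channels=3, kernel_size=4, stride=2, padding=1)
print(up(t).shape)    # torch.Size([1, 3, 64, 64]) -> spatial size doubled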

To deal with 128x128 images, one option is to change the stride of some layers, but I don't recommend that.

The other option is to add new layers with stride=2 to both the generator and the discriminator: two to the generator (going from a 32x32 to a 128x128 output takes two extra doublings) and one to the discriminator. Don't forget to add a BatchNorm2d and a ReLU (or LeakyReLU in the discriminator) after each. I prefer this because 128x128 images are more complex than 32x32 ones, so you will need more parameters, and just modifying the strides doesn't add any.

class Generator(Module):
    def __init__(self):
        # calling constructor of parent class
        super().__init__()

        self.gen = Sequential(
            ConvTranspose2d(in_channels=100, out_channels=512, kernel_size=4, stride=1, padding=0, bias=False),
            # the output from the above will be b_size ,512, 4,4
            BatchNorm2d(num_features=512),  # From an input of size (b_size, C, H, W), pick num_features = C
            ReLU(inplace=True),

            ############################ new layer here ###########################
            ConvTranspose2d(in_channels=512, out_channels=512, kernel_size=4, stride=2, padding=1, bias=False),
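            # the output from the above will be b_size, 512, 8, 8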
            BatchNorm2d(num_features=512),
            ReLU(inplace=True),
            #######################################################################################

            ############################ new layer here ###########################
            ConvTranspose2d(in_channels=512, out_channels=512, kernel_size=4, stride=2, padding=1, bias=False),
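            # the output from the above will be b_size, 512, 16, 16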
            BatchNorm2d(num_features=512),
            ReLU(inplace=True),
            #######################################################################################

            ConvTranspose2d(in_channels=512, out_channels=256, kernel_size=4, stride=2, padding=1, bias=False),
            # the output from the above will be b_size, 256, 32, 32
            BatchNorm2d(num_features=256),
            ReLU(inplace=True),

            ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=4, stride=2, padding=1, bias=False),

            BatchNorm2d(num_features=128),
            ReLU(inplace=True),

            ConvTranspose2d(in_channels=128, out_channels=3, kernel_size=4, stride=2, padding=1, bias=False),

            Tanh()

        )

    def forward(self, input):
        return self.gen(input)


class Discriminator(Module):
    def __init__(self):
        super().__init__()
        self.dis = Sequential(

            # input is (3, 128, 128)
            Conv2d(in_channels=3, out_channels=32, kernel_size=4, stride=2, padding=1, bias=False),
            # output from above layer is b_size, 32, 64, 64
            LeakyReLU(0.2, inplace=True),

            Conv2d(in_channels=32, out_channels=32 * 2, kernel_size=4, stride=2, padding=1, bias=False),
            # output from above layer is b_size, 32*2, 32, 32
            BatchNorm2d(32 * 2),
            LeakyReLU(0.2, inplace=True),

            Conv2d(in_channels=32 * 2, out_channels=32 * 4, kernel_size=4, stride=2, padding=1, bias=False),
            # output from above layer is b_size, 32*4, 16, 16
            BatchNorm2d(32 * 4),
            LeakyReLU(0.2, inplace=True),

            ############################ new layer here ###########################
            Conv2d(in_channels=32 * 4, out_channels=32 * 4, kernel_size=4, stride=2, padding=0, bias=False),
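            # output from above layer is b_size, 32*4, 7, 7 (note padding=0 here)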
            BatchNorm2d(32 * 4),
            LeakyReLU(0.2, inplace=True),
            #######################################################################################

            Conv2d(in_channels=32 * 4, out_channels=32 * 8, kernel_size=4, stride=2, padding=1, bias=False),
            # output from above layer is b_size, 32*8, 3, 3
            # NOTE: the spatial size here is 3x3, so the final layer's kernel_size of 2 with stride 2 still reduces it to 1x1
            BatchNorm2d(32 * 8),
            LeakyReLU(0.2, inplace=True),

            Conv2d(in_channels=32 * 8, out_channels=1, kernel_size=2, stride=2, padding=0, bias=False),
            # output from above layer is b_size, 1, 1, 1
            Sigmoid()
        )

    def forward(self, input):
        return self.dis(input)

This should work. You may want to change the number of channels to reduce the number of parameters. I didn't change the rest of your code, but a few other small adjustments will be needed too (nothing major), for example the 32x32 sanity-check tensor fed to the discriminator, and of course the dataset itself must now contain 128x128 images.
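As a quick sanity check (reusing the class definitions above and the `device` set up earlier in your notebook), you can verify that the modified networks line up with 128x128 data:

netG = Generator().to(device)
netD = Discriminator().to(device)

noise = torch.randn(2, 100, 1, 1, device=device)
fake = netG(noise)
print(fake.shape)           # torch.Size([2, 3, 128, 128])

real = torch.randn(2, 3, 128, 128, device=device)
print(netD(real).shape)     # torch.Size([2, 1, 1, 1])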
