Question
In the code below, three images are created and saved, and a convolutional autoencoder attempts to encode them into a lower-dimensional representation.
%reset -f
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import cv2
import torch
import torch.nn as nn
import torch.utils.data as data_utils
from torch.autograd import Variable

number_channels = 3

%matplotlib inline
# quadratic curve
plt.figure()
x = np.arange(10)
y = x * x
plt.plot(x, y)
plt.axis('off')
plt.savefig('1-increasing.jpg')

# cubic curve
plt.figure()
y = x * x * x
plt.plot(x, y)
plt.axis('off')
plt.savefig('2-increasing.jpg')

# constant line
plt.figure()
m = 0
b = 2
y = (m * x) + b
plt.plot(x, y)
plt.axis('off')
plt.savefig('constant.jpg')
batch_size_value = 2
train_image = []
# cv2 loads images as (H, W, C); transpose to (C, H, W) for PyTorch
train_image.append(cv2.imread('1-increasing.jpg', cv2.IMREAD_UNCHANGED).transpose(2, 0, 1))
train_image.append(cv2.imread('2-increasing.jpg', cv2.IMREAD_UNCHANGED).transpose(2, 0, 1))
train_image.append(cv2.imread('constant.jpg', cv2.IMREAD_UNCHANGED).transpose(2, 0, 1))
data_loader = data_utils.DataLoader(train_image, batch_size=batch_size_value, shuffle=False, drop_last=True)
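As a quick sanity check (my addition, not in the original post), the loader can be probed to confirm that it yields plain image batches with no labels, which matters for the unpacking in the training loop below:

# each batch is a plain tensor of shape [2, 3, 288, 432] -- there are no labels
batch = next(iter(data_loader))
print(type(batch), batch.shape)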
def to_img(x):
    # maps network output from [-1, 1] back to [0, 1]; note: this helper is
    # unused in this script and is still hard-coded for 1 x 28 x 28 MNIST images
    x = 0.5 * (x + 1)
    x = x.clamp(0, 1)
    x = x.view(x.size(0), 1, 28, 28)
    return x

num_epochs = 100
batch_size = 2
learning_rate = 1e-3
dataloader = data_loader
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True)
        # shape comments below are for a 3 x 288 x 432 input
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=3, padding=1),  # b, 16, 96, 144
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=2),                 # b, 16, 48, 72
            nn.Conv2d(16, 8, 3, stride=2, padding=1),  # b, 8, 24, 36
            nn.ReLU(True),
            nn.MaxPool2d(3, stride=1)                  # b, 8, 22, 34
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(8, 16, 2, stride=1),             # b, 16, 23, 35
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 8, 3, stride=3, padding=1),  # b, 8, 67, 103
            nn.ReLU(True),
            nn.ConvTranspose2d(8, 3, 2, stride=2, padding=1),   # b, 3, 132, 204
            nn.Tanh()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
model = autoencoder().cuda().double()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                             weight_decay=1e-5)

for epoch in range(num_epochs):
    for data in dataloader:
        # the loader yields a plain tensor of shape [2, 3, 288, 432];
        # unpacking it splits the batch into two single images
        img, _ = data
        img = img.double()
        img = Variable(img).cuda()
        img = img.unsqueeze_(0)  # [3, 288, 432] -> [1, 3, 288, 432]
        # ===================forward=====================
        output = model(img)
        loss = criterion(output, img)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log=========================
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch + 1, num_epochs, loss.item()))

torch.save(model.state_dict(), './conv_autoencoder.pth')
But this error is returned:

RuntimeError: input and target shapes do not match: input [1 x 3 x 132 x 204], target [1 x 3 x 288 x 432] at /pytorch/aten/src/THCUNN/generic/MSECriterion.cu:15

The shape of the images is (3, 288, 432). How should the model be configured so that it outputs [1 x 3 x 288 x 432] instead of [1 x 3 x 132 x 204]?
Update: I changed

nn.ConvTranspose2d(8, 3, 2, stride=2, padding=1)

to:

nn.ConvTranspose2d(8, 3, 3, stride=4, padding=2)

which brings the output dimensions closer to the target, but they still do not match exactly, so the error is now:

RuntimeError: input and target shapes do not match: input [1 x 3 x 263 x 407], target [1 x 3 x 288 x 432] at /pytorch/aten/src/THCUNN/generic/MSECriterion.cu:12

How should the decoder's output dimensions be calculated in order to produce the correct shape?
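A quick way to see exactly where the shapes diverge (a debugging sketch of my own, not part of the original question) is to push a dummy tensor through the model layer by layer and print each output shape:

# trace the spatial dimensions layer by layer with a dummy input
model = autoencoder()  # CPU/float32 is fine for a pure shape check
x = torch.randn(1, 3, 288, 432)
for layer in list(model.encoder) + list(model.decoder):
    x = layer(x)
    print(layer.__class__.__name__, tuple(x.shape))
# the last line shows ConvTranspose2d (1, 3, 132, 204) -- the mismatch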
Answer 1:
There are a couple of ways; here is one solution:
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True)
        # shape comments below are for a 3 x 288 x 432 input
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=3, padding=1),  # b, 16, 96, 144
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=2),                 # b, 16, 48, 72
            nn.Conv2d(16, 8, 3, stride=2, padding=1),  # b, 8, 24, 36
            nn.ReLU(True),
            nn.MaxPool2d(3, stride=1)                  # b, 8, 22, 34
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(8, 16, 2, stride=1),             # b, 16, 23, 35
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 8, 3, stride=3, padding=1),  # b, 8, 67, 103
            nn.ReLU(True),
            nn.ConvTranspose2d(8, 3, 2, stride=2, padding=1),   # b, 3, 132, 204
            nn.ReLU(True),
            nn.ConvTranspose2d(3, 3, 2, stride=2, padding=1),   # b, 3, 262, 406
            nn.ReLU(True),
            nn.ConvTranspose2d(3, 3, 25, stride=1),             # b, 3, 286, 430
            nn.ReLU(True),
            nn.ConvTranspose2d(3, 3, 3, stride=1),              # b, 3, 288, 432
            nn.Tanh()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
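A quick end-to-end check (my addition, not in the original answer) confirms that the extended decoder restores the input resolution:

model = autoencoder()
x = torch.randn(1, 3, 288, 432)
print(model.encoder(x).shape)  # torch.Size([1, 8, 22, 34])
print(model(x).shape)          # torch.Size([1, 3, 288, 432])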
Here are the formulas, where N is the input size, F the filter (kernel) size, and division is floor division (e.g. 32 / 3 = 10, dropping everything after the decimal point):

ConvTranspose2d:
    OutputSize = (N - 1) * stride + F - 2 * padding

Conv2d (MaxPool2d follows the same rule):
    OutputSize = (N - F + 2 * padding) / stride + 1
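As an illustration (my sketch, applying the formulas above), the spatial sizes can be traced through the whole network to confirm the final output is 288 x 432:

def conv_out(n, f, stride, pad):
    # Conv2d / MaxPool2d output size
    return (n - f + 2 * pad) // stride + 1

def convt_out(n, f, stride, pad):
    # ConvTranspose2d output size
    return (n - 1) * stride + f - 2 * pad

size = (288, 432)
# encoder: Conv2d(k=3,s=3,p=1), MaxPool2d(k=2,s=2), Conv2d(k=3,s=2,p=1), MaxPool2d(k=3,s=1)
for f, s, p in [(3, 3, 1), (2, 2, 0), (3, 2, 1), (3, 1, 0)]:
    size = tuple(conv_out(n, f, s, p) for n in size)
print('encoder output:', size)  # (22, 34)

# decoder: the six ConvTranspose2d layers from the model above
for f, s, p in [(2, 1, 0), (3, 3, 1), (2, 2, 1), (2, 2, 1), (25, 1, 0), (3, 1, 0)]:
    size = tuple(convt_out(n, f, s, p) for n in size)
print('decoder output:', size)  # (288, 432)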
Source: https://stackoverflow.com/questions/55140554/convolutional-encoder-error-runtimeerror-input-and-target-shapes-do-not-matc