caffe reshape / upsample fully connected layer

懵懂的女人 提交于 2019-12-11 00:16:12

问题


Assuming we have a layer like this:

layer {
  name: "fully-connected"
  type: "InnerProduct"
  bottom: "bottom"
  top: "top"
  inner_product_param {
    num_output: 1
  }
}

The output is batch_size x 1. In several papers (for example link1, page 3, picture on the top, or link2, page 4 on top) I have seen that they used such a layer in the end to come up with a 2D image for pixel-wise prediction. How is it possible to transform this into a 2D image? I was thinking of reshape or deconvolution, but I cannot figure out how that would work. A simple example would be helpful.

UPDATE: My input images are 304x228 and my ground_truth (depth images) are 75x55.

################# Main net ##################

layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "norm1"
  type: "LRN"
  bottom: "conv1"
  top: "norm1"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "norm1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
}
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "conv3"
  top: "conv4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4"
  top: "conv4"
}
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "conv4"
  top: "conv5"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5"
  top: "conv5"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "fc6"
  type: "InnerProduct"
  bottom: "pool5"
  top: "fc6"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relufc6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}

layer {
  name: "fc7"
  type: "InnerProduct"
  bottom: "fc6"
  top: "fc7"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4070
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}

layer {
  type: "Reshape"
  name: "reshape"
  bottom: "fc7"
  top: "fc7_reshaped"
  reshape_param {
    shape { dim:  1  dim: 1  dim:  55 dim: 74 }
  }
}

layer {
  name: "deconv1"
  type: "Deconvolution"
  bottom: "fc7_reshaped"
  top: "deconv1"
  convolution_param {
    num_output: 64
    kernel_size: 5
    pad: 2
    stride: 1
      #group: 256
    weight_filler {
        type: "bilinear"
    }
    bias_term: false
  }
}

#########################

layer {
  name: "conv6"
  type: "Convolution"
  bottom: "data"
  top: "conv6"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 63
    kernel_size: 9
    stride: 2
    pad: 1
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "conv6"
  top: "conv6"
}

layer {
  name: "pool6"
  type: "Pooling"
  bottom: "conv6"
  top: "pool6"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}

########################
layer {
  name: "concat"
  type: "Concat"
  bottom: "deconv1"
  bottom: "pool6"
  top: "concat"
  concat_param {
    concat_dim: 1
  }
}

layer {
  name: "conv7"
  type: "Convolution"
  bottom: "concat"
  top: "conv7"
  convolution_param {
    num_output: 64
    kernel_size: 5
    pad: 2
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.011
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}

layer {
    name: "relu7"
    type: "ReLU"
    bottom: "conv7"
    top: "conv7"
    relu_param{
    negative_slope: 0.01
        engine: CUDNN
    }
}

layer {
  name: "conv8"
  type: "Convolution"
  bottom: "conv7"
  top: "conv8"
  convolution_param {
    num_output: 64
    kernel_size: 5
    pad: 2
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.011
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}

layer {
    name: "relu8"
    type: "ReLU"
    bottom: "conv8"
    top: "conv8"
    relu_param{
    negative_slope: 0.01
        engine: CUDNN
    }
}

layer {
  name: "conv9"
  type: "Convolution"
  bottom: "conv8"
  top: "conv9"
  convolution_param {
    num_output: 1
    kernel_size: 5
    pad: 2
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.011
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}

layer {
    name: "relu9"
    type: "ReLU"
    bottom: "conv9"
    top: "result"
    relu_param{
    negative_slope: 0.01
        engine: CUDNN
    }
}

log:

I1108 19:34:57.239722  4277 data_layer.cpp:41] output data size: 1,1,228,304
I1108 19:34:57.243340  4277 data_layer.cpp:41] output data size: 1,1,55,74
I1108 19:34:57.247392  4277 net.cpp:150] Setting up conv1
I1108 19:34:57.247407  4277 net.cpp:157] Top shape: 1 96 55 74 (390720)
I1108 19:34:57.248191  4277 net.cpp:150] Setting up pool1
I1108 19:34:57.248196  4277 net.cpp:157] Top shape: 1 96 27 37 (95904)
I1108 19:34:57.253263  4277 net.cpp:150] Setting up conv2
I1108 19:34:57.253276  4277 net.cpp:157] Top shape: 1 256 27 37 (255744)
I1108 19:34:57.254202  4277 net.cpp:150] Setting up pool2
I1108 19:34:57.254220  4277 net.cpp:157] Top shape: 1 256 13 18 (59904)
I1108 19:34:57.269943  4277 net.cpp:150] Setting up conv3
I1108 19:34:57.269961  4277 net.cpp:157] Top shape: 1 384 13 18 (89856)
I1108 19:34:57.285303  4277 net.cpp:150] Setting up conv4
I1108 19:34:57.285338  4277 net.cpp:157] Top shape: 1 384 13 18 (89856)
I1108 19:34:57.294801  4277 net.cpp:150] Setting up conv5
I1108 19:34:57.294841  4277 net.cpp:157] Top shape: 1 256 13 18 (59904)
I1108 19:34:57.295207  4277 net.cpp:150] Setting up pool5
I1108 19:34:57.295210  4277 net.cpp:157] Top shape: 1 256 6 9 (13824)
I1108 19:34:57.743222  4277 net.cpp:150] Setting up fc6
I1108 19:34:57.743259  4277 net.cpp:157] Top shape: 1 4096 (4096)
I1108 19:34:57.881680  4277 net.cpp:150] Setting up fc7
I1108 19:34:57.881718  4277 net.cpp:157] Top shape: 1 4070 (4070)

I1108 19:34:57.881826  4277 net.cpp:150] Setting up reshape
I1108 19:34:57.881846  4277 net.cpp:157] Top shape: 1 1 55 74 (4070)

I1108 19:34:57.884768  4277 net.cpp:150] Setting up conv6
I1108 19:34:57.885309  4277 net.cpp:150] Setting up pool6
I1108 19:34:57.885327  4277 net.cpp:157] Top shape: 1 63 55 74 (256410)

I1108 19:34:57.885395  4277 net.cpp:150] Setting up concat
I1108 19:34:57.885412  4277 net.cpp:157] Top shape: 1 64 55 74 (260480)

I1108 19:34:57.886759  4277 net.cpp:150] Setting up conv7
I1108 19:34:57.886786  4277 net.cpp:157] Top shape: 1 64 55 74 (260480)

I1108 19:34:57.897269  4277 net.cpp:150] Setting up conv8
I1108 19:34:57.897303  4277 net.cpp:157] Top shape: 1 64 55 74 (260480)
I1108 19:34:57.899129  4277 net.cpp:150] Setting up conv9
I1108 19:34:57.899138  4277 net.cpp:157] Top shape: 1 1 55 74 (4070)

回答1:


The value of num_output of the last fully connected layer will not be 1 for pixel wise prediction. It will be equal to w*h of the input image.

What made you feel that the value will be 1?

Edit 1:

Below are the dimensions of each layer mentioned in link1 page 3 figure:

LAYER        OUTPUT DIM [c*h*w]
coarse1     96*h1*w1     conv layer
coarse2     256*h2*w2    conv layer
coarse3     384*h3*w3    conv layer
coarse4     384*h4*w4    conv layer
coarse5     256*h5*w5    conv layer
coarse6     4096*1*1     fc layer
coarse7     X*1*1        fc layer    where 'X' could be interpreted as w*h

To understand this further, let's assume we have a network to predict the pixels of the image. The images are of size 10*10. Thus, the final output of the fc layer will also have the dimension 100*1*1 (like in coarse7). This could be interpreted as 10*10.

Now the question will be: how can a 1D array correctly predict a 2D image? For this, you have to note that the loss is calculated for this output using labels that correspond to the pixel data. Thus, during training, the weights will learn to predict the pixel data.

EDIT 2:

Trying to draw the net using draw_net.py in caffe, gives you this:

The ReLU layers connected to conv6 and fc6 have the same name, leading to a complicated connectivity in the drawn image. I am not sure whether this will cause issues during training, but I would suggest you rename one of the ReLU layers to a unique name to avoid unforeseen issues.

Coming back to your question, there doesn't seem to be an upsampling happening after fully connected layers. As seen in the log:

I1108 19:34:57.881680  4277 net.cpp:150] Setting up fc7
I1108 19:34:57.881718  4277 net.cpp:157] Top shape: 1 4070 (4070)

I1108 19:34:57.881826  4277 net.cpp:150] Setting up reshape
I1108 19:34:57.881846  4277 net.cpp:157] Top shape: 1 1 55 74 (4070)

I1108 19:34:57.884768  4277 net.cpp:150] Setting up conv6
I1108 19:34:57.885309  4277 net.cpp:150] Setting up pool6
I1108 19:34:57.885327  4277 net.cpp:157] Top shape: 1 63 55 74 (256410)

fc7 has an output dimension of 4070*1*1. This is being reshaped to 1*55*74 to be passed as an input to the subsequent convolution layers.

The output of the whole network is produced in conv9, which has an output dimension of 1*55*74, exactly the same as the dimension of the labels (depth data).

Please do pinpoint where you feel the upsampling is happening, if my answer is still not clear.




回答2:


If you simply need fully-connected networks like the conventional multi-layer perceptron, use 2D blobs (shape (N, D)) and call the InnerProductLayer.



来源:https://stackoverflow.com/questions/40483458/caffe-reshape-upsample-fully-connected-layer

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!