提交 5c21a5f6 编写于 作者: L lxx1991

add dilated conv

上级 4d62805b
......@@ -3,6 +3,7 @@
namespace caffe {
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
......@@ -11,7 +12,7 @@ void im2col_cpu(const Dtype* data_im, const int channels,
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, Dtype* data_im);
......@@ -23,10 +24,38 @@ void im2col_gpu(const Dtype* data_im, const int channels,
template <typename Dtype>
void col2im_gpu(const Dtype* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, Dtype* data_im);
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
Dtype* data_col);
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
Dtype* data_im);
template <typename Dtype>
void im2col_gpu(const Dtype* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
Dtype* data_col);
template <typename Dtype>
void col2im_gpu(const Dtype* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
Dtype* data_im);
} // namespace caffe
#endif // CAFFE_UTIL_IM2COL_HPP_
......@@ -68,6 +68,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
int kernel_h_, kernel_w_;
int stride_h_, stride_w_;
int dilation_h_, dilation_w_;
int num_;
int channels_;
int pad_h_, pad_w_;
......@@ -82,20 +83,20 @@ class BaseConvolutionLayer : public Layer<Dtype> {
// wrap im2col/col2im so we don't have to remember the (long) argument lists
inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, dilation_h_, dilation_w_, col_buff);
}
inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, dilation_h_, dilation_w_, data);
}
#ifndef CPU_ONLY
inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, dilation_h_, dilation_w_, col_buff);
}
inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, dilation_h_, dilation_w_, data);
}
#endif
......
......@@ -49,6 +49,12 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
stride_h_ = conv_param.stride_h();
stride_w_ = conv_param.stride_w();
}
if (!conv_param.has_dilation_h()) {
dilation_h_ = dilation_w_ = conv_param.dilation();
} else {
dilation_h_ = conv_param.dilation_h();
dilation_w_ = conv_param.dilation_w();
}
// Special case: im2col is the identity for 1x1 convolution with stride 1
// and no padding, so flag for skipping the buffer and transformation.
is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1
......
......@@ -10,9 +10,9 @@ namespace caffe {
template <typename Dtype>
void ConvolutionLayer<Dtype>::compute_output_shape() {
this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_)
this->height_out_ = (this->height_ + 2 * this->pad_h_ - (this->dilation_h_ * (this->kernel_h_ - 1) + 1))
/ this->stride_h_ + 1;
this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_)
this->width_out_ = (this->width_ + 2 * this->pad_w_ - (this->dilation_w_ * (this->kernel_w_ - 1) + 1))
/ this->stride_w_ + 1;
}
......
......@@ -502,6 +502,9 @@ message ConvolutionParameter {
optional uint32 stride = 6 [default = 1]; // The stride (equal in Y, X)
optional uint32 stride_h = 13; // The stride height
optional uint32 stride_w = 14; // The stride width
optional uint32 dilation = 16 [default = 1]; // The dilation (equal in Y, X)
optional uint32 dilation_h = 17; // The dilation height
optional uint32 dilation_w = 18; // The dilation width
optional FillerParameter weight_filler = 7; // The filler for the weight
optional FillerParameter bias_filler = 8; // The filler for the bias
enum Engine {
......
......@@ -19,6 +19,7 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
Dtype* data_col);
......@@ -104,7 +105,7 @@ TYPED_TEST(Im2colKernelTest, TestGPU) {
im2col_gpu_kernel<TypeParam><<<grid_dim, CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, bottom_data + this->blob_bottom_->offset(n),
this->height_, this->width_, this->kernel_size_, this->kernel_size_,
this->pad_, this->pad_, this->stride_, this->stride_,
this->pad_, this->pad_, this->stride_, this->stride_, 1, 1,
this->height_col_, this->width_col_,
top_data + this->blob_top_->offset(n));
CUDA_POST_KERNEL_CHECK;
......
......@@ -7,33 +7,73 @@
namespace caffe {
inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
return static_cast<unsigned>(a) < static_cast<unsigned>(b);
}
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
Dtype* data_col) {
im2col_cpu(data_im, channels, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, 1, 1, data_col);
}
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
Dtype* data_col) {
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h) {
for (int w = 0; w < width_col; ++w) {
int h_pad = h * stride_h - pad_h + h_offset;
int w_pad = w * stride_w - pad_w + w_offset;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
const int output_h = (height + 2 * pad_h -
(dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int output_w = (width + 2 * pad_w -
(dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
const int channel_size = height * width;
for (int channel = channels; channel--; data_im += channel_size) {
for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_row = -pad_h + kernel_row * dilation_h;
for (int output_rows = output_h; output_rows; output_rows--) {
if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
for (int output_cols = output_w; output_cols; output_cols--) {
*(data_col++) = 0;
}
} else {
int input_col = -pad_w + kernel_col * dilation_w;
for (int output_col = output_w; output_col; output_col--) {
if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
*(data_col++) = data_im[input_row * width + input_col];
} else {
*(data_col++) = 0;
}
input_col += stride_w;
}
}
input_row += stride_h;
}
}
}
}
}
// Explicit instantiation
template void im2col_cpu<float>(const float* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
float* data_col);
template void im2col_cpu<double>(const double* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
double* data_col);
// Explicit instantiation
template void im2col_cpu<float>(const float* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
......@@ -44,39 +84,76 @@ template void im2col_cpu<double>(const double* data_im, const int channels,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, double* data_col);
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
Dtype* data_im) {
col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, 1, 1, data_im);
}
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
Dtype* data_im) {
caffe_set(height * width * channels, Dtype(0), data_im);
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int channels_col = channels * patch_h * patch_w;
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % patch_w;
int h_offset = (c / patch_w) % patch_h;
int c_im = c / patch_h / patch_w;
for (int h = 0; h < height_col; ++h) {
for (int w = 0; w < width_col; ++w) {
int h_pad = h * stride_h - pad_h + h_offset;
int w_pad = w * stride_w - pad_w + w_offset;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
const int output_h = (height + 2 * pad_h -
(dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int output_w = (width + 2 * pad_w -
(dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
const int channel_size = height * width;
for (int channel = channels; channel--; data_im += channel_size) {
for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_row = -pad_h + kernel_row * dilation_h;
for (int output_rows = output_h; output_rows; output_rows--) {
if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
data_col += output_w;
} else {
int input_col = -pad_w + kernel_col * dilation_w;
for (int output_col = output_w; output_col; output_col--) {
if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
data_im[input_row * width + input_col] += *data_col;
}
data_col++;
input_col += stride_w;
}
}
input_row += stride_h;
}
}
}
}
}
// Explicit instantiation
template void col2im_cpu<float>(const float* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
float* data_im);
template void col2im_cpu<double>(const double* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
double* data_im);
// Explicit instantiation
template void col2im_cpu<float>(const float* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, float* data_im);
template void col2im_cpu<double>(const double* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, double* data_im);
......
......@@ -13,26 +13,28 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
Dtype* data_col) {
CUDA_KERNEL_LOOP(index, n) {
int w_out = index % width_col;
int h_index = index / width_col;
int h_out = h_index % height_col;
int channel_in = h_index / height_col;
int channel_out = channel_in * kernel_h * kernel_w;
int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
const int h_index = index / width_col;
const int h_col = h_index % height_col;
const int w_col = index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
Dtype* data_col_ptr = data_col;
data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const Dtype* data_im_ptr = data_im;
data_im_ptr += (channel_in * height + h_in) * width + w_in;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
int h = h_in + i;
int w = w_in + j;
*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
data_im_ptr[i * width + j] : 0;
int h_im = h_offset + i * dilation_h;
int w_im = w_offset + j * dilation_w;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
data_col_ptr += height_col * width_col;
}
}
......@@ -45,67 +47,89 @@ void im2col_gpu(const Dtype* data_im, const int channels,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
Dtype* data_col) {
im2col_gpu(data_im, channels, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, 1, 1, data_col);
}
template <typename Dtype>
void im2col_gpu(const Dtype* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
Dtype* data_col) {
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int height_col = (height + 2 * pad_h -
(dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w -
(dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col;
// NOLINT_NEXT_LINE(whitespace/operators)
im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
pad_w, stride_h, stride_w, height_col,
pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col,
width_col, data_col);
CUDA_POST_KERNEL_CHECK;
}
// Explicit instantiation
template void im2col_gpu<float>(const float* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
float* data_col);
const int dilation_h, const int dilation_w, float* data_col);
template void im2col_gpu<double>(const double* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, double* data_col);
// Explicit instantiation
template void im2col_gpu<float>(const float* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
float* data_col);
template void im2col_gpu<double>(const double* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
double* data_col);
template <typename Dtype>
__global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
const int height, const int width, const int channels,
const int patch_h, const int patch_w,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
Dtype* data_im) {
CUDA_KERNEL_LOOP(index, n) {
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
const int w_im = index % width + pad_w;
const int h_im = (index / width) % height + pad_h;
const int c_im = index / (width * height);
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
// compute the start and end of the output
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
/*
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out]
int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * ksize
+ (w - w_col * stride_w);
val += data_col[(c_col * height_col + h_col) * width_col + w_col];
}
}
*/
// equivalent implementation
int offset =
(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
const int w_col_start =
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start =
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
height_col + h_col) * width_col + w_col;
val += data_col[data_col_index];
}
}
}
data_im[index] = val;
......@@ -114,30 +138,54 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
template <typename Dtype>
void col2im_gpu(const Dtype* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, Dtype* data_im) {
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
col2im_gpu(data_col, channels, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, 1, 1, data_im);
}
template <typename Dtype>
void col2im_gpu(const Dtype* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
Dtype* data_im) {
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) /
stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) /
stride_w + 1;
int num_kernels = channels * height * width;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
// NOLINT_NEXT_LINE(whitespace/operators)
col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_col, height, width, channels, patch_h, patch_w,
pad_h, pad_w, stride_h, stride_w,
num_kernels, data_col, height, width, channels, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
height_col, width_col, data_im);
CUDA_POST_KERNEL_CHECK;
}
// Explicit instantiation
template void col2im_gpu<float>(const float* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
float* data_im);
template void col2im_gpu<double>(const double* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
double* data_im);
// Explicit instantiation
template void col2im_gpu<float>(const float* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, float* data_im);
template void col2im_gpu<double>(const double* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, double* data_im);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册