diff --git a/dnn/scripts/opr_param_defs.py b/dnn/scripts/opr_param_defs.py
index 264b005f6cf1a5aa84526d40ed1890991ea8a67b..d4ad3710f3299c91d733c1195e07c00916a54bf4 100755
--- a/dnn/scripts/opr_param_defs.py
+++ b/dnn/scripts/opr_param_defs.py
@@ -220,7 +220,7 @@ pdef('Axis').add_fields('int32', 'axis', 0)
 
 (pdef('Images2Neibs').
  add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 1, 'stride_w', 1,
-            'window_h', 3, 'window_w', 3))
+            'dilate_h', 1, 'dilate_w', 1, 'window_h', 3, 'window_w', 3))
 
 (pdef('Pooling', version=0, is_legacy=True).
  add_enum(
diff --git a/dnn/src/common/images2neibs.cpp b/dnn/src/common/images2neibs.cpp
index 5345f9e7115840962e863d475e5bdb982eef9459..24dff13cecbc8a842e324a8920ac8a527230f3d4 100644
--- a/dnn/src/common/images2neibs.cpp
+++ b/dnn/src/common/images2neibs.cpp
@@ -23,6 +23,8 @@ void Images2NeibsBase::deduce_layout_fwd(const TensorLayout &src,
                 "pad_w=" + std::to_string(param().pad_w) + ", " +
                 "stride_h=" + std::to_string(param().stride_h) + ", " +
                 "stride_w=" + std::to_string(param().stride_w) + ", " +
+                "dilate_h=" + std::to_string(param().dilate_h) + ", " +
+                "dilate_w=" + std::to_string(param().dilate_w) + ", " +
                 "window_h=" + std::to_string(param().window_h) + ", " +
                 "window_w=" + std::to_string(param().window_w);
     };
@@ -34,11 +36,13 @@ void Images2NeibsBase::deduce_layout_fwd(const TensorLayout &src,
     size_t pw = this->param().pad_w;
     size_t sh = this->param().stride_h;
     size_t sw = this->param().stride_w;
+    size_t dh = this->param().dilate_h;
+    size_t dw = this->param().dilate_w;
     size_t wh = this->param().window_h;
     size_t ww = this->param().window_w;
     size_t oh, ow;
-    infer_conv_shape2d(ih, iw, wh, ww, sh, sw, ph, pw, oh, ow);
+    infer_conv_shape2d(ih, iw, wh + (wh-1)*(dh-1), ww + (ww-1)*(dw-1), sh, sw, ph, pw, oh, ow);
     dst = TensorLayout(TensorShape({n, ic, oh, ow, wh, ww}), src.dtype);
 }
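The shape deduction above encodes the standard dilated-window rule: a window of w taps with dilation d spans an effective extent of d*(w-1)+1 input pixels, which is exactly the `wh + (wh-1)*(dh-1)` expression handed to `infer_conv_shape2d`. A minimal standalone sketch of the resulting output shape, in plain Python with a hypothetical helper name:

    def images2neibs_out_shape(ih, iw, ph, pw, sh, sw, dh, dw, wh, ww):
        # effective extent of a dilated window: dh*(wh-1)+1 == wh + (wh-1)*(dh-1)
        eff_h = dh * (wh - 1) + 1
        eff_w = dw * (ww - 1) + 1
        oh = (ih + 2 * ph - eff_h) // sh + 1
        ow = (iw + 2 * pw - eff_w) // sw + 1
        return oh, ow

    # the dilated case from the naive test below: 6x7 input, pad 1, stride 2,
    # dilation 2, window 3 -> 2x3 output positions
    assert images2neibs_out_shape(6, 7, 1, 1, 2, 2, 2, 2, 3, 3) == (2, 3)
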
diff --git a/dnn/src/cuda/images2neibs/kernel.cu b/dnn/src/cuda/images2neibs/kernel.cu
index c972548179ccf1bc0cae40d4f92acb4c3dfd743f..e8d150d2943cc197a6b4a992fec947e42a5290ca 100644
--- a/dnn/src/cuda/images2neibs/kernel.cu
+++ b/dnn/src/cuda/images2neibs/kernel.cu
@@ -24,7 +24,7 @@ namespace images2neibs {
 template <typename T>
 __global__ void forward_kernel(const T *src, T *dst,
                                int N, int C, int IH, int IW, int OH, int OW,
-                               int ph, int pw, int sh, int sw, int WH, int WW)
+                               int ph, int pw, int sh, int sw, int dh, int dw, int WH, int WW)
 {
     int NC = N * C;
     int WP = WH*WW;
@@ -37,8 +37,8 @@ __global__ void forward_kernel(const T *src, T *dst,
     if (op < OH * OW) {
         int oh = op / OW;
         int ow = op % OW;
-        int ih = -ph + sh * oh + wh;
-        int iw = -pw + sw * ow + ww;
+        int ih = -ph + sh * oh + wh * dh;
+        int iw = -pw + sw * ow + ww * dw;
         int dst_pos = nc * OH * OW * WH * WW + op * WH * WW + wp;
         int src_pos = nc * IH * IW + ih * IW + iw;
         dst[dst_pos] = (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
@@ -52,7 +52,7 @@ __global__ void forward_kernel(const T *src, T *dst,
 template <typename T>
 void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW,
-             int ph, int pw, int sh, int sw, int wh, int ww,
+             int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww,
              cudaStream_t stream) {
     int spatial_size = OH * OW;
     int kernel_size = wh * ww;
@@ -63,7 +63,7 @@ void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW,
     int by = N * C;
     forward_kernel<T><<<dim3(bx, by), dim3(tx, ty), 0,
-                        stream>>>(src, dst, N, C, IH, IW, OH, OW, ph, pw, sh, sw,
+                        stream>>>(src, dst, N, C, IH, IW, OH, OW, ph, pw, sh, sw, dh, dw,
                                   wh, ww);
     after_kernel_launch();
 }
@@ -73,7 +73,7 @@ void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW,
 
 template <typename T>
 __global__ void backward_kernel(const T *diff, T *grad,
                                 int N, int C, int IH, int IW, int OH, int OW,
-                                int ph, int pw, int sh, int sw, int WH, int WW)
+                                int ph, int pw, int sh, int sw, int dh, int dw, int WH, int WW)
 {
     int id = threadIdx.x + blockIdx.x * blockDim.x;
     if (id < N*C*IH*IW) {
@@ -82,17 +82,20 @@ __global__ void backward_kernel(const T *diff, T *grad,
         int iw = id % (IH*IW) % IW;
         grad[nc*IH*IW + ih*IW + iw] = 0.0f;
         int oh_max = min((ih+ph) / sh, OH-1);
-        int oh_min = max((ih+ph-(WH-1)+sh-1) / sh, 0);
+        int oh_min = max((ih+ph-(WH-1)*dh+sh-1) / sh, 0);
         int ow_max = min((iw+pw) / sw, OW-1);
-        int ow_min = max((iw+pw-(WW-1)+sw-1) / sw, 0);
+        int ow_min = max((iw+pw-(WW-1)*dw+sw-1) / sw, 0);
         for (int oh = oh_min; oh <= oh_max; ++oh)
             for (int ow = ow_min; ow <= ow_max; ++ow) {
-                int wh = ih+ph - sh*oh;
-                int ww = iw+pw - sw*ow;
-                grad[nc*IH*IW + ih*IW + iw] +=
-                    diff[nc*OH*OW*WH*WW + oh*OW*WH*WW + ow*WH*WW +
-                         wh*WW + ww];
+                if ((ih+ph - sh*oh) % dh == 0 && (iw+pw - sw*ow) % dw == 0) {
+                    int wh = ih+ph - sh*oh - (ih+ph - sh*oh) / dh * (dh-1);
+                    int ww = iw+pw - sw*ow - (iw+pw - sw*ow) / dw * (dw-1);
+                    grad[nc*IH*IW + ih*IW + iw] +=
+                        diff[nc*OH*OW*WH*WW + oh*OW*WH*WW + ow*WH*WW +
+                             wh*WW + ww];
+                }
             }
     }
 }
@@ -100,23 +103,23 @@ __global__ void backward_kernel(const T *diff, T *grad,
 template <typename T>
 void backward(const T *diff, T *grad,
               int N, int C, int IH, int IW, int OH, int OW,
-              int ph, int pw, int sh, int sw, int wh, int ww,
+              int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww,
               cudaStream_t stream) {
     int threads = NR_THREADS;
     int blocks = DIVUP(N*C*IH*IW, threads);
     backward_kernel<T><<<blocks, threads, 0, stream>>>(diff, grad,
                                                        N, C, IH, IW, OH, OW,
-                                                       ph, pw, sh, sw, wh, ww);
+                                                       ph, pw, sh, sw, dh, dw, wh, ww);
     after_kernel_launch();
 }
 
 #define INST(T) \
     template void forward(const T *, T *, int, int, int, int, int, int, \
-                          int, int, int, int, int, int, \
+                          int, int, int, int, int, int, int, int, \
                           cudaStream_t); \
     template void backward(const T *, T *, int, int, int, int, int, int, \
-                           int, int, int, int, int, int, \
+                           int, int, int, int, int, int, int, int, \
                            cudaStream_t);
 
 #define cb(DType) \
     INST(DTypeTrait<DType>::ctype)
diff --git a/dnn/src/cuda/images2neibs/kernel.cuh b/dnn/src/cuda/images2neibs/kernel.cuh
index a25ff996096ce6e3c58b8518e0d58b3e2f56f7e8..acef54164cb8b75009d7c92afea8d4f9f4b0e0e5 100644
--- a/dnn/src/cuda/images2neibs/kernel.cuh
+++ b/dnn/src/cuda/images2neibs/kernel.cuh
@@ -18,13 +18,13 @@ namespace images2neibs {
 template <typename T>
 void forward(const T *src, T *dst,
              int N, int C, int IH, int IW, int OH, int OW,
-             int ph, int pw, int sh, int sw, int wh, int ww,
+             int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww,
              cudaStream_t stream);
 
 template <typename T>
 void backward(const T *diff, T *grad,
               int N, int C, int IH, int IW, int OH, int OW,
-              int ph, int pw, int sh, int sw, int wh, int ww,
+              int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww,
               cudaStream_t stream);
 
 } // namespace images2neibs
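The guarded accumulation in `backward_kernel` inverts the forward index map `ih = -ph + sh*oh + wh*dh`: for a fixed output position, the residual `t = ih+ph - sh*oh` names a window tap only when it is a multiple of `dh`, and `t - t/dh*(dh-1)` then equals `t/dh`. A plain-Python identity check, illustrative only and not part of the patch:

    # forward maps window tap wh to input offset t = wh*dh; backward inverts this
    for dh in (1, 2, 3):
        for wh in range(4):
            t = wh * dh
            assert t % dh == 0                              # the guard in backward_kernel
            assert t - t // dh * (dh - 1) == t // dh == wh  # the tap-recovery expression
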
diff --git a/dnn/src/cuda/images2neibs/opr_impl.cpp b/dnn/src/cuda/images2neibs/opr_impl.cpp
index dd642191ce54717c0507d0889a806a980f4c9194..4761fd4edb786ac5eaccd86f2e286005398b0a5a 100644
--- a/dnn/src/cuda/images2neibs/opr_impl.cpp
+++ b/dnn/src/cuda/images2neibs/opr_impl.cpp
@@ -27,13 +27,14 @@ void Images2NeibsForwardImpl::exec(_megdnn_tensor_in src,
     int OH = dst.layout[2], OW = dst.layout[3];
     int ph = param().pad_h, pw = param().pad_w;
     int sh = param().stride_h, sw = param().stride_w;
+    int dh = param().dilate_h, dw = param().dilate_w;
     int wh = param().window_h, ww = param().window_w;
 #define cb(DType) \
     if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) { \
         using T = DTypeTrait<DType>::ctype; \
         images2neibs::forward(src.ptr<T>(), dst.ptr<T>(), \
                               N, C, IH, IW, OH, OW, \
-                              ph, pw, sh, sw, wh, ww, \
+                              ph, pw, sh, sw, dh, dw, wh, ww, \
                               stream); \
         return; \
     }
@@ -53,13 +54,14 @@ void Images2NeibsBackwardImpl::exec(_megdnn_tensor_in diff,
     int OH = diff.layout[2], OW = diff.layout[3];
     int ph = param().pad_h, pw = param().pad_w;
     int sh = param().stride_h, sw = param().stride_w;
+    int dh = param().dilate_h, dw = param().dilate_w;
     int wh = param().window_h, ww = param().window_w;
 #define cb(DType) \
     if (diff.layout.dtype == DType()) { \
         using T = DTypeTrait<DType>::ctype; \
         images2neibs::backward(diff.ptr<T>(), grad.ptr<T>(), \
                                N, C, IH, IW, OH, OW, \
-                               ph, pw, sh, sw, wh, ww, \
+                               ph, pw, sh, sw, dh, dw, wh, ww, \
                                stream); \
         return; \
     }
diff --git a/dnn/src/naive/images2neibs/opr_impl.cpp b/dnn/src/naive/images2neibs/opr_impl.cpp
index 20d7794143db4102d476ea41d531aff102879bb4..208e962364fc13bc2fc41317c1455e598571de5c 100644
--- a/dnn/src/naive/images2neibs/opr_impl.cpp
+++ b/dnn/src/naive/images2neibs/opr_impl.cpp
@@ -33,20 +33,25 @@ void Images2NeibsForwardImpl::exec_internal(_megdnn_tensor_in src,
     int pad_w = static_cast<int>(param().pad_w);
     int stride_h = static_cast<int>(param().stride_h);
     int stride_w = static_cast<int>(param().stride_w);
+    int dilate_h = static_cast<int>(param().dilate_h);
+    int dilate_w = static_cast<int>(param().dilate_w);
+    int equ_window_h = dilate_h * (window_h-1) + 1;
+    int equ_window_w = dilate_w * (window_w-1) + 1;
     for (int n = 0; n < N; ++n)
     for (int c = 0; c < C; ++c) {
         int ih = -pad_h;
-        for (; ih+window_h <= IH+pad_h; ih += stride_h) {
+        for (; ih+equ_window_h <= IH+pad_h; ih += stride_h) {
             int iw = -pad_w;
-            for (; iw+window_w <= IW+pad_w; iw += stride_w) {
+            for (; iw+equ_window_w <= IW+pad_w; iw += stride_w) {
                 for (int kh = 0; kh < window_h; ++kh)
                 for (int kw = 0; kw < window_w; ++kw) {
+                    int ih2 = ih+dilate_h*kh, iw2 = iw+dilate_w*kw;
                     dptr[idx*window_h*window_w + kh*window_w + kw] =
-                        (ih+kh) >= 0 && (ih+kh) < IH &&
-                        (iw+kw) >= 0 && (iw+kw) < IW ?
-                        sptr[n*C*IH*IW + c*IH*IW + (ih+kh)*IW + (iw+kw)] : 0.0f;
+                        ih2 >= 0 && ih2 < IH &&
+                        iw2 >= 0 && iw2 < IW ?
+                        sptr[n*C*IH*IW + c*IH*IW + ih2*IW + iw2] : 0.0f;
                 }
                 ++idx;
             }
@@ -86,18 +91,22 @@ void Images2NeibsBackwardImpl::exec_internal(_megdnn_tensor_in diff,
     int pad_w = static_cast<int>(param().pad_w);
     int stride_h = static_cast<int>(param().stride_h);
     int stride_w = static_cast<int>(param().stride_w);
+    int dilate_h = static_cast<int>(param().dilate_h);
+    int dilate_w = static_cast<int>(param().dilate_w);
+    int equ_window_h = dilate_h * (window_h-1) + 1;
+    int equ_window_w = dilate_w * (window_w-1) + 1;
     memset(sptr, 0, sizeof(T) * N*C*IH*IW);
     for (int n = 0; n < N; ++n)
     for (int c = 0; c < C; ++c) {
         int ih = -pad_h;
-        for (; ih+window_h <= IH+pad_h; ih += stride_h) {
+        for (; ih+equ_window_h <= IH+pad_h; ih += stride_h) {
             int iw = -pad_w;
-            for (; iw+window_w <= IW+pad_w; iw += stride_w) {
+            for (; iw+equ_window_w <= IW+pad_w; iw += stride_w) {
                 for (int kh = 0; kh < window_h; ++kh)
                 for (int kw = 0; kw < window_w; ++kw) {
-                    int ih2 = ih+kh, iw2 = iw+kw;
+                    int ih2 = ih+dilate_h*kh, iw2 = iw+dilate_w*kw;
                     if (ih2 >= 0 && ih2 < IH && iw2 >= 0 && iw2 < IW) {
                         sptr[n*C*IH*IW + c*IH*IW + ih2*IW + iw2] +=
                             dptr[idx*window_h*window_w + kh*window_w + kw];
diff --git a/dnn/test/common/images2neibs.h b/dnn/test/common/images2neibs.h
index 026016512a8d7b7bbb0a2c48ca0bf99e19e38756..086314e196999d2fd16819f9f9d542533950671b 100644
--- a/dnn/test/common/images2neibs.h
+++ b/dnn/test/common/images2neibs.h
@@ -31,17 +31,19 @@ inline std::vector<TestArg> get_args() {
     for (uint32_t pw : {0, 1})
     for (uint32_t sh : {1, 2})
    for (uint32_t sw : {1, 2})
+    for (uint32_t dh : {1, 2, 3})
+    for (uint32_t dw : {1, 2, 3})
     for (uint32_t wh : {3, 4})
     for (uint32_t ww : {3, 4}) {
-        args.emplace_back(param::Images2Neibs{ph, pw, sh, sw, wh, ww},
-                          TensorShape{2, 3, 5, 6});
+        args.emplace_back(param::Images2Neibs{ph, pw, sh, sw, dh, dw, wh, ww},
+                          TensorShape{2, 3, 19, 20});
     }
     // clang-format on
     // large window case
-    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 32, 64},
+    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1, 32, 64},
                       TensorShape{2, 3, 96, 128});
     // large size
-    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1},
+    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1, 1, 1},
                       TensorShape{128, 128, 28, 24});
 
     return args;
@@ -54,17 +56,19 @@ inline std::vector<TestArg> get_benchmark_args() {
     for (uint32_t pw : {0, 1})
     for (uint32_t sh : {1, 2})
     for (uint32_t sw : {1, 2})
+    for (uint32_t dh : {1, 2})
+    for (uint32_t dw : {1, 2})
     for (uint32_t wh : {3, 4})
     for (uint32_t ww : {3, 4})
     for (uint32_t b : {1, 64})
     for (uint32_t c : {64, 128})
     for (uint32_t hw : {64, 128}) {
-        args.emplace_back(param::Images2Neibs{ph, pw, sh, sw, wh, ww},
+        args.emplace_back(param::Images2Neibs{ph, pw, sh, sw, dh, dw, wh, ww},
                           TensorShape{b, c, hw, hw});
     }
     // clang-format on
     // large size
-    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1},
+    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1, 1, 1},
                       TensorShape{1024, 128, 28, 24});
 
     return args;
diff --git a/dnn/test/naive/images2neibs.cpp b/dnn/test/naive/images2neibs.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1c011bd7013988b910227254686e9caada8393f0
--- /dev/null
+++ b/dnn/test/naive/images2neibs.cpp
@@ -0,0 +1,59 @@
+/**
+ * \file dnn/test/naive/images2neibs.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "test/naive/fixture.h"
+
+#include "megdnn/oprs/nn.h"
+#include "test/common/checker.h"
+
+using namespace megdnn;
+using namespace test;
+
+TEST_F(NAIVE, IMAGES2NEIBS_FORWARD) {
+    Checker<Images2Neibs> checker(handle(), /* check_dispatch */ false);
+
+    Images2Neibs::Param param(0, 0, 1, 1, 1, 1, 2, 2);
+    checker.set_param(param).exect(
+            Testcase{TensorValue({1, 1, 3, 3}, dtype::Uint8(),
+                                 {0, 1, 2,
+                                  3, 4, 5,
+                                  6, 7, 8}), {}},
+            Testcase{{},
+                     TensorValue({1, 1, 2, 2, 2, 2}, dtype::Uint8(),
+                                 {0, 1, 3, 4,
+                                  1, 2, 4, 5,
+                                  3, 4, 6, 7,
+                                  4, 5, 7, 8})});
+
+    param.pad_h = 1;
+    param.pad_w = 1;
+    param.stride_h = 2;
+    param.stride_w = 2;
+    param.dilate_h = 2;
+    param.dilate_w = 2;
+    param.window_h = 3;
+    param.window_w = 3;
+    checker.set_param(param).exect(
+            Testcase{TensorValue({1, 1, 6, 7}, dtype::Uint8(),
+                                 {0, 1, 2, 3, 4, 5, 6,
+                                  7, 8, 9, 10, 11, 12, 13,
+                                  14, 15, 16, 17, 18, 19, 20,
+                                  21, 22, 23, 24, 25, 26, 27,
+                                  28, 29, 30, 31, 32, 33, 34,
+                                  35, 36, 37, 38, 39, 40, 41}), {}},
+            Testcase{{},
+                     TensorValue({1, 1, 2, 3, 3, 3}, dtype::Uint8(),
+                                 {0, 0, 0, 0, 8, 10, 0, 22, 24,
+                                  0, 0, 0, 8, 10, 12, 22, 24, 26,
+                                  0, 0, 0, 10, 12, 0, 24, 26, 0,
+                                  0, 8, 10, 0, 22, 24, 0, 36, 38,
+                                  8, 10, 12, 22, 24, 26, 36, 38, 40,
+                                  10, 12, 0, 24, 26, 0, 38, 40, 0})});
+}
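Where those expected values come from: zero-pad the 6x7 ramp by one pixel, then gather 3x3 windows whose taps sit 2 apart (dilation) from origins 2 apart (stride). A standalone NumPy re-derivation, not part of the test suite:

    import numpy as np

    src = np.arange(42).reshape(6, 7)
    padded = np.zeros((8, 9), dtype=src.dtype)
    padded[1:7, 1:8] = src                        # pad_h = pad_w = 1
    out = np.empty((2, 3, 3, 3), dtype=src.dtype)
    for oh in range(2):
        for ow in range(3):
            # stride 2 picks the window origin; step 2 in the slice is the dilation
            out[oh, ow] = padded[2 * oh : 2 * oh + 5 : 2, 2 * ow : 2 * ow + 5 : 2]
    print(out.reshape(6, 9))  # rows match the TensorValue({1, 1, 2, 3, 3, 3}, ...) payload
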
+ """ + padding_h, padding_w = _pair(padding) + stride_h, stride_w = _pair_nonzero(stride) + dilation_h, dilation_w = _pair_nonzero(dilation) + window_h, window_w = _pair_nonzero(kernel_size) + + op = builtin.Images2Neibs( + pad_h=padding_h, + pad_w=padding_w, + stride_h=stride_h, + stride_w=stride_w, + dilate_h=dilation_h, + dilate_w=dilation_w, + window_h=window_h, + window_w=window_w, + ) + (output,) = apply(op, inp) + return output + + interpolate = deprecated_func("1.3", "megengine.functional.vision", "interpolate", True) roi_pooling = deprecated_func("1.3", "megengine.functional.vision", "roi_pooling", True) roi_align = deprecated_func("1.3", "megengine.functional.vision", "roi_align", True) diff --git a/imperative/python/megengine/module/__init__.py b/imperative/python/megengine/module/__init__.py index bdffc89dd03100ef24a0f3fbf53d005494bb4854..b178fe2a9f08742874510b2816d5968efc1576f4 100644 --- a/imperative/python/megengine/module/__init__.py +++ b/imperative/python/megengine/module/__init__.py @@ -34,3 +34,4 @@ from .normalization import GroupNorm, InstanceNorm, LayerNorm from .pooling import AvgPool2d, MaxPool2d from .quant_dequant import DequantStub, QuantStub from .sequential import Sequential +from .sliding_window import SlidingWindow diff --git a/imperative/python/megengine/module/sliding_window.py b/imperative/python/megengine/module/sliding_window.py new file mode 100644 index 0000000000000000000000000000000000000000..c7da29b3d72e01c3a0eaaf8e405f7f1c5dcc0a31 --- /dev/null +++ b/imperative/python/megengine/module/sliding_window.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Tuple, Union + +from ..functional import sliding_window +from .module import Module + + +class SlidingWindow(Module): + r""" + Apply a sliding window to input tensor and copy content in the window to + corresponding output location. Assume input shape is :math:`(N, C, IH, IW)`, + then output shape would be :math:`(N, C, OH, OW, window_h, window_w)` where + :math:`(OH, OW)` would be computed from padding, stride, window and + :math:`(IH, IW)`, as in convolution. For each output location, we have; + + .. math:: + + out_{n, c, oh, ow, wh, ww} &= src_{n, c, ih+wh, iw+ww} \\ + \text{where } & ih=-pad_h+oh \times stride_h + (wh-1) \times (dilation_h-1) \\ + & iw=-pad_w+ow \times stride_w + (ww-1) \times (dilation_w-1) + + + :param kernel_size: the size of the window to take a max over. + :param padding: implicit zero padding to be added on both sides. Default: 0 + :param stride: the stride of the window. Default: 1 + :param dilation: the dilation of the window. Default: 1 + + Example: + + .. testcode:: + + from megengine import tensor + import megengine.module as M + import numpy as np + + inp = tensor(np.arange(30).reshape(1,1,5,6)) + op = M.SlidingWindow(kernel_size=3, padding=1, stride=2, dilation=2) + out = op(inp) + print(out.numpy()) + + Outputs: + + .. 
diff --git a/imperative/python/test/unit/functional/test_functional.py b/imperative/python/test/unit/functional/test_functional.py
index c96865358369ab1c6dcb6b6b8a79c30529e14fac..46486bfd09965d9972530a7287ed1b9895e1ba9f 100644
--- a/imperative/python/test/unit/functional/test_functional.py
+++ b/imperative/python/test/unit/functional/test_functional.py
@@ -927,3 +927,28 @@ def test_neg_axis():
     y = F.argmin(x, axis=(-1, -2))
     yy = F.argmin(x, axis=(0, 1))
     np.testing.assert_equal(y.numpy(), yy.numpy())
+
+
+def test_sliding_window():
+    N, C, H, W = 2, 3, 7, 8
+    inp = np.random.normal(size=(N, C, H, W))
+    ph, pw = 1, 2
+    sh, sw = 2, 1
+    wh, ww = 3, 2
+    dh, dw = 1, 3
+    s = lambda i, p, s, d, w: (i + p * 2 - (w - 1) * d - 1) // s + 1
+    inp_pad = np.zeros((N, C, H + ph * 2, W + pw * 2))
+    inp_pad[:, :, ph : H + ph, pw : W + pw] = inp
+    gt_out = np.empty(
+        (N, C, s(H, ph, sh, dh, wh), s(W, pw, sw, dw, ww), wh, ww), dtype=np.float32
+    )
+    for n, c, oh, ow in itertools.product(*map(range, gt_out.shape[:4])):
+        ih, iw = oh * sh, ow * sw
+        gt_out[n, c, oh, ow, :] = inp_pad[
+            n, c, ih : ih + (wh - 1) * dh + 1 : dh, iw : iw + (ww - 1) * dw + 1 : dw
+        ]
+
+    out = F.sliding_window(
+        tensor(inp), (wh, ww), padding=(ph, pw), stride=(sh, sw), dilation=(dh, dw)
+    )
+    np.testing.assert_equal(gt_out, out.numpy())
diff --git a/imperative/src/impl/ops/specializations.cpp b/imperative/src/impl/ops/specializations.cpp
index bde440cce06b0248d5a8aec39a28cdd096dda7a4..13ebcced7f6d50c6b22f0091aea6adfb1fc87248 100644
--- a/imperative/src/impl/ops/specializations.cpp
+++ b/imperative/src/impl/ops/specializations.cpp
@@ -32,6 +32,7 @@
 #include "megbrain/opr/tensor_gen.h"
 #include "megbrain/opr/tensor_manip.h"
 #include "megbrain/opr/utility.h"
+#include "megbrain/opr/dnn/images2neibs.h"
 
 #include "../op_trait.h"
 
@@ -652,4 +653,17 @@ OP_TRAIT_REG(SVD, SVD)
     .fallback();
 }} // svd
 
+namespace { namespace images2neibs {
+auto apply_on_var_node(
+        const OpDef& def,
+        const VarNodeArray& inputs) {
+    auto&& op = static_cast<const Images2Neibs&>(def);
+    OperatorNodeConfig config{op.make_name()};
+    return opr::Images2Neibs::make(inputs[0], op.param(), config);
+}
+OP_TRAIT_REG(Images2Neibs, Images2Neibs)
+    .apply_on_var_node(apply_on_var_node)
+    .fallback();
+}} // images2neibs
+
 } // namespace mgb::imperative
diff --git a/src/core/include/megbrain/ir/ops.td b/src/core/include/megbrain/ir/ops.td
index 164510d24adc9ca8c82af25c2ce9b3ba27e51056..3a3fa7189526ee57628d210dd7b3260deeb7fe11 100644
--- a/src/core/include/megbrain/ir/ops.td
+++ b/src/core/include/megbrain/ir/ops.td
@@ -79,6 +79,8 @@ def BatchConvBias : MgbHashableOp<"BatchConvBias", [BatchConvBiasParam, Executio
 );
 }
 
+def Images2Neibs : MgbHashableOp<"Images2Neibs", [Images2NeibsParam]>;
+
 def BatchNorm : MgbHashableOp<"BatchNorm", [BNParam]>;
 
 def ROIAlign: MgbHashableOp<"ROIAlign", [ROIAlignParam]>;
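Since both the forward and backward kernels change, a quick autodiff smoke test rounds things out. A sketch assuming a build with this patch; `GradManager` is MegEngine's standard autodiff entry point:

    import numpy as np
    import megengine.functional as F
    from megengine import tensor
    from megengine.autodiff import GradManager

    x = tensor(np.random.randn(1, 1, 5, 6).astype("float32"))
    gm = GradManager().attach([x])
    with gm:
        y = F.sliding_window(x, kernel_size=3, padding=1, stride=2, dilation=2)
        gm.backward(y.sum())
    # each entry counts how many dilated windows that input pixel falls into
    print(x.grad.numpy())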