From dc6f89f2d37254175fa09db4a5bd1b409ddba638 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Tue, 31 Mar 2020 16:23:51 +0800
Subject: [PATCH] refactor(dnn): refactor winograd strategy helper

GitOrigin-RevId: ecc2b15df995a526688d3a5593d8db6767c0c717
---
 dnn/src/common/winograd/winograd_helper.cpp   | 801 ++++++------------
 dnn/src/common/winograd/winograd_helper.h     |  48 +-
 .../fallback/conv_bias/winograd/strategy.cpp  |  79 +-
 .../fallback/conv_bias/winograd/strategy.h    |   1 +
 .../winograd_filter_preprocess/opr_impl.cpp   |  53 +-
 5 files changed, 339 insertions(+), 643 deletions(-)
diff --git a/dnn/src/common/winograd/winograd_helper.cpp b/dnn/src/common/winograd/winograd_helper.cpp
index 6f1dcdd56..767bdda4c 100644
--- a/dnn/src/common/winograd/winograd_helper.cpp
+++ b/dnn/src/common/winograd/winograd_helper.cpp
@@ -58,368 +58,300 @@ struct OutputGetter<
         return dtype.param<dtype::Quantized8Asymm>().quantize(item).as_uint8();
     }
 };
-
 }  // namespace
 
 namespace megdnn {
 namespace winograd {
 
-template <typename ctype, typename dst_type, typename input_filter_compute_type,
-          typename output_compute_type>
-class StrategyHelper<ctype, dst_type, input_filter_compute_type,
-                     output_compute_type, param::MatrixMul::Format::DEFAULT> {
-public:
-    static void filter(const ctype* filter,
-                       input_filter_compute_type* filter_transform_buf,
-                       input_filter_compute_type* transform_mid_buf, size_t OC,
-                       size_t IC, size_t oc_start, size_t oc_end, size_t m,
-                       size_t r, const std::vector<float>& interp_points,
-                       DType dtype, float rescale) {
-        size_t alpha = m + r - 1;
-        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
-                                                                interp_points);
-
-        input_filter_compute_type* mid_buf1 = transform_mid_buf;
-        input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
-
-        Getter<ctype, input_filter_compute_type> getter(dtype);
-        for (size_t oc = oc_start; oc < oc_end; oc++) {
-            rep(ic, IC) {
-                const ctype* filter_ptr = filter + (oc * IC + ic) * r * r;
-                rep(i, r) rep(j, r) {
-                    mid_buf1[i * r + j] = getter(filter_ptr[i * r + j]);
-                }
+// Pack size of a ConvBias format: 4, 8 or 32 for the listed formats, else 1.
+constexpr size_t layout_pack_size(param::ConvBias::Format layout) {
+    switch (layout) {
+        case param::ConvBias::Format::NHWCD4:
+        case param::ConvBias::Format::NCHW4:
+            return 4;
+        case param::ConvBias::Format::NCHW32:
+            return 32;
+        case param::ConvBias::Format::NCHW88:
+        case param::ConvBias::Format::NCHW8:
+            return 8;
+        default:
+            return 1;
+    }
+}
+
+template <param::ConvBias::Format layout, param::MatrixMul::Format format>
+struct FilterVisitor {
+    size_t IC, OC;
+    FilterVisitor(size_t OC, size_t IC) : IC(IC), OC(OC) {}
+    size_t get(size_t r, size_t oc, size_t ic, size_t h, size_t w) {
+        constexpr size_t input_pack_size = layout_pack_size(layout);
+        size_t ocb_layout = oc / input_pack_size;
+        size_t oc_layout = oc % input_pack_size;
+        size_t icb_layout = ic / input_pack_size;
+        size_t ic_layout = ic % input_pack_size;
+        // offset, slowest to fastest: ocb, icb, (h, w), ic_layout, oc_layout
+        return (ocb_layout * (IC / input_pack_size) + icb_layout) * r * r *
+                       input_pack_size * input_pack_size +
+               ic_layout * input_pack_size + oc_layout +
+               (h * r + w) * input_pack_size * input_pack_size;
+    }
 
-                /* tmp = Matmul(G, src) */
-                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                                  input_filter_compute_type,
-                                                  false, false>(
-                        winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2,
-                        alpha, r, r, r, r, r, dtype, dtype);
-                /* dst = Matmul(tmp, G^T) */
-                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                                  input_filter_compute_type,
-                                                  false, true>(
-                        mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1,
-                        alpha, alpha, r, r, r, alpha, dtype, dtype);
-
-                rep(i, alpha) rep(j, alpha) {
-                    filter_transform_buf[(i * alpha + j) * OC * IC + ic * OC +
-                                         oc] = mid_buf1[i * alpha + j];
-                }
-            }
+    size_t put(size_t alpha, size_t oc, size_t ic, size_t h, size_t w) {
+        if (format == param::MatrixMul::Format::DEFAULT) {
+            return (h * alpha + w) * OC * IC + ic * OC + oc;
         }
+        size_t matmul_pack_size = MatrixMulForward::pack_size(format);
+        size_t ocb = oc / matmul_pack_size;
+        size_t oc_pack = oc % matmul_pack_size;
+        size_t icb = ic / matmul_pack_size;
+        size_t ic_pack = ic % matmul_pack_size;
+        // number of pack-sized blocks along OC and IC
+        size_t OCB = OC / matmul_pack_size;
+        size_t ICB = IC / matmul_pack_size;
+        // slowest to fastest: (h, w), ocb, icb, ic_pack, oc_pack
+        return (h * alpha + w) * OCB * ICB * matmul_pack_size *
+                       matmul_pack_size +
+               ocb * ICB * matmul_pack_size * matmul_pack_size +
+               icb * matmul_pack_size * matmul_pack_size +
+               ic_pack * matmul_pack_size + oc_pack;
     }
+};
 
-    static void input(const ctype* input,
-                      input_filter_compute_type* input_transform_buf,
-                      input_filter_compute_type* transform_mid_buf,
-                      int ih_start, int iw_start, size_t IH, size_t IW,
-                      size_t IC, size_t unit_idx, size_t nr_units_in_tile,
-                      size_t m, size_t r,
-                      const std::vector<float>& interp_points, DType dtype,
-                      float rescale) {
-        size_t alpha = m + r - 1;
-        Getter<ctype, input_filter_compute_type> getter(dtype);
-        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
-                                                                interp_points);
-        rep(ic, IC) {
-            input_filter_compute_type* mid_buf1 = transform_mid_buf;
-            input_filter_compute_type* mid_buf2 =
-                    transform_mid_buf + alpha * alpha;
+template <param::ConvBias::Format layout, param::MatrixMul::Format format>
+struct InputVisitor {
+    size_t IC;
+    InputVisitor(size_t IC) : IC(IC) {}
 
-            memset(mid_buf1, 0,
-                   alpha * alpha * sizeof(input_filter_compute_type));
-            rep(i, alpha) rep(j, alpha) {
-                int ih = ih_start + i;
-                int iw = iw_start + j;
-                if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) {
-                    mid_buf1[i * alpha + j] =
-                            getter(input[ic * IH * IW + ih * IW + iw]);
-                }
-            }
-            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                              input_filter_compute_type, true,
-                                              false>(
-                    winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha,
-                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
-            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                              input_filter_compute_type, false,
-                                              false>(
-                    mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha,
-                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
-            rep(i, alpha) rep(j, alpha) {
-                input_transform_buf[(i * alpha + j) * nr_units_in_tile * IC +
-                                    unit_idx * IC + ic] =
-                        mid_buf1[i * alpha + j];
-            }
-        }
+    size_t get(size_t alpha, size_t ic, size_t IH, size_t IW, size_t ih,
+               size_t iw) {
+        constexpr size_t input_pack_size = layout_pack_size(layout);
+        size_t icb_layout = ic / input_pack_size;
+        size_t ic_layout = ic % input_pack_size;
+        // slowest to fastest: icb_layout, ih, iw, ic_layout (alpha unused)
+        return (icb_layout * IH * IW + ih * IW + iw) * input_pack_size +
+               ic_layout;
     }
 
-    static void output(const output_compute_type* output_transform_buf,
-                       const output_compute_type* bias, dst_type* output,
-                       output_compute_type* transform_mid_buf, BiasMode bmode,
-                       NonlineMode nonline_mode, size_t oh_start,
-                       size_t ow_start, size_t OH, size_t OW, size_t oc_start,
-                       size_t oc_end, size_t unit_idx, size_t nr_units_in_tile,
-                       size_t m, size_t r,
-                       const std::vector<float>& interp_points, DType dtype,
-                       float input_filter_scale, float input_filter_rescale,
-                       float rescale) {
-        size_t alpha = m + r - 1;
-        size_t OC = oc_end - oc_start;
-
-        OutputGetter<output_compute_type, dst_type> getter(dtype);
-        winograd::WinogradCoeff<output_compute_type> winograd_coeff(
-                m, r, interp_points);
-        for (size_t oc = oc_start; oc < oc_end; oc++) {
-            size_t oc_index = oc - oc_start;
-            output_compute_type* mid_buf1 = transform_mid_buf;
-            output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
-
-            // gather
-            rep(i, alpha) rep(j, alpha) {
-                mid_buf1[i * alpha + j] =
-                        output_transform_buf[(i * alpha + j) *
-                                                     nr_units_in_tile * OC +
-                                             unit_idx * OC + oc_index];
-            }
-            /* A[alpha*m] M[alpha*alpha] */
-            megdnn::naive::run_matrix_mul_tpl<output_compute_type,
-                                              output_compute_type, true, false>(
-                    winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m,
-                    alpha, alpha, m, alpha, alpha, dtype, dtype);
-            megdnn::naive::run_matrix_mul_tpl<
-                    output_compute_type, output_compute_type, false, false>(
-                    mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m,
-                    alpha, alpha, m, m, dtype, dtype);
-            rep(i, m) rep(j, m) {
-                auto oh = oh_start + i;
-                auto ow = ow_start + j;
-                if (oh < OH && ow < OW) {
-                    float val = mid_buf1[i * m + j];
-                    if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
-                        val += bias[oc] * input_filter_rescale *
-                               input_filter_rescale;
-                    } else if (bmode == BiasMode::BIAS) {
-                        val += bias[oc * OH * OW + oh * OW + ow] *
-                               input_filter_rescale * input_filter_rescale;
-                    }
-                    val = val * input_filter_scale /
-                          (input_filter_rescale * input_filter_rescale *
-                           rescale * rescale);
-                    if (nonline_mode == NonlineMode::RELU) {
-                        val = val > 0 ? val : 0;
-                    } else if (nonline_mode == NonlineMode::SIGMOID) {
-                        val = 1.f / (expf(-val) + 1.f);
-                    } else if (nonline_mode == NonlineMode::H_SWISH) {
-                        val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f;
-                    } else {
-                        megdnn_assert(nonline_mode == NonlineMode::IDENTITY);
-                    }
-
-                    output[oc * OH * OW + oh * OW + ow] = getter(val);
-                }
-            }
+    size_t put(size_t alpha, size_t ic, size_t nr_units_in_tile,
+               size_t unit_idx, size_t h, size_t w) {
+        if (format == param::MatrixMul::Format::DEFAULT) {
+            return (h * alpha + w) * nr_units_in_tile * IC + unit_idx * IC + ic;
         }
+        size_t matmul_pack_size = MatrixMulForward::pack_size(format);
+        size_t icb = ic / matmul_pack_size;
+        size_t ic_pack = ic % matmul_pack_size;
+        size_t ICB = IC / matmul_pack_size;
+        // slowest to fastest: (h, w), icb, unit_idx, ic_pack
+        return (h * alpha + w) * ICB * nr_units_in_tile * matmul_pack_size +
+               icb * nr_units_in_tile * matmul_pack_size +
+               unit_idx * matmul_pack_size + ic_pack;
     }
 };
 
-template <typename ctype, typename dst_type, typename input_filter_compute_type,
-          typename output_compute_type, param::MatrixMul::Format format>
-class StrategyHelper<
-        ctype, dst_type, input_filter_compute_type, output_compute_type, format,
-        std::enable_if_t<format == param::MatrixMul::Format::MK4 ||
-                         format == param::MatrixMul::Format::MK8>> {
-public:
-    static void filter(const ctype* filter,
-                       input_filter_compute_type* filter_transform_buf,
-                       input_filter_compute_type* transform_mid_buf, size_t OC,
-                       size_t IC, size_t oc_start, size_t oc_end, size_t m,
-                       size_t r, const std::vector<float>& interp_points,
-                       DType dtype, float rescale) {
-        size_t alpha = m + r - 1;
-        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
-                                                                interp_points);
-
-        input_filter_compute_type* mid_buf1 = transform_mid_buf;
-        input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
-
-        Getter<ctype, input_filter_compute_type> getter(dtype);
-        size_t OCB = OC / pack_size;
-        size_t ICB = IC / pack_size;
-        for (size_t oc = oc_start; oc < oc_end; oc++) {
-            rep(ic, IC) {
-                const ctype* filter_ptr = filter + (oc * IC + ic) * r * r;
-                rep(i, r) rep(j, r) {
-                    mid_buf1[i * r + j] = getter(filter_ptr[i * r + j]);
-                }
+template <param::ConvBias::Format layout, param::MatrixMul::Format format>
+struct OutputVisitor {
+    size_t OC;
+    OutputVisitor(size_t OC) : OC(OC) {}
 
-                /* tmp = Matmul(G, src) */
-                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                                  input_filter_compute_type,
-                                                  false, false>(
-                        winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2,
-                        alpha, r, r, r, r, r, dtype, dtype);
-                /* dst = Matmul(tmp, G^T) */
-                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                                  input_filter_compute_type,
-                                                  false, true>(
-                        mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1,
-                        alpha, alpha, r, r, r, alpha, dtype, dtype);
-
-                size_t ocb = oc / pack_size;
-                size_t oc_pack = oc % pack_size;
-                size_t icb = ic / pack_size;
-                size_t ic_pack = ic % pack_size;
-                rep(i, alpha) rep(j, alpha) {
-                    filter_transform_buf[(i * alpha + j) * OCB * ICB *
-                                                 pack_size * pack_size +
-                                         ocb * ICB * pack_size * pack_size +
-                                         icb * pack_size * pack_size +
-                                         ic_pack * pack_size + oc_pack] =
-                            mid_buf1[i * alpha + j];
-                }
-            }
+    size_t get(size_t alpha, size_t oc_index, size_t oc,
+               size_t nr_units_in_tile, size_t unit_idx, size_t h, size_t w) {
+        if (format == param::MatrixMul::Format::DEFAULT) {
+            return (h * alpha + w) * nr_units_in_tile * OC + unit_idx * OC +
+                   oc_index;
         }
+        size_t matmul_pack_size = MatrixMulForward::pack_size(format);
+        size_t ocb = oc_index / matmul_pack_size;
+        size_t oc_pack = oc_index % matmul_pack_size;
+        size_t OCB = OC / matmul_pack_size;
+        // block and lane both derive from oc_index, matching OCB = OC / pack
+        return (h * alpha + w) * OCB * nr_units_in_tile * matmul_pack_size +
+               ocb * nr_units_in_tile * matmul_pack_size +
+               unit_idx * matmul_pack_size + oc_pack;
     }
 
-    static void input(const ctype* input,
-                      input_filter_compute_type* input_transform_buf,
-                      input_filter_compute_type* transform_mid_buf,
-                      int ih_start, int iw_start, size_t IH, size_t IW,
-                      size_t IC, size_t unit_idx, size_t nr_units_in_tile,
-                      size_t m, size_t r,
-                      const std::vector<float>& interp_points, DType dtype,
-                      float rescale) {
-        size_t alpha = m + r - 1;
-        Getter<ctype, input_filter_compute_type> getter(dtype);
-        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
-                                                                interp_points);
-        size_t ICB = IC / pack_size;
-        rep(ic, IC) {
-            input_filter_compute_type* mid_buf1 = transform_mid_buf;
-            input_filter_compute_type* mid_buf2 =
-                    transform_mid_buf + alpha * alpha;
+    size_t put(size_t oc, size_t OH, size_t OW, size_t oh, size_t ow) {
+        constexpr size_t input_pack_size = layout_pack_size(layout);
+        size_t oc_layout = oc % input_pack_size;  // position inside the block
 
-            memset(mid_buf1, 0,
-                   alpha * alpha * sizeof(input_filter_compute_type));
-            rep(i, alpha) rep(j, alpha) {
-                int ih = ih_start + i;
-                int iw = iw_start + j;
-                if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) {
-                    mid_buf1[i * alpha + j] =
-                            getter(input[ic * IH * IW + ih * IW + iw]);
-                }
+        return (oc / input_pack_size * OH * OW + oh * OW + ow) *
+                       input_pack_size +
+               oc_layout;
+    }
+};
+
+template <typename ctype, typename dst_type, typename input_filter_compute_type,
+          typename output_compute_type, param::ConvBias::Format layout,
+          param::MatrixMul::Format format>
+void StrategyHelper<
+        ctype, dst_type, input_filter_compute_type, output_compute_type, layout,
+        format>::filter(const ctype* filter,
+                        input_filter_compute_type* filter_transform_buf,
+                        input_filter_compute_type* transform_mid_buf, size_t OC,
+                        size_t IC, size_t oc_start, size_t oc_end, size_t m,
+                        size_t r, const std::vector<float>& interp_points,
+                        DType dtype, float rescale) {
+    size_t alpha = m + r - 1;
+    WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
+                                                            interp_points);
+    input_filter_compute_type* mid_buf1 = transform_mid_buf;
+    input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
+    Getter<ctype, input_filter_compute_type> getter(dtype);
+    FilterVisitor<layout, format> filter_visitor(OC, IC);
+    // for each (oc, ic): gather the r x r kernel, transform it, scatter it
+    for (size_t oc = oc_start; oc < oc_end; oc++) {
+        rep(ic, IC) {
+            rep(i, r) rep(j, r) {
+                mid_buf1[i * r + j] =
+                        getter(filter[filter_visitor.get(r, oc, ic, i, j)]);
             }
+
+            /* tmp = Matmul(G, src) */
             megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                              input_filter_compute_type, true,
+                                              input_filter_compute_type, false,
                                               false>(
-                    winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha,
-                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
+                    winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2, alpha,
+                    r, r, r, r, r, dtype, dtype);
+            /* dst = Matmul(tmp, G^T) */
             megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
                                               input_filter_compute_type, false,
-                                              false>(
-                    mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha,
-                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
-            size_t icb = ic / pack_size;
-            size_t ic_pack = ic % pack_size;
+                                              true>(
+                    mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1, alpha,
+                    alpha, r, r, r, alpha, dtype, dtype);
+            // write the alpha x alpha result at the offsets chosen by put()
             rep(i, alpha) rep(j, alpha) {
-                input_transform_buf[(i * alpha + j) * ICB * nr_units_in_tile *
-                                            pack_size +
-                                    icb * nr_units_in_tile * pack_size +
-                                    unit_idx * pack_size + ic_pack] =
+                filter_transform_buf[filter_visitor.put(alpha, oc, ic, i, j)] =
                         mid_buf1[i * alpha + j];
             }
         }
     }
+}
 
-    static void output(const output_compute_type* output_transform_buf,
-                       const output_compute_type* bias, dst_type* output,
-                       output_compute_type* transform_mid_buf, BiasMode bmode,
-                       NonlineMode nonline_mode, size_t oh_start,
-                       size_t ow_start, size_t OH, size_t OW, size_t oc_start,
-                       size_t oc_end, size_t unit_idx, size_t nr_units_in_tile,
+template <typename ctype, typename dst_type, typename input_filter_compute_type,
+          typename output_compute_type, param::ConvBias::Format layout,
+          param::MatrixMul::Format format>
+void StrategyHelper<
+        ctype, dst_type, input_filter_compute_type, output_compute_type, layout,
+        format>::input(const ctype* input,
+                       input_filter_compute_type* input_transform_buf,
+                       input_filter_compute_type* transform_mid_buf,
+                       int ih_start, int iw_start, size_t IH, size_t IW,
+                       size_t IC, size_t unit_idx, size_t nr_units_in_tile,
                        size_t m, size_t r,
                        const std::vector<float>& interp_points, DType dtype,
-                       float input_filter_scale, float input_filter_rescale,
                        float rescale) {
-        size_t alpha = m + r - 1;
-        size_t OC = oc_end - oc_start;
-
-        OutputGetter<output_compute_type, dst_type> getter(dtype);
-        winograd::WinogradCoeff<output_compute_type> winograd_coeff(
-                m, r, interp_points);
-        size_t OCB = OC / pack_size;
-        for (size_t oc = oc_start; oc < oc_end; oc++) {
-            output_compute_type* mid_buf1 = transform_mid_buf;
-            output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
-
-            size_t ocb = (oc - oc_start) / pack_size;
-            size_t oc_pack = oc % pack_size;
-            // gather
-            rep(i, alpha) rep(j, alpha) {
-                mid_buf1[i * alpha + j] = output_transform_buf
-                        [(i * alpha + j) * OCB * nr_units_in_tile * pack_size +
-                         ocb * nr_units_in_tile * pack_size +
-                         unit_idx * pack_size + oc_pack];
+    size_t alpha = m + r - 1;
+    WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
+                                                            interp_points);
+    input_filter_compute_type* mid_buf1 = transform_mid_buf;
+    input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
+    Getter<ctype, input_filter_compute_type> getter(dtype);
+    InputVisitor<layout, format> input_visitor(IC);
+    // for each ic: zero-padded alpha x alpha gather, B^T * d * B, scatter
+    rep(ic, IC) {
+        memset(mid_buf1, 0, alpha * alpha * sizeof(input_filter_compute_type));
+        rep(i, alpha) rep(j, alpha) {
+            int ih = ih_start + i;
+            int iw = iw_start + j;
+            if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) {
+                mid_buf1[i * alpha + j] = getter(
+                        input[input_visitor.get(alpha, ic, IH, IW, ih, iw)]);
             }
-            /* A[alpha*m] M[alpha*alpha] */
-            megdnn::naive::run_matrix_mul_tpl<output_compute_type,
-                                              output_compute_type, true, false>(
-                    winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m,
-                    alpha, alpha, m, alpha, alpha, dtype, dtype);
-            megdnn::naive::run_matrix_mul_tpl<
-                    output_compute_type, output_compute_type, false, false>(
-                    mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m,
-                    alpha, alpha, m, m, dtype, dtype);
-            rep(i, m) rep(j, m) {
-                auto oh = oh_start + i;
-                auto ow = ow_start + j;
-                if (oh < OH && ow < OW) {
-                    float val = mid_buf1[i * m + j];
-                    if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
-                        val += bias[oc] * input_filter_rescale *
-                               input_filter_rescale;
-                    } else if (bmode == BiasMode::BIAS) {
-                        val += bias[oc * OH * OW + oh * OW + ow] *
-                               input_filter_rescale * input_filter_rescale;
-                    }
-                    val = val * input_filter_scale /
-                          (input_filter_rescale * input_filter_rescale *
-                           rescale * rescale);
-                    if (nonline_mode == NonlineMode::RELU) {
-                        val = val > 0 ? val : 0;
-                    } else if (nonline_mode == NonlineMode::SIGMOID) {
-                        val = 1.f / (expf(-val) + 1.f);
-                    } else if (nonline_mode == NonlineMode::H_SWISH) {
-                        val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f;
-                    } else {
-                        megdnn_assert(nonline_mode == NonlineMode::IDENTITY);
-                    }
-
-                    output[oc * OH * OW + oh * OW + ow] = getter(val);
+        }
+
+        megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
+                                          input_filter_compute_type, true,
+                                          false>(
+                winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha,
+                alpha, alpha, alpha, alpha, alpha, dtype, dtype);
+        megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
+                                          input_filter_compute_type, false,
+                                          false>(
+                mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha,
+                alpha, alpha, alpha, alpha, alpha, dtype, dtype);
+
+        rep(i, alpha) rep(j, alpha) {
+            input_transform_buf[input_visitor.put(alpha, ic, nr_units_in_tile,
+                                                  unit_idx, i, j)] =
+                    mid_buf1[i * alpha + j];
+        }
+    }
+}
+
+template <typename ctype, typename dst_type, typename input_filter_compute_type,
+          typename output_compute_type, param::ConvBias::Format layout,
+          param::MatrixMul::Format format>
+void StrategyHelper<
+        ctype, dst_type, input_filter_compute_type, output_compute_type, layout,
+        format>::output(const output_compute_type* output_transform_buf,
+                        const output_compute_type* bias, dst_type* output,
+                        output_compute_type* transform_mid_buf, BiasMode bmode,
+                        NonlineMode nonline_mode, size_t oh_start,
+                        size_t ow_start, size_t OH, size_t OW, size_t oc_start,
+                        size_t oc_end, size_t unit_idx, size_t nr_units_in_tile,
+                        size_t m, size_t r,
+                        const std::vector<float>& interp_points, DType dtype,
+                        float input_filter_scale, float input_filter_rescale,
+                        float rescale) {
+    size_t alpha = m + r - 1;
+    winograd::WinogradCoeff<output_compute_type> winograd_coeff(m, r,
+                                                                interp_points);
+    output_compute_type* mid_buf1 = transform_mid_buf;
+    output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
+    OutputGetter<output_compute_type, dst_type> getter(dtype);
+    OutputVisitor<layout, format> output_visitor(oc_end - oc_start);
+    // per output channel: gather the alpha x alpha tile, then A^T * tile * A
+    for (size_t oc = oc_start; oc < oc_end; oc++) {
+        /* gather */
+        rep(i, alpha) rep(j, alpha) {
+            mid_buf1[i * alpha + j] = output_transform_buf[output_visitor.get(
+                    alpha, oc - oc_start, oc, nr_units_in_tile, unit_idx, i,
+                    j)];
+        }
+        /* A[alpha*m] M[alpha*alpha] */
+        megdnn::naive::run_matrix_mul_tpl<output_compute_type,
+                                          output_compute_type, true, false>(
+                winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m, alpha,
+                alpha, m, alpha, alpha, dtype, dtype);
+        megdnn::naive::run_matrix_mul_tpl<output_compute_type,
+                                          output_compute_type, false, false>(
+                mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m,
+                alpha, alpha, m, m, dtype, dtype);
+        // add bias, rescale, apply the nonlinearity; skip out-of-range pixels
+        rep(i, m) rep(j, m) {
+            auto oh = oh_start + i;
+            auto ow = ow_start + j;
+            if (oh < OH && ow < OW) {
+                float val = mid_buf1[i * m + j];
+                if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
+                    val += bias[oc] * input_filter_rescale *
+                           input_filter_rescale;
+                } else if (bmode == BiasMode::BIAS) {
+                    val += bias[output_visitor.put(oc, OH, OW, oh, ow)] *
+                           input_filter_rescale * input_filter_rescale;
                 }
+                val = val * input_filter_scale /
+                      (input_filter_rescale * input_filter_rescale * rescale *
+                       rescale);
+                if (nonline_mode == NonlineMode::RELU) {
+                    val = val > 0 ? val : 0;
+                } else if (nonline_mode == NonlineMode::SIGMOID) {
+                    val = 1.f / (expf(-val) + 1.f);
+                } else if (nonline_mode == NonlineMode::H_SWISH) {
+                    val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f;
+                } else {
+                    megdnn_assert(nonline_mode == NonlineMode::IDENTITY);
+                }
+                output[output_visitor.put(oc, OH, OW, oh, ow)] = getter(val);
             }
         }
     }
-
-    static size_t pack_size;
 };
 
-template <typename ctype, typename dst_type, typename input_filter_compute_type,
-          typename output_compute_type, param::MatrixMul::Format format>
-size_t StrategyHelper<
-        ctype, dst_type, input_filter_compute_type, output_compute_type, format,
-        std::enable_if_t<format == param::MatrixMul::Format::MK4 ||
-                         format == param::MatrixMul::Format::MK8>>::pack_size =
-        MatrixMulForward::pack_size(format);
-
-#define INST(_ctype, _dst_type, _input_filter_compute_type, \
-             _output_compute_type)                          \
-    template class StrategyHelper<                          \
-            _ctype, _dst_type, _input_filter_compute_type,  \
-            _output_compute_type, param::MatrixMul::Format::DEFAULT>;
+#define INST(_ctype, _dst_type, _input_filter_compute_type,   \
+             _output_compute_type)                            \
+    template class StrategyHelper<_ctype, _dst_type,          \
+                                  _input_filter_compute_type, \
+                                  _output_compute_type>;
 
 INST(float, float, float, float)
 MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16))
@@ -428,234 +360,23 @@ INST(uint8_t, uint8_t, int16_t, int)
 #undef INST
 
 #define INST(_ctype, _dst_type, _input_filter_compute_type, \
-             _output_compute_type)                          \
+             _output_compute_type, layout)                  \
     template class StrategyHelper<                          \
             _ctype, _dst_type, _input_filter_compute_type,  \
-            _output_compute_type, param::MatrixMul::Format::MK4>;
-INST(float, float, float, float)
+            _output_compute_type, layout, param::MatrixMul::Format::MK4>;
+INST(float, float, float, float, param::ConvBias::Format::NCHW)
 #undef INST
 
 #define INST(_ctype, _dst_type, _input_filter_compute_type, \
-             _output_compute_type)                          \
+             _output_compute_type, layout)                  \
     template class StrategyHelper<                          \
             _ctype, _dst_type, _input_filter_compute_type,  \
-            _output_compute_type, param::MatrixMul::Format::MK8>;
-INST(int8_t, int8_t, int16_t, int)
-MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16))
-#undef INST
-
-template <typename ctype, typename dst_type, typename input_filter_compute_type,
-          typename output_compute_type, param::MatrixMul::Format format>
-class StrategyHelperNchwxx<
-        ctype, dst_type, input_filter_compute_type, output_compute_type, format,
-        std::enable_if_t<format == param::MatrixMul::Format::MK8>> {
-public:
-    static void filter(const ctype* filter,
-                       input_filter_compute_type* filter_transform_buf,
-                       input_filter_compute_type* transform_mid_buf, size_t OC,
-                       size_t IC, size_t oc_start, size_t oc_end, size_t m,
-                       size_t r, const std::vector<float>& interp_points,
-                       DType dtype, float rescale) {
-        megdnn_assert(
-                (oc_end - oc_start) % 8 == 0 && oc_start % 8 == 0 &&
-                        oc_end % 8 == 0 && IC % 8 == 0 && OC % 8 == 0,
-                "Winograd filter transform input param is not times of 8!");
-
-        size_t alpha = m + r - 1;
-        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
-                                                                interp_points);
-
-        input_filter_compute_type* mid_buf1 = transform_mid_buf;
-        input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
-
-        Getter<ctype, input_filter_compute_type> getter(dtype);
-        size_t OCB = OC / pack_size;
-        size_t ICB = IC / pack_size;
-        for (size_t oc = oc_start; oc < oc_end; oc++) {
-            rep(ic, IC) {
-                size_t ocb = oc / pack_size;
-                size_t oc_pack = oc % pack_size;
-                size_t icb = ic / pack_size;
-                size_t ic_pack = ic % pack_size;
-
-                const ctype* filter_ptr =
-                        filter + (ocb * (IC / 8) + icb) * r * r * 8 * 8 +
-                        ic_pack * 8 + oc_pack;
-                rep(i, r) rep(j, r) {
-                    mid_buf1[i * r + j] =
-                            getter(filter_ptr[(i * r + j) * 8 * 8]);
-                }
-
-                /* tmp = Matmul(G, src) */
-                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                                  input_filter_compute_type,
-                                                  false, false>(
-                        winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2,
-                        alpha, r, r, r, r, r, dtype, dtype);
-                /* dst = Matmul(tmp, G^T) */
-                megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                                  input_filter_compute_type,
-                                                  false, true>(
-                        mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1,
-                        alpha, alpha, r, r, r, alpha, dtype, dtype);
-
-                rep(i, alpha) rep(j, alpha) {
-                    filter_transform_buf[(i * alpha + j) * OCB * ICB *
-                                                 pack_size * pack_size +
-                                         ocb * ICB * pack_size * pack_size +
-                                         icb * pack_size * pack_size +
-                                         ic_pack * pack_size + oc_pack] =
-                            mid_buf1[i * alpha + j];
-                }
-            }
-        }
-    }
-
-    static void input(const ctype* input,
-                      input_filter_compute_type* input_transform_buf,
-                      input_filter_compute_type* transform_mid_buf,
-                      int ih_start, int iw_start, size_t IH, size_t IW,
-                      size_t IC, size_t unit_idx, size_t nr_units_in_tile,
-                      size_t m, size_t r,
-                      const std::vector<float>& interp_points, DType dtype,
-                      float rescale) {
-        size_t alpha = m + r - 1;
-        Getter<ctype, input_filter_compute_type> getter(dtype);
-        WinogradCoeff<input_filter_compute_type> winograd_coeff(m, r,
-                                                                interp_points);
-        size_t ICB = IC / pack_size;
-        rep(ic, IC) {
-            size_t icb = ic / pack_size;
-            size_t ic_pack = ic % pack_size;
-            input_filter_compute_type* mid_buf1 = transform_mid_buf;
-            input_filter_compute_type* mid_buf2 =
-                    transform_mid_buf + alpha * alpha;
-
-            memset(mid_buf1, 0,
-                   alpha * alpha * sizeof(input_filter_compute_type));
-            rep(i, alpha) rep(j, alpha) {
-                int ih = ih_start + i;
-                int iw = iw_start + j;
-                if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) {
-                    mid_buf1[i * alpha + j] = getter(
-                            input[(icb * IH * IW + ih * IW + iw) * pack_size +
-                                  ic_pack]);
-                }
-            }
-            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                              input_filter_compute_type, true,
-                                              false>(
-                    winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha,
-                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
-            megdnn::naive::run_matrix_mul_tpl<input_filter_compute_type,
-                                              input_filter_compute_type, false,
-                                              false>(
-                    mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha,
-                    alpha, alpha, alpha, alpha, alpha, dtype, dtype);
-            rep(i, alpha) rep(j, alpha) {
-                input_transform_buf[(i * alpha + j) * ICB * nr_units_in_tile *
-                                            pack_size +
-                                    icb * nr_units_in_tile * pack_size +
-                                    unit_idx * pack_size + ic_pack] =
-                        mid_buf1[i * alpha + j];
-            }
-        }
-    }
-
-    static void output(const output_compute_type* output_transform_buf,
-                       const output_compute_type* bias, dst_type* output,
-                       output_compute_type* transform_mid_buf, BiasMode bmode,
-                       NonlineMode nonline_mode, size_t oh_start,
-                       size_t ow_start, size_t OH, size_t OW, size_t oc_start,
-                       size_t oc_end, size_t unit_idx, size_t nr_units_in_tile,
-                       size_t m, size_t r,
-                       const std::vector<float>& interp_points, DType dtype,
-                       float input_filter_scale, float input_filter_rescale,
-                       float rescale) {
-        size_t alpha = m + r - 1;
-        size_t OC = oc_end - oc_start;
-
-        OutputGetter<output_compute_type, dst_type> getter(dtype);
-        winograd::WinogradCoeff<output_compute_type> winograd_coeff(
-                m, r, interp_points);
-        size_t OCB = OC / pack_size;
-        for (size_t oc = oc_start; oc < oc_end; oc++) {
-            output_compute_type* mid_buf1 = transform_mid_buf;
-            output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha;
-
-            size_t ocb = (oc - oc_start) / pack_size;
-            size_t oc_pack = oc % pack_size;
-            // gather
-            rep(i, alpha) rep(j, alpha) {
-                mid_buf1[i * alpha + j] = output_transform_buf
-                        [(i * alpha + j) * OCB * nr_units_in_tile * pack_size +
-                         ocb * nr_units_in_tile * pack_size +
-                         unit_idx * pack_size + oc_pack];
-            }
-            /* A[alpha*m] M[alpha*alpha] */
-            megdnn::naive::run_matrix_mul_tpl<output_compute_type,
-                                              output_compute_type, true, false>(
-                    winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m,
-                    alpha, alpha, m, alpha, alpha, dtype, dtype);
-            megdnn::naive::run_matrix_mul_tpl<
-                    output_compute_type, output_compute_type, false, false>(
-                    mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m,
-                    alpha, alpha, m, m, dtype, dtype);
-            rep(i, m) rep(j, m) {
-                auto oh = oh_start + i;
-                auto ow = ow_start + j;
-                if (oh < OH && ow < OW) {
-                    float val = mid_buf1[i * m + j];
-                    if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
-                        val += bias[oc] * input_filter_rescale *
-                               input_filter_rescale;
-                    } else if (bmode == BiasMode::BIAS) {
-                        val += bias[(oc / pack_size * OH * OW + oh * OW + ow) *
-                                            pack_size +
-                                    oc_pack] *
-                               input_filter_rescale * input_filter_rescale;
-                    }
-                    val = val * input_filter_scale /
-                          (input_filter_rescale * input_filter_rescale *
-                           rescale * rescale);
-                    if (nonline_mode == NonlineMode::RELU) {
-                        val = val > 0 ? val : 0;
-                    } else if (nonline_mode == NonlineMode::SIGMOID) {
-                        val = 1.f / (expf(-val) + 1.f);
-                    } else if (nonline_mode == NonlineMode::H_SWISH) {
-                        val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f;
-                    } else {
-                        megdnn_assert(nonline_mode == NonlineMode::IDENTITY);
-                    }
-
-                    output[(oc / pack_size * OH * OW + oh * OW + ow) *
-                                   pack_size +
-                           oc_pack] = getter(val);
-                }
-            }
-        }
-    }
-
-    static size_t pack_size;
-};
-
-template <typename ctype, typename dst_type, typename input_filter_compute_type,
-          typename output_compute_type, param::MatrixMul::Format format>
-size_t StrategyHelperNchwxx<
-        ctype, dst_type, input_filter_compute_type, output_compute_type, format,
-        std::enable_if_t<format == param::MatrixMul::Format::MK8>>::pack_size =
-        MatrixMulForward::pack_size(format);
-
-#define INST(_ctype, _dst_type, _input_filter_compute_type, \
-             _output_compute_type)                          \
-    template class StrategyHelperNchwxx<                    \
-            _ctype, _dst_type, _input_filter_compute_type,  \
-            _output_compute_type, param::MatrixMul::Format::MK8>;
-INST(float, float, float, float)
+            _output_compute_type, layout, param::MatrixMul::Format::MK8>;
+INST(int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW)
+INST(float, float, float, float, param::ConvBias::Format::NCHW88)
+MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16,
+                        param::ConvBias::Format::NCHW))
 #undef INST
-
-
-
 }  // namespace winograd
 }  // namespace megdnn
 
diff --git a/dnn/src/common/winograd/winograd_helper.h b/dnn/src/common/winograd/winograd_helper.h
index bdbec6203..c2cd945bc 100644
--- a/dnn/src/common/winograd/winograd_helper.h
+++ b/dnn/src/common/winograd/winograd_helper.h
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
  */
 
 #pragma once
@@ -28,8 +29,8 @@ using BiasMode = ConvBiasForward::BiasMode;
  */
 template <typename ctype, typename dst_type, typename input_filter_compute_type,
           typename output_compute_type,
-          param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT,
-          typename enable = void>
+          param::ConvBias::Format layout = param::ConvBias::Format::NCHW,
+          param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT>
 class StrategyHelper {
 public:
     static void filter(const ctype* filter,
@@ -61,47 +62,6 @@ public:
            float rescale = 1.0f);
 };
 
-/**
- * \brief Strategy helper, contains some helper function for debug kernel
- * implementation
- *
- * \warning The layout should be NCHW88
- */
-template <typename ctype, typename dst_type, typename input_filter_compute_type,
-          typename output_compute_type,
-          param::MatrixMul::Format format = param::MatrixMul::Format::MK8,
-          typename enable = void>
-class StrategyHelperNchwxx {
-public:
-    static void filter(const ctype* filter,
-                       input_filter_compute_type* filter_transform_buf,
-                       input_filter_compute_type* transform_mid_buf, size_t OC,
-                       size_t IC, size_t oc_start, size_t oc_end, size_t m,
-                       size_t r, const std::vector<float>& interp_points,
-                       DType dtype, float rescale = 1.0f);
-
-    static void input(const ctype* input,
-                      input_filter_compute_type* input_transform_buf,
-                      input_filter_compute_type* transform_mid_buf,
-                      int ih_start, int iw_start, size_t IH, size_t IW,
-                      size_t IC, size_t unit_idx, size_t nr_units_in_tile,
-                      size_t m, size_t r,
-                      const std::vector<float>& interp_points, DType dtype,
-                      float rescale = 1.0f);
-
-    static void
-    output(const output_compute_type* output_transform_buf,
-           const output_compute_type* bias, dst_type* output,
-           output_compute_type* transform_mid_buf, BiasMode bmode,
-           NonlineMode nonline_mode, size_t oh_start, size_t ow_start,
-           size_t OH, size_t OW, size_t oc_start, size_t oc_end,
-           size_t unit_idx, size_t nr_units_in_tile, size_t m, size_t r,
-           const std::vector<float>& interp_points, DType dtype,
-           float input_filter_scale = 1.0f,    // input_scale * filter_scale
-           float input_filter_rescale = 1.0f,  // input_rescale * filter_rescale
-           float rescale = 1.0f);
-};
-
 }  // namespace winograd
 }  // namespace megdnn
    // vim: syntax=cpp.doxygen
diff --git a/dnn/src/fallback/conv_bias/winograd/strategy.cpp b/dnn/src/fallback/conv_bias/winograd/strategy.cpp
index 579dbdcc2..de0ff614b 100644
--- a/dnn/src/fallback/conv_bias/winograd/strategy.cpp
+++ b/dnn/src/fallback/conv_bias/winograd/strategy.cpp
@@ -6,13 +6,14 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
  */
 
 #include "src/fallback/conv_bias/winograd/strategy.h"
-#include "src/fallback/conv_bias/winograd/winograd.h"
-#include "src/common/winograd/winograd_helper.h"
 #include "src/common/utils.h"
+#include "src/common/winograd/winograd_helper.h"
+#include "src/fallback/conv_bias/winograd/winograd.h"
 
 namespace megdnn {
 namespace fallback {
@@ -60,7 +61,7 @@ void winograd_2x3_4x4_f::filter(const float* filter,
                                 float* transform_mid_buf, size_t OC, size_t IC,
                                 size_t oc_start, size_t oc_end) {
     ::megdnn::winograd::StrategyHelper<
-            float, float, float, float,
+            float, float, float, float, param::ConvBias::Format::NCHW,
             param::MatrixMul::Format::MK4>::filter(filter, filter_transform_buf,
                                                    transform_mid_buf, OC, IC,
                                                    oc_start, oc_end,
@@ -73,11 +74,15 @@ void winograd_2x3_4x4_f::input(const float* input, float* input_transform_buf,
                                float* transform_mid_buf, int ih_start,
                                int iw_start, size_t IH, size_t IW, size_t IC,
                                size_t unit_idx, size_t nr_units_in_tile) {
-    ::megdnn::winograd::StrategyHelper<float, float, float, float,
-                                       param::MatrixMul::Format::MK4>::
-            input(input, input_transform_buf, transform_mid_buf, ih_start,
-                  iw_start, IH, IW, IC, unit_idx, nr_units_in_tile,
-                  OUTPUT_BLOCK_SIZE, KERNEL_SIZE, {0, 1, -1}, src_dtype);
+    ::megdnn::winograd::StrategyHelper<
+            float, float, float, float, param::ConvBias::Format::NCHW,
+            param::MatrixMul::Format::MK4>::input(input, input_transform_buf,
+                                                  transform_mid_buf, ih_start,
+                                                  iw_start, IH, IW, IC,
+                                                  unit_idx, nr_units_in_tile,
+                                                  OUTPUT_BLOCK_SIZE,
+                                                  KERNEL_SIZE, {0, 1, -1},
+                                                  src_dtype);
 }
 
 void winograd_2x3_4x4_f::output(const float* output_transform_buf,
@@ -87,16 +92,19 @@ void winograd_2x3_4x4_f::output(const float* output_transform_buf,
                                 size_t ow_start, size_t OH, size_t OW,
                                 size_t oc_start, size_t oc_end, size_t unit_idx,
                                 size_t nr_units_in_tile) {
-    ::megdnn::winograd::StrategyHelper<float, float, float, float,
-                                       param::MatrixMul::Format::MK4>::
-            output(output_transform_buf, bias, output, transform_mid_buf, bmode,
-                   nonline_mode, oh_start, ow_start, OH, OW, oc_start, oc_end,
-                   unit_idx, nr_units_in_tile, OUTPUT_BLOCK_SIZE, KERNEL_SIZE,
-                   {0, 1, -1}, dst_dtype);
+    ::megdnn::winograd::StrategyHelper<
+            float, float, float, float, param::ConvBias::Format::NCHW,
+            param::MatrixMul::Format::MK4>::output(output_transform_buf, bias,
+                                                   output, transform_mid_buf,
+                                                   bmode, nonline_mode,
+                                                   oh_start, ow_start, OH, OW,
+                                                   oc_start, oc_end, unit_idx,
+                                                   nr_units_in_tile,
+                                                   OUTPUT_BLOCK_SIZE,
+                                                   KERNEL_SIZE, {0, 1, -1},
+                                                   dst_dtype);
 }
 
-
-
 MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_1x1_qs8)
 
 void winograd_2x3_1x1_qs8::filter(const int8_t* filter,
@@ -136,7 +144,6 @@ void winograd_2x3_1x1_qs8::output(const int* output_transform_buf,
             {0, 1, -1}, dst_dtype, scale_input * scale_filter, 2.0f, 1.0f);
 }
 
-
 MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_8x8_qs8)
 
 void winograd_2x3_8x8_qs8::filter(const int8_t* filter,
@@ -144,7 +151,7 @@ void winograd_2x3_8x8_qs8::filter(const int8_t* filter,
                                   int16_t* transform_mid_buf, size_t OC,
                                   size_t IC, size_t oc_start, size_t oc_end) {
     ::megdnn::winograd::StrategyHelper<
-            int8_t, int8_t, int16_t, int,
+            int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW,
             param::MatrixMul::Format::MK8>::filter(filter, filter_transform_buf,
                                                    transform_mid_buf, OC, IC,
                                                    oc_start, oc_end,
@@ -158,11 +165,15 @@ void winograd_2x3_8x8_qs8::input(const int8_t* input,
                                  int16_t* transform_mid_buf, int ih_start,
                                  int iw_start, size_t IH, size_t IW, size_t IC,
                                  size_t unit_idx, size_t nr_units_in_tile) {
-    ::megdnn::winograd::StrategyHelper<int8_t, int8_t, int16_t, int,
-                                       param::MatrixMul::Format::MK8>::
-            input(input, input_transform_buf, transform_mid_buf, ih_start,
-                  iw_start, IH, IW, IC, unit_idx, nr_units_in_tile,
-                  OUTPUT_BLOCK_SIZE, KERNEL_SIZE, {0, 1, -1}, src_dtype, 1.0f);
+    ::megdnn::winograd::StrategyHelper<
+            int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW,
+            param::MatrixMul::Format::MK8>::input(input, input_transform_buf,
+                                                  transform_mid_buf, ih_start,
+                                                  iw_start, IH, IW, IC,
+                                                  unit_idx, nr_units_in_tile,
+                                                  OUTPUT_BLOCK_SIZE,
+                                                  KERNEL_SIZE, {0, 1, -1},
+                                                  src_dtype, 1.0f);
 }
 
 void winograd_2x3_8x8_qs8::output(const int* output_transform_buf,
@@ -180,13 +191,19 @@ void winograd_2x3_8x8_qs8::output(const int* output_transform_buf,
         megdnn_assert(filter_dtype.enumv() == DTypeEnum::QuantizedS16);
         scale_filter = filter_dtype.param<dtype::QuantizedS16>().scale;
     }
-    ::megdnn::winograd::StrategyHelper<int8_t, int8_t, int16_t, int,
-                                       param::MatrixMul::Format::MK8>::
-            output(output_transform_buf, bias, output, transform_mid_buf, bmode,
-                   nonline_mode, oh_start, ow_start, OH, OW, oc_start, oc_end,
-                   unit_idx, nr_units_in_tile, OUTPUT_BLOCK_SIZE, KERNEL_SIZE,
-                   {0, 1, -1}, dst_dtype, scale_input * scale_filter, 2.0f,
-                   1.0f);
+    ::megdnn::winograd::StrategyHelper<
+            int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW,
+            param::MatrixMul::Format::MK8>::output(output_transform_buf, bias,
+                                                   output, transform_mid_buf,
+                                                   bmode, nonline_mode,
+                                                   oh_start, ow_start, OH, OW,
+                                                   oc_start, oc_end, unit_idx,
+                                                   nr_units_in_tile,
+                                                   OUTPUT_BLOCK_SIZE,
+                                                   KERNEL_SIZE, {0, 1, -1},
+                                                   dst_dtype,
+                                                   scale_input * scale_filter,
+                                                   2.0f, 1.0f);
 }
 
 }  // namespace winograd
diff --git a/dnn/src/fallback/conv_bias/winograd/strategy.h b/dnn/src/fallback/conv_bias/winograd/strategy.h
index ed1d3ad25..fe186cb9c 100644
--- a/dnn/src/fallback/conv_bias/winograd/strategy.h
+++ b/dnn/src/fallback/conv_bias/winograd/strategy.h
@@ -28,6 +28,7 @@ MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 1, 1,
 
 MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 8, 8,
                              winograd_2x3_8x8_qs8)
+
 }
 }  // namespace fallback
 }  // namespace megdnn
diff --git a/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp b/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
index 4a012592b..b5db83e7f 100644
--- a/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
+++ b/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
  */
 
 #include "src/naive/winograd_filter_preprocess/opr_impl.h"
@@ -49,17 +50,16 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
     size_t m = param().output_block_size;
 
     bool execed = false;
-#define cb(_ctype, _dst_type, _input_filter_compute_type,                     \
-           _output_compute_type, _format, rescale)                            \
-    if (param().format == _format) {                                          \
-        return winograd::StrategyHelper<                                      \
-                _ctype, _dst_type, _input_filter_compute_type,                \
-                _output_compute_type, _format>::filter(src_ptr, dst_ptr,      \
-                                                       workspace_ptr, OC, IC, \
-                                                       0, OC, m, FW,          \
-                                                       interp_points,         \
-                                                       src.layout.dtype,      \
-                                                       rescale);              \
+
+#define cb(_ctype, _dst_type, _input_filter_compute_type,                    \
+           _output_compute_type, _format, rescale)                           \
+    if (param().format == _format) {                                         \
+        return winograd::StrategyHelper<                                     \
+                _ctype, _dst_type, _input_filter_compute_type,               \
+                _output_compute_type, param::ConvBias::Format::NCHW,         \
+                _format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \
+                                 OC, m, FW, interp_points, src.layout.dtype, \
+                                 rescale);                                   \
     }
 
 #define DISPATCH_FORMAT_MK4(_ctype, _dst_type, _input_filter_compute_type,  \
@@ -110,8 +110,9 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
         DISPATCH_KERNEL(dt_float16, dt_float16, dt_float16, dt_float16,      \
                         DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 2);          \
     })
-    //! normal nchw mode
+
     if (src.layout.ndim <= 5) {
+        //! dispatch on dtype, taking layout and format into consideration.
         if (FW == 3) {
             if (m == 2) {
                 std::vector<float> interp_points = {0, 1, -1};
@@ -131,22 +132,20 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
                 DISPATCH_DTYPE(3);
             }
         }
-    }
 #undef cb
 #undef DISPATCH_FORMAT_MK4
 #undef DISPATCH_FORMAT_MK8
 #undef DISPATCH_DTYPE
-#define cb(_ctype, _dst_type, _input_filter_compute_type,                     \
-           _output_compute_type, _format, rescale)                            \
-    if (param().format == _format) {                                          \
-        return winograd::StrategyHelperNchwxx<                                \
-                _ctype, _dst_type, _input_filter_compute_type,                \
-                _output_compute_type, _format>::filter(src_ptr, dst_ptr,      \
-                                                       workspace_ptr, OC, IC, \
-                                                       0, OC, m, FW,          \
-                                                       interp_points,         \
-                                                       src.layout.dtype,      \
-                                                       rescale);              \
+    } else {
+#define cb(_ctype, _dst_type, _input_filter_compute_type,                    \
+           _output_compute_type, _format, rescale)                           \
+    if (param().format == _format) {                                         \
+        return winograd::StrategyHelper<                                     \
+                _ctype, _dst_type, _input_filter_compute_type,               \
+                _output_compute_type, param::ConvBias::Format::NCHW88,       \
+                _format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \
+                                 OC, m, FW, interp_points, src.layout.dtype, \
+                                 rescale);                                   \
     }
 
 #define DISPATCH_FORMAT_MK8(_ctype, _dst_type, _input_filter_compute_type,  \
@@ -159,8 +158,6 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
         DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \
                         DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 0);     \
     }
-    //! nchwxx mode
-    else {
         megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7);
         if (FW == 3) {
             if (m == 2) {
@@ -171,11 +168,11 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
                 DISPATCH_DTYPE(5);
             }
         }
-    }
 #undef cb
 #undef DISPATCH_FORMAT_MK8
 #undef DISPATCH_KERNEL
 #undef DISPATCH_DTYPE
+    }
     megdnn_assert(execed,
                   "Unsupport winograd filter preprocess. m: %zu src: %s", m,
                   src.layout.to_string().c_str());
-- 
GitLab