Commit 75eebb7c authored by Megvii Engine Team

feat(opr): use weight preprocess feature of MegDNN

GitOrigin-RevId: 779041f8a87051e58d5e0ca289773b05a261a8a0
Parent 66509a54
...@@ -51,6 +51,17 @@ protected:
};
using SeparableConv = SeparableConvForward;
namespace detail {
struct PreprocessedFilter {
//! user data; its lifetime should be bound to MegDNN Convolution
//! operator
void* algorithm_id;
TensorNDArray tensors;
};
} // namespace detail
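// A minimal usage sketch, inferred from this commit (the operator methods
// referenced here are introduced further down in the diff):
//
//   PreprocessedFilter pf;
//   pf.algorithm_id = nullptr;   // to be filled by the chosen algorithm
//   pf.tensors = {...};          // caller-owned storage for transformed weights
//   opr->exec_preprocess(..., &pf, workspace);    // transform weights once
//   opr->exec(src, filter, dst, &pf, workspace);  // reuse on every later run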
/**
* \brief base class for convolution operation
*
...@@ -131,13 +142,7 @@ public:
return flag;
}
};
using PreprocessedFilter = detail::PreprocessedFilter;
struct PreprocessedFilter {
//! user data; its lifetime should be bound to MegDNN Convolution
//! operator
void* algorithm_id;
TensorNDArray tensors;
};
protected:
// Check or deduce output DType
...
...@@ -10,6 +10,7 @@
*/
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/io.h"
#include "megbrain/graph/grad_impl.h" #include "megbrain/graph/grad_impl.h"
#include "megbrain/system.h" #include "megbrain/system.h"
...@@ -95,67 +96,14 @@ MGB_FOREACH_FASTRUN_OPR(cb) ...@@ -95,67 +96,14 @@ MGB_FOREACH_FASTRUN_OPR(cb)
#undef cb #undef cb
template <class MGBOpr>
struct OprAttributeTrait {
static bool is_weights_persistent(const MGBOpr*) { return false; }
};
template <>
struct OprAttributeTrait<opr::ConvBias> {
//! return true if the flag of weights is PERSISTENT_DEVICE_VALUE, false
//! otherwise. True means weights can be transformed in the first run.
static bool is_weights_persistent(const opr::ConvBias* opr) {
return opr->input()[1]->contain_flag(
VarNode::Flag::PERSISTENT_DEVICE_VALUE);
}
};
template <typename Opr>
constexpr bool opr_supports_preprocess() {
return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
std::is_same<Opr, megdnn::ConvBias>::value;
}
template <typename Opr>
struct OprArityTrait;
#define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
template <typename Opr, int _arity_in, int _arity_out>
struct OprArityTraitTmpl {
static constexpr int arity_in = _arity_in;
static constexpr int arity_out = _arity_out;
static constexpr int arity = arity_in + arity_out;
using Algorithm = typename Opr::Algorithm;
using TensorLayoutArray = std::array<TensorLayout, arity>;
static size_t get_workspace_in_bytes(Opr* opr, Algorithm* algo,
const TensorLayoutArray& layouts) {
opr->execution_policy() = {algo};
size_t workspace_size;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto) {
workspace_size = APPLY(
opr->get_workspace_in_bytes(args..., nullptr), layouts);
}, /* else */ [&](auto) {
workspace_size =
APPLY(opr->get_workspace_in_bytes(args...), layouts);
});
return workspace_size;
}
static void exec(Opr* opr,
const std::array<DeviceTensorND, arity_in>& inp_val,
const std::array<DeviceTensorND, arity_out>& out_val,
megdnn::Workspace& workspace) {
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto) {
APPLY(opr->exec(args.as_megdnn()..., nullptr, workspace), inp_val,
out_val);
}, /* else */ [&](auto) {
APPLY(opr->exec(args.as_megdnn()..., workspace), inp_val, out_val);
});
}
};
#define INST_ARITY(_Opr, _in, _out) \
...@@ -179,6 +127,26 @@ INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3);
#undef INST_ARITY
template <typename Opr>
constexpr bool opr_supports_preprocess() {
return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
std::is_same<Opr, megdnn::ConvBias>::value;
}
template <typename Opr, bool has_prep>
struct PreprocessFilterImpl {
using T = union {};
};
template <typename Opr>
struct PreprocessFilterImpl<Opr, true> {
using T = typename Opr::PreprocessedFilter;
};
template <typename Opr>
using PreprocessFilter =
typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T;
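// PreprocessFilterImpl falls back to an empty union type for operators that
// do not support preprocessing, so PreprocessFilter<Opr> stays well-formed
// for every Opr and the if_constexpr branches below can mention it freely.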
// timeout delta to be added with fastest known algorithm for new algos
constexpr double TIMEOUT_TOLERANCE = 2;
...@@ -225,6 +193,7 @@ public:
CompNode::Locator comp_node_loc;
ConvTensorShapes shapes;
typename Opr::Param opr_param;
bool allow_weight_preprocess;
//! filled by profile()
mutable double actual_timeout;
...@@ -277,6 +246,10 @@ double TimedProfiler<Opr>::init_timeout_setting() {
return 0;
}
#define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
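// APPLY expands a tuple of arguments into a variadic call; for example,
// APPLY(opr->exec(args.as_megdnn()..., ws), inp_val, out_val) becomes
// opr->exec(inp_val[0].as_megdnn(), ..., out_val[N-1].as_megdnn(), ws).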
template <typename Opr>
typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
const TParam& raw_param) {
...@@ -324,6 +297,16 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
megdnn_opr->execution_policy() = {algo};
}
// Allocate preprocessed weight buffers.
TensorLayoutArray preprocessed_layout;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (param.allow_weight_preprocess) {
preprocessed_layout = APPLY(
_(megdnn_opr)->deduce_preprocessed_filter_layout(args...),
layouts);
}
});
{
// first allocate a whole chunk to avoid memory fragmentation (here we
// rely on memory allocator to reuse memory)
...@@ -332,6 +315,9 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
DeviceTensorStorage storage{cn};
storage.ensure_size(tot_size);
...@@ -362,15 +348,46 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
mdn_workspace.raw_ptr = workspace.raw_ptr();
}
// allocate storage for preprocessed filter
SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size());
for (size_t i = 0; i < preprocessed_layout.size(); i++) {
flt_val[i] = {cn, preprocessed_layout[i], preprocessed_layout[i].dtype,
preprocessed_layout[i].format};
}
for (int i = 0; i < arity_in; ++i) {
fill_zero_dev_tensor(inp_val[i]);
}
PreprocessFilter<Opr> prep_flt;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (!preprocessed_layout.empty()) {
auto&& pf = _(prep_flt);
pf.algorithm_id = nullptr;
pf.tensors.resize(flt_val.size());
for (size_t i = 0; i < flt_val.size(); i++) {
pf.tensors[i] = flt_val[i].as_megdnn();
}
APPLY(_(megdnn_opr)->exec_preprocess(args..., &pf, mdn_workspace),
std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()),
array_skip<2>(layouts));
}
});
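// Note the argument mix above: exec_preprocess() takes the src/dst layouts
// but the filter as a value tensor, hence the forward_as_tuple of layouts[0]
// and inp_val[1] followed by the remaining layouts.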
RealTimer timer;
auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER),
ev_end = cn.create_event(CompNode::Event::NEED_TIMER);
ev_start->record();
OprArityTrait<Opr>::exec(megdnn_opr.get(), inp_val, out_val, mdn_workspace);
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
auto&& opr = _(megdnn_opr);
PreprocessFilter<Opr>* pf =
preprocessed_layout.empty() ? nullptr : &prep_flt;
APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val,
out_val);
}, /* else */ [&](auto _) {
APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val,
out_val);
});
ev_end->record();
double next_report_time = 0.5;
...@@ -425,13 +442,15 @@ class AlgoChooser {
const ConvTensorLayouts& m_layouts;
Opr* m_megdnn_opr;
const MGBOpr* m_mgb_opr;
bool m_allow_weight_preprocess;
public:
ExeContext(const ConvTensorLayouts& layouts, Opr* megdnn_opr,
const MGBOpr* mgb_opr)
const MGBOpr* mgb_opr, bool allow_weight_preprocess)
: m_layouts{layouts},
m_megdnn_opr{megdnn_opr},
m_mgb_opr{mgb_opr} {
m_mgb_opr{mgb_opr},
m_allow_weight_preprocess{allow_weight_preprocess} {
mgb_assert(m_layouts.size() == layouts.size());
static_assert(
std::tuple_size<ConvTensorLayouts>::value == 3 ||
...@@ -499,8 +518,23 @@ class AlgoChooser {
//! get workspace size required for specific algo
size_t get_workspace_size_bytes(ImplAlgo algo) const {
return OprArityTrait<Opr>::get_workspace_in_bytes(m_megdnn_opr,
algo, m_layouts);
m_megdnn_opr->execution_policy() = {algo};
size_t result;
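// For preprocess-capable operators, report the max of the preprocess-time
// and exec-time workspace requirements: a single allocation serves both
// phases.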
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
auto&& opr = _(m_megdnn_opr);
auto prep = construct_fake_preprocess_filter();
PreprocessFilter<Opr>* prep_ptr =
prep.valid() ? &prep.val() : nullptr;
result = std::max(
APPLY(opr->get_preprocess_workspace_in_bytes(args...),
m_layouts),
APPLY(opr->get_workspace_in_bytes(args..., prep_ptr),
m_layouts));
}, /* else */ [&](auto _) {
result = APPLY(_(m_megdnn_opr)->get_workspace_in_bytes(args...),
m_layouts);
});
return result;
}
/*!
...@@ -525,6 +559,28 @@
*/
void modify_param_with_weights_preprocessed( void modify_param_with_weights_preprocessed(
typename TimedProfiler<Opr>::Param& param) const {} typename TimedProfiler<Opr>::Param& param) const {}
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const {
Maybe<PreprocessFilter<Opr>> result = None;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (!m_allow_weight_preprocess)
return;
auto opr = _(m_megdnn_opr);
auto layout =
APPLY(opr->deduce_preprocessed_filter_layout(args...),
m_layouts);
if (layout.empty())
return;
result = PreprocessFilter<Opr>{};
auto& res = result.val();
res.algorithm_id = nullptr;
res.tensors.resize(layout.size());
for (size_t i = 0; i < layout.size(); i++) {
res.tensors[i] = megdnn::TensorND(nullptr, layout[i]);
}
});
return result;
}
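// The filter built above is "fake" in that its tensors carry layouts but a
// null raw_ptr: it lets workspace queries see the shapes of a real
// preprocessed filter without allocating device memory.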
};
//! entrance for getting algorithm according to execution strategy
...@@ -571,12 +627,13 @@ public:
* \brief setup algorithm and return workspace size
*/
static size_t setup_algo(const ConvTensorLayouts& layouts, Opr* megdnn_opr,
const MGBOpr* mgb_opr) {
const MGBOpr* mgb_opr,
bool allow_weight_preprocess = false) {
if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) {
return 0;
}
ExeContext ctx(layouts, megdnn_opr, mgb_opr);
ExeContext ctx(layouts, megdnn_opr, mgb_opr, allow_weight_preprocess);
auto algo = get_algo(ctx);
size_t workspace = ctx.get_workspace_size_bytes(algo);
...@@ -780,9 +837,6 @@ Maybe<AlgoChooserProfileCache::ResultEntry>
AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo,
double& timeout) const {
typename TimedProfiler<Opr>::Param param;
bool is_weights_persistent =
OprAttributeTrait<typename MegDNNOpr2MGBOpr<Opr>::MGBOpr>::
is_weights_persistent(m_mgb_opr);
auto name = algo->name();
// force check copy size <= dest len-1 from gcc8 for safe
auto len = sizeof(param.algo_name);
...@@ -806,8 +860,9 @@ AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo,
for (size_t i = 0; i < param.shapes.size(); ++i)
param.shapes[i] = m_layouts[i];
param.opr_param = m_megdnn_opr->param();
param.allow_weight_preprocess = m_allow_weight_preprocess;
if (is_weights_persistent) {
if (m_allow_weight_preprocess) {
modify_param_with_weights_preprocessed(param);
}
...@@ -911,6 +966,78 @@ AlgoChooserProfileCache& mixin::Convolution::profile_cache() const {
return *m_profile_cache;
}
class mixin::WeightPreprocessExecutor::PreprocessedFilterExecDep final
: public cg::GraphExecutable::ExecDependency {
std::unique_ptr<PreprocessedFilter> m_pf;
SmallVector<DeviceTensorND> m_filter_storage;
public:
explicit PreprocessedFilterExecDep(
std::unique_ptr<PreprocessedFilter> preprocessed_filter,
SmallVector<DeviceTensorND> filter_storage)
: m_pf(std::move(preprocessed_filter)),
m_filter_storage(std::move(filter_storage)) {}
};
void mixin::WeightPreprocessExecutor::mixin_update_preprocessed_filter(
cg::OperatorNodeBase& opr) {
if (!mixin_allow_weight_preprocess(opr)) return;
auto new_layout = deduce_preprocessed_filter_layout();
if (new_layout.empty()) {
// Weight preprocess was needed before, but no longer needed.
if (m_preprocessed_filter) {
m_preprocessed_filter.reset();
m_filter_storage.clear();
}
return;
}
bool should_update = false;
size_t new_size = new_layout.size();
if (!m_preprocessed_filter ||
m_preprocessed_filter->tensors.size() != new_size) {
should_update = true;
} else {
for (size_t i = 0; i < new_size; i++) {
if (!new_layout[i].eq_layout(
m_preprocessed_filter->tensors[i].layout)) {
should_update = true;
break;
}
}
}
if (!should_update) return;
if (!m_preprocessed_filter) {
m_preprocessed_filter.reset(new PreprocessedFilter{});
}
m_preprocessed_filter->tensors.resize(new_size);
m_filter_storage.resize(new_size);
m_preprocessed_filter->algorithm_id = nullptr;
for (size_t i = 0; i < new_size; i++) {
m_filter_storage[i] = {opr.output(0)->comp_node(), new_layout[i],
new_layout[i].dtype, new_layout[i].format};
m_preprocessed_filter->tensors[i] = m_filter_storage[i].as_megdnn();
}
scn_do_execute_preprocess();
}
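// Net effect: the filter is re-preprocessed only when the deduced layouts
// change; steady-state executions reuse the cached transformed weights.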
void mixin::WeightPreprocessExecutor::record_preprocessed_weight(
cg::GraphExecutable::ExecDependencyArray& deps) {
deps.emplace_back(new PreprocessedFilterExecDep{
std::move(m_preprocessed_filter), std::move(m_filter_storage)});
}
bool mixin::WeightPreprocessExecutor::mixin_allow_weight_preprocess(
const cg::OperatorNodeBase& opr) const {
bool param_merged = opr.input(1)
->owner_opr()
->same_type<opr::MultipleDeviceTensorHolder>();
return opr.input(1)->contain_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE) &&
(cg::is_const_var_value(opr.input(1)) || param_merged);
}
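// Preprocessing is allowed only for a filter that is persistent on the
// device and whose value is constant, either directly or because parameters
// were merged into a MultipleDeviceTensorHolder; anything else could leave
// the cached transformed weights stale after the first run.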
/* ==================== ConvolutionForward ==================== */
IMPL_CONV(ConvolutionForward, "conv_fwd");
...@@ -971,7 +1098,7 @@ size_t ConvolutionForward::get_workspace_size_bytes(
input(0)->format()},
{input_shapes[1], input(1)->dtype(), input(1)->format()},
{output_shapes[0], output(0)->dtype(), output(0)->format()}},
megdnn_opr(), this);
megdnn_opr(), this, allow_weight_preprocess());
}
void ConvolutionForward::init_output_format() {
...@@ -980,9 +1107,14 @@ void ConvolutionForward::init_output_format() {
}
void ConvolutionForward::scn_do_execute() {
if (input(1)->contain_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE) &&
cg::is_const_var_value(input(1))) {
update_preprocessed_filter();
}
megdnn_opr()->exec(input(0)->dev_tensor().as_megdnn(),
input(1)->dev_tensor().as_megdnn(),
output(0)->dev_tensor().as_megdnn(), nullptr,
output(0)->dev_tensor().as_megdnn(),
preprocessed_filter(),
intl::get_megdnn_workspace_from_var(output().back()));
}
...@@ -1012,6 +1144,20 @@ void ConvolutionForward::get_output_var_shape(
void ConvolutionForward::record_execute_deps(
cg::GraphExecutable::ExecDependencyArray& deps) {
record_megdnn_opr(deps);
record_preprocessed_weight(deps);
}
SmallVector<TensorLayout>
ConvolutionForward::deduce_preprocessed_filter_layout() {
return megdnn_opr()->deduce_preprocessed_filter_layout(
input(0)->layout(), input(1)->layout(), output(0)->layout());
}
void ConvolutionForward::scn_do_execute_preprocess() {
megdnn_opr()->exec_preprocess(
input(0)->layout(), input(1)->dev_tensor().as_megdnn(),
output(0)->layout(), preprocessed_filter(),
intl::get_megdnn_workspace_from_var(output().back()));
}
/* ==================== ConvolutionBackwardData ==================== */
...@@ -1504,10 +1650,12 @@ size_t ConvBiasForward::get_workspace_size_bytes(
i2,
i3,
{output_shapes[0], output(0)->dtype(), output(0)->format()}},
mo, this);
mo, this, allow_weight_preprocess());
}
void ConvBiasForward::scn_do_execute() {
update_preprocessed_filter();
auto&& inp = input();
auto mo = megdnn_opr();
if (inp.size() == 2) {
...@@ -1621,6 +1769,33 @@ megdnn::param::MatrixMul::Format ConvBiasForward::get_matmul_format(
}
}
SmallVector<TensorLayout> ConvBiasForward::deduce_preprocessed_filter_layout() {
TensorLayout i2, i3;
if (input().size() > 2) {
i2 = input(2)->layout();
}
if (input().size() > 3) {
i3 = input(3)->layout();
}
return megdnn_opr()->deduce_preprocessed_filter_layout(
input(0)->layout(), input(1)->layout(), i2, i3,
output(0)->layout());
}
void ConvBiasForward::scn_do_execute_preprocess() {
TensorLayout bias_layout(output(0)->dtype()), z_layout(output(0)->dtype());
if (input().size() > 2) {
bias_layout = input(2)->layout();
}
if (input().size() > 3) {
z_layout = input(3)->layout();
}
megdnn_opr()->exec_preprocess(
input(0)->layout(), input(1)->dev_tensor().as_megdnn(), bias_layout,
z_layout, output(0)->layout(), preprocessed_filter(),
intl::get_megdnn_workspace_from_var(output().back()));
}
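// Presumably, passing default-constructed layouts (carrying only the output
// dtype) for absent bias/z inputs lets the MegDNN side distinguish the
// 2- and 3-input forms; this mirrors the empty i2/i3 handling above.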
/* ===================== LocalShareForward ==================== */
IMPL_CONV(LocalShareForward, "local_share");
...
...@@ -72,13 +72,52 @@ class Convolution {
cg::OperatorNodeBase* self);
};
class WeightPreprocessExecutor : public cg::OperatorNodeMixinBase {
class PreprocessedFilterExecDep;
using PreprocessedFilter = megdnn::detail::PreprocessedFilter;
std::unique_ptr<PreprocessedFilter> m_preprocessed_filter;
SmallVector<DeviceTensorND> m_filter_storage;
protected:
//! this should only be called in scn_do_execute or similar functions (i.e.
//! post dispatch-to-ExecEnv)
void mixin_update_preprocessed_filter(OperatorNodeBase& opr);
void record_preprocessed_weight(
cg::GraphExecutable::ExecDependencyArray& deps);
PreprocessedFilter* preprocessed_filter() const {
return m_preprocessed_filter.get();
}
bool mixin_allow_weight_preprocess(const OperatorNodeBase& opr) const;
virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout() = 0;
virtual void scn_do_execute_preprocess() = 0;
};
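// The mixin owns both the megdnn-facing PreprocessedFilter and the backing
// DeviceTensorND storage, keeping transformed weights alive until they are
// handed off to the executable via record_preprocessed_weight().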
} // namespace mixin
namespace intl {
//! glue class to apply mixin::WeightPreprocessExecutor
template<class Base = cg::OperatorNodeBase,
class MixinImpl = mixin::WeightPreprocessExecutor>
class OprWithWeightPreprocess: public mixin::CheckBase<Base>::Base,
public MixinImpl {
protected:
using Base::Base;
void update_preprocessed_filter() {
this->mixin_update_preprocessed_filter(*this);
}
bool allow_weight_preprocess() const {
return this->mixin_allow_weight_preprocess(*this);
}
};
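// The glue class forwards the mixin entry points with *this as the operator
// node, so concrete oprs can call update_preprocessed_filter() and
// allow_weight_preprocess() without further plumbing.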
using ConvBiasBase = cg::SingleCNOperatorNode<
cg::OutshapePureByInshapeOpr<>,
mixin::MegDNNOprHolderImpl<megdnn::ConvBiasForward>>;
using ConvBiasForwardBase = WorkspaceSizeInfer<ConvBiasBase>;
using ConvBiasForwardBase =
OprWithWeightPreprocess<WorkspaceSizeInfer<ConvBiasBase>>;
using DeformableConvBackwardDataT = cg::SingleCNOperatorNode<
cg::OutshapePureByInshapeOpr<>,
...@@ -90,12 +129,20 @@ namespace intl {
mixin::MegDNNOprHolderImpl<megdnn::BatchConvBiasForward>>;
using BatchConvBiasForwardBase = WorkspaceSizeInfer<BatchConvBiasBase>;
using ConvolutionForwardBase = WorkspaceSizeInfer<
typename MegDNNOprWrapperFwdBase<megdnn::ConvolutionForward>::Base>;
using ConvolutionForwardBase = OprWithWeightPreprocess<
WorkspaceSizeInfer<typename MegDNNOprWrapperFwdBase<
megdnn::ConvolutionForward>::Base>>;
} // namespace intl
namespace testing {
class ConvolutionTestingPeer;
} // namespace testing
MGB_DEFINE_OPR_CLASS(ConvolutionForward,
intl::ConvolutionForwardBase, public mixin::Convolution) // {
void init_profile_cache() override;
void init_output_dtype() override;
size_t get_workspace_size_bytes(
...@@ -109,6 +156,10 @@ MGB_DEFINE_OPR_CLASS(ConvolutionForward,
TensorShapeArray& out_shape) const override final;
void record_execute_deps(
cg::GraphExecutable::ExecDependencyArray& deps) override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout() override;
void scn_do_execute_preprocess() override;
friend testing::ConvolutionTestingPeer;
public:
ConvolutionForward(VarNode *src, VarNode *filter,
...@@ -142,7 +193,10 @@ MGB_DEFINE_OPR_CLASS(ConvBiasForward, intl::ConvBiasForwardBase,
void record_execute_deps(
cg::GraphExecutable::ExecDependencyArray& deps) override {
this->record_megdnn_opr(deps);
this->record_preprocessed_weight(deps);
}
SmallVector<TensorLayout> deduce_preprocessed_filter_layout() override;
void scn_do_execute_preprocess() override;
public:
//! src * filter
...
...@@ -21,6 +21,8 @@
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/tensor_manip.h"
#include <gmock/gmock.h>
#include <cmath>
#include <random>
...@@ -244,7 +246,6 @@ opr::Convolution::Param convert_to_conv_param(
param.dilate_w, param.sparse, param.format};
};
#endif
} // anonymous namespace
TEST(TestOprDNN, ConvolutionForward) {
uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
...@@ -1172,6 +1173,7 @@ TEST(TestOprDNN, ConvBiasForward) {
{1, OC, 1, 1}},
opt3);
};
run(1, 1, 1, 5, 5, 1, 1);
run(1, 1, 1, 5, 5, 3, 3);
run(2, 3, 4, 5, 5, 3, 3);
run(3, 3, 4, 224, 223, 3, 3);
...@@ -2124,4 +2126,225 @@ TEST(TestOprDNN, ConvolutionMultiCompNode) {
#endif
} // anonymous namespace
namespace mgb {
namespace opr {
namespace testing {
class ConvolutionTestingPeer {
opr::ConvolutionForward& m_conv_opr;
public:
explicit ConvolutionTestingPeer(cg::OperatorNodeBase* opr)
: m_conv_opr(opr->cast_final_safe<opr::ConvolutionForward>()) {}
void set_megdnn_opr(
std::unique_ptr<megdnn::ConvolutionForward> megdnn_opr) {
m_conv_opr.set_megdnn_opr(std::move(megdnn_opr));
}
};
} // namespace testing
} // namespace opr
} // namespace mgb
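// ConvolutionTestingPeer lives in opr::testing so it can reach the private
// set_megdnn_opr() of ConvolutionForward (befriended in the header), letting
// the tests below swap in a gmock-based operator.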
namespace {
using megdnn::TensorND;
using megdnn::Workspace;
using opr::testing::ConvolutionTestingPeer;
class MockConvolutionForward : public megdnn::ConvolutionForward {
const char* m_algorithm_set_name;
public:
MockConvolutionForward(megdnn::ConvolutionForward* orig,
const char* algo_set_name)
: megdnn::ConvolutionForward(orig->handle()),
m_algorithm_set_name(algo_set_name) {}
MOCK_METHOD5(exec, void(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace));
MOCK_METHOD5(exec_preprocess,
void(const TensorLayout& src_layout, _megdnn_tensor_in filter,
const TensorLayout& dst_layout,
PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace));
MOCK_METHOD4(get_workspace_in_bytes,
size_t(const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter));
MOCK_METHOD3(deduce_preprocessed_filter_layout,
SmallVector<TensorLayout>(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst));
MOCK_METHOD3(get_preprocess_workspace_in_bytes,
size_t(const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst));
MOCK_METHOD3(get_all_algorithms,
std::vector<Algorithm*>(const TensorLayout& p0,
const TensorLayout& p1,
const TensorLayout& p2));
MOCK_METHOD5(get_algorithm_heuristic,
Algorithm*(const TensorLayout& p0, const TensorLayout& p1,
const TensorLayout& p2,
size_t workspace_limit_in_bytes,
bool reproducible));
const char* get_algorithm_set_name() const override {
return m_algorithm_set_name;
}
};
class MockAlgorithm : public megdnn::detail::Algorithm {
const char* m_name;
public:
MockAlgorithm(const char* name = "NotImportant") : m_name(name) {}
bool is_reproducible() const override { return true; }
const char* name() const override { return m_name; }
virtual ~MockAlgorithm() = default;
};
class TestWeightPreprocess : public ::testing::Test {
protected:
CompNode comp_node;
std::shared_ptr<ComputingGraph> graph;
std::shared_ptr<HostTensorND> x_host;
MockConvolutionForward* mock_conv_ptr;
SymbolVar y;
HostTensorND y_host;
std::unique_ptr<cg::AsyncExecutable> func;
MockConvolutionForward& mock_conv() { return *mock_conv_ptr; }
void SetUp() override {
constexpr uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2,
iw = ih;
comp_node = CompNode::load("cpux");
graph = ComputingGraph::make();
TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
auto x = opr::Host2DeviceCopy::make(*graph, x_host);
auto w = opr::ImmutableTensor::make(*graph, {comp_node, w_shape});
Param param;
param.pad_h = param.pad_w = ph;
param.stride_h = param.stride_w = sh;
param.format = Param::Format::NCHW;
y = opr::ConvolutionForward::make(x, w, param);
auto& opr =
y.node()->owner_opr()->cast_final<opr::ConvolutionForward>();
auto mock = std::make_unique<MockConvolutionForward>(
opr.megdnn_opr(), ::testing::UnitTest::GetInstance()
->current_test_info()
->name());
mock_conv_ptr = mock.get();
ConvolutionTestingPeer{&opr}.set_megdnn_opr(std::move(mock));
func = graph->compile({make_callback_copy(y, y_host)});
}
void run() { func->execute().wait(); }
void TearDown() override {
func.reset();
// Triggers mock check
graph.reset();
x_host.reset();
}
};
TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
using ::testing::_;
using ::testing::Return;
auto& mock = mock_conv();
MockAlgorithm algo;
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
.WillRepeatedly(Return(0));
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
.WillRepeatedly(Return(0));
{
::testing::InSequence seq;
// Return empty preprocess filters, indicating no need to preprocess
EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
.WillRepeatedly(Return(SmallVector<TensorLayout>{}));
EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
run();
}
}
TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
using ::testing::_;
using ::testing::Return;
using ::testing::Field;
using ::testing::Invoke;
using ::testing::Expectation;
using PF = MockConvolutionForward::PreprocessedFilter;
auto& mock = mock_conv();
MockAlgorithm algo;
SmallVector<TensorLayout> filter_layout{{{1, 2, 3, 4}, dtype::Float32()},
{{5, 6, 7, 8}, dtype::Float32()}};
EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
.WillRepeatedly(Return(filter_layout));
Expectation algo_call =
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillOnce(Return(&algo));
Expectation ws_call = EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
.After(algo_call)
.WillOnce(Return(0));
Expectation pre_ws_call =
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
.After(algo_call)
.WillOnce(Return(233));
{
::testing::InSequence seq;
// exec_preprocess should be called only once, with workspace allocated
int salt = 0;
EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _))
.After(ws_call, pre_ws_call)
.WillOnce(Invoke([&](const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, PF* pf,
_megdnn_workspace workspace) {
ASSERT_EQ(workspace.size, 233);
ASSERT_NE(pf, nullptr);
pf->algorithm_id = &salt;
ASSERT_EQ(pf->tensors.size(), 2);
ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
ASSERT_NE(pf->tensors[0].raw_ptr, nullptr);
ASSERT_NE(pf->tensors[1].raw_ptr, nullptr);
pf->tensors[0].ptr<float>()[0] = 114.514f;
pf->tensors[1].ptr<float>()[0] = 1926.0817f;
}));
// Run the graph multiple times.
for (int i = 0; i < 3; i++) {
if (i > 0) {
EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
}
EXPECT_CALL(mock, exec(_, _, _, _, _))
.WillOnce(Invoke([&](_megdnn_tensor_in, _megdnn_tensor_in,
_megdnn_tensor_out, const PF* pf,
_megdnn_workspace) {
ASSERT_NE(pf, nullptr);
ASSERT_EQ(pf->algorithm_id, &salt);
ASSERT_EQ(pf->tensors[0].ptr<float>()[0], 114.514f);
ASSERT_EQ(pf->tensors[1].ptr<float>()[0], 1926.0817f);
}));
run();
}
}
}
} // anonymous namespace
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}