From 44c8d2d16f7276a00a170eeb2f7cfe9d6277184f Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 26 Jan 2021 16:20:33 +0800 Subject: [PATCH] refactor(megdnn): refactor matmul algo in deformable conv GitOrigin-RevId: 05291baf98f36141ccb6d686e2f92a58766d848c --- dnn/src/cuda/deformable_conv/bwd_data/algo.h | 12 +-- .../deformable_conv/bwd_data/algo_matmul.cpp | 96 ++++++++++++------- dnn/src/cuda/deformable_conv/bwd_flt/algo.h | 10 +- .../deformable_conv/bwd_flt/algo_matmul.cpp | 95 +++++++++++------- dnn/src/cuda/deformable_conv/fwd/algo.h | 10 +- .../cuda/deformable_conv/fwd/algo_matmul.cpp | 93 ++++++++++++------ 6 files changed, 208 insertions(+), 108 deletions(-) diff --git a/dnn/src/cuda/deformable_conv/bwd_data/algo.h b/dnn/src/cuda/deformable_conv/bwd_data/algo.h index bbc6beb80..70be564d2 100644 --- a/dnn/src/cuda/deformable_conv/bwd_data/algo.h +++ b/dnn/src/cuda/deformable_conv/bwd_data/algo.h @@ -102,24 +102,24 @@ class DeformableConvBackwardDataImpl::AlgoMatmul final : public AlgoBase { private: static WorkspaceBundle get_bundle(const SizeArgs& args); - static void get_matmul_layout(const SizeArgs& args, TensorLayout& al, - TensorLayout& bl, TensorLayout& cl); - public: - AlgoMatmul() {} - bool is_available(const SizeArgs& args) const override; size_t get_workspace_in_bytes(const SizeArgs& args) const override; void exec(const ExecArgs& args) const override; bool is_reproducible() const override { return true; } - const char* name() const override { return "AlgoMatmul"; } + std::vector get_subopr_list( + const TensorLayoutArray& layouts, + const OperatorBase* opr) const override; + + const char* name() const override { return "MATMUL"; } MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) }; class DeformableConvBackwardDataImpl::AlgoPack : NonCopyableObj { AlgoBase::Mapper m_all_algos_map; + public: AlgoPack(); AlgoMatmul algo_matmul; diff --git a/dnn/src/cuda/deformable_conv/bwd_data/algo_matmul.cpp b/dnn/src/cuda/deformable_conv/bwd_data/algo_matmul.cpp index 83d1786bc..e1befc395 100644 --- a/dnn/src/cuda/deformable_conv/bwd_data/algo_matmul.cpp +++ b/dnn/src/cuda/deformable_conv/bwd_data/algo_matmul.cpp @@ -57,24 +57,47 @@ deformable_conv::Param create_param(const Algo::SizeArgs& args, return p; } -}; // anonymous namespace -bool Algo::is_available(const SizeArgs&) const { - return true; +std::pair sub_opr_config( + const DeformableConvForwardImpl::CanonizedFilterMeta& fm, + const TensorLayout& im, + const TensorLayout& out_grad) { + auto&& dt = im.dtype; + size_t batch_sz = im[0], OH = out_grad[2], + OW = out_grad[3], FH = fm.spatial[0], FW = fm.spatial[1]; + + size_t M = fm.icpg * FH * FW, K = fm.ocpg, N = batch_sz * OH * OW, + batch = fm.group; + TensorLayout al = {{batch, K, M}, dt}; + TensorLayout bl = {{batch, K, N}, dt}; + TensorLayout cl = {{batch, M, N}, dt}; + + BatchedMatrixMulForward::Param param; + param.compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + param.transposeA = true; + + return {{al, bl, cl}, param}; } -void Algo::get_matmul_layout(const SizeArgs& args, TensorLayout& al, - TensorLayout& bl, TensorLayout& cl) { - auto&& dt = args.im_layout.dtype; - auto&& fm = args.filter_meta; - size_t batch_sz = args.im_layout[0], OH = args.out_grad_layout[2], - OW = args.out_grad_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; +}; // anonymous namespace - size_t M = fm.icpg * FH * FW, K = fm.ocpg, N = batch_sz * OH * OW, - batch = fm.group; - al = {{batch, K, M}, dt}; - bl = {{batch, K, N}, dt}; - cl = {{batch, M, N}, dt}; +std::vector +Algo::get_subopr_list( + const TensorLayoutArray& layouts, const OperatorBase* opr) const { + const DeformableConvBackwardDataImpl* deformable_conv = + static_cast(opr); + CanonizedFilterMeta fm = deformable_conv->make_canonized_filter_meta( + layouts[0].ndim, layouts[1], layouts[2]); + auto&& config = sub_opr_config(fm, layouts[0], layouts[4]); + + std::string param_str; + Algorithm::serialize_write_pod(config.second, param_str); + return {{Algorithm::OprType::BATCHED_MATRIX_MUL_FORWARD, param_str, + config.first}}; +} + +bool Algo::is_available(const SizeArgs&) const { + return true; } WorkspaceBundle Algo::get_bundle(const SizeArgs& args) { @@ -83,14 +106,20 @@ WorkspaceBundle Algo::get_bundle(const SizeArgs& args) { OC = args.out_grad_layout[1], OH = args.out_grad_layout[2], OW = args.out_grad_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; - auto&& bmm_opr = args.handle->create_operator(); - TensorLayout al, bl, cl; + auto bmatmul_opr = args.handle->create_operator(); + if (args.opr->execution_policy().algo.valid() && + !args.opr->execution_policy().sub_policy.empty()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + bmatmul_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } - get_matmul_layout(args, al, bl, cl); - bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; - bmm_opr->param().transposeA = true; + auto&& config = sub_opr_config(args.filter_meta, args.im_layout, + args.out_grad_layout); + bmatmul_opr->param() = config.second; - size_t bmm_ws = bmm_opr->get_workspace_in_bytes(al, bl, cl); + size_t bmm_ws = bmatmul_opr->get_workspace_in_bytes( + config.first[0], config.first[1], config.first[2]); size_t result_ws = batch_sz * IC * FH * FW * OH * OW * sizeof(float); size_t relayout_ws1 = batch_sz * OC * OH * OW * sizeof(float); size_t relayout_ws2 = batch_sz * IC * FH * FW * OH * OW * sizeof(float); @@ -154,21 +183,24 @@ void Algo::exec(const ExecArgs& args) const { // matmul [g, icpg, FH, FW, ocpg] * [g, ocpg, N, OH, OW] => // => [g, icpg, FH, FW, N, OH, OW] { - TensorLayout al, bl, cl; - get_matmul_layout(args, al, bl, cl); - - TensorND A(static_cast(dev_filter), al), - B(static_cast(relayout_ws1), bl), - C(static_cast(result_ws), cl); - - size_t bmm_ws_size = bundle.get_size(0); - auto&& bmm_opr = + auto bmatmul_opr = args.handle->create_operator(); + if (args.opr->execution_policy().algo.valid()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + bmatmul_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } - bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; - bmm_opr->param().transposeA = true; + auto&& config = sub_opr_config(args.filter_meta, args.im_layout, + args.out_grad_layout); + bmatmul_opr->param() = config.second; - bmm_opr->exec( + TensorND A(static_cast(dev_filter), config.first[0]), + B(static_cast(relayout_ws1), config.first[1]), + C(static_cast(result_ws), config.first[2]); + + size_t bmm_ws_size = bundle.get_size(0); + bmatmul_opr->exec( A, B, C, Workspace(static_cast(bmm_ws), bmm_ws_size)); } diff --git a/dnn/src/cuda/deformable_conv/bwd_flt/algo.h b/dnn/src/cuda/deformable_conv/bwd_flt/algo.h index 54100708d..83349c308 100644 --- a/dnn/src/cuda/deformable_conv/bwd_flt/algo.h +++ b/dnn/src/cuda/deformable_conv/bwd_flt/algo.h @@ -92,20 +92,20 @@ public: class DeformableConvBackwardFilterImpl::AlgoMatmul final : public AlgoBase { private: - static void get_matmul_layout(const SizeArgs& args, TensorLayout& al, - TensorLayout& bl, TensorLayout& cl); static WorkspaceBundle get_bundle(const SizeArgs& args); public: - AlgoMatmul() {} - bool is_available(const SizeArgs& args) const override; size_t get_workspace_in_bytes(const SizeArgs& args) const override; void exec(const ExecArgs& args) const override; bool is_reproducible() const override { return true; } - const char* name() const override { return "AlgoMatmul"; } + std::vector get_subopr_list( + const TensorLayoutArray& layouts, + const OperatorBase* opr) const override; + + const char* name() const override { return "MATMUL"; } MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) }; diff --git a/dnn/src/cuda/deformable_conv/bwd_flt/algo_matmul.cpp b/dnn/src/cuda/deformable_conv/bwd_flt/algo_matmul.cpp index 32b91270b..e011ebcc5 100644 --- a/dnn/src/cuda/deformable_conv/bwd_flt/algo_matmul.cpp +++ b/dnn/src/cuda/deformable_conv/bwd_flt/algo_matmul.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/cuda/utils.h" @@ -57,25 +58,46 @@ deformable_conv::Param create_param(const Algo::SizeArgs& args, return p; } -}; // anonymous namespace - -bool Algo::is_available(const SizeArgs&) const { - return true; -} -void Algo::get_matmul_layout(const SizeArgs& args, TensorLayout& al, - TensorLayout& bl, TensorLayout& cl) { - auto&& dt = args.im_layout.dtype; - auto&& fm = args.filter_grad_meta; - size_t batch_sz = args.im_layout[0], OH = args.out_grad_layout[2], - OW = args.out_grad_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; +std::pair sub_opr_config( + const DeformableConvBackwardFilterImpl::CanonizedFilterMeta& fm, + const TensorLayout& im, const TensorLayout& out_grad) { + auto&& dt = im.dtype; + size_t batch_sz = im[0], OH = out_grad[2], OW = out_grad[3], + FH = fm.spatial[0], FW = fm.spatial[1]; size_t M = fm.ocpg, K = OH * OW * batch_sz, N = fm.icpg * FH * FW, batch = fm.group; + TensorLayout al = {{batch, M, K}, dt}; + TensorLayout bl = {{batch, N, K}, dt}; + TensorLayout cl = {{batch, M, N}, dt}; + + BatchedMatrixMulForward::Param param; + param.compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + param.transposeB = true; - al = {{batch, M, K}, dt}; - bl = {{batch, N, K}, dt}; - cl = {{batch, M, N}, dt}; + return {{al, bl, cl}, param}; +} + +}; // anonymous namespace + +std::vector +Algo::get_subopr_list( + const TensorLayoutArray& layouts, const OperatorBase* opr) const { + const DeformableConvBackwardFilterImpl* deformable_conv = + static_cast(opr); + CanonizedFilterMeta fm = deformable_conv->make_canonized_filter_meta( + layouts[0].ndim, layouts[4], layouts[1]); + auto&& config = sub_opr_config(fm, layouts[0], layouts[3]); + + std::string param_str; + Algorithm::serialize_write_pod(config.second, param_str); + return {{Algorithm::OprType::BATCHED_MATRIX_MUL_FORWARD, param_str, + config.first}}; +} + +bool Algo::is_available(const SizeArgs&) const { + return true; } WorkspaceBundle Algo::get_bundle(const SizeArgs& args) { @@ -85,16 +107,22 @@ WorkspaceBundle Algo::get_bundle(const SizeArgs& args) { size_t IC = fm.group * fm.icpg, OC = args.out_grad_layout[1]; auto batch_sz = args.im_layout[0]; - auto&& bmm_opr = args.handle->create_operator(); - TensorLayout al, bl, cl; + auto bmatmul_opr = args.handle->create_operator(); + if (args.opr->execution_policy().algo.valid() && + !args.opr->execution_policy().sub_policy.empty()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + bmatmul_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } - get_matmul_layout(args, al, bl, cl); - bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; - bmm_opr->param().transposeB = true; + auto&& config = sub_opr_config(args.filter_grad_meta, args.im_layout, + args.out_grad_layout); + bmatmul_opr->param() = config.second; size_t col_ws = batch_sz * IC * FH * FW * OH * OW * sizeof(float); size_t out_grad_ws = batch_sz * OC * OH * OW * sizeof(float); - size_t bmm_ws = bmm_opr->get_workspace_in_bytes(al, bl, cl); + size_t bmm_ws = bmatmul_opr->get_workspace_in_bytes( + config.first[0], config.first[1], config.first[2]); return {nullptr, {col_ws, out_grad_ws, bmm_ws}}; } @@ -138,20 +166,23 @@ void Algo::exec(const ExecArgs& args) const { args.handle->relayout_opr()->exec(C2, C3); // matmul - TensorLayout al, bl, cl; - get_matmul_layout(args, al, bl, cl); - - TensorND A(static_cast(out_grad_ws), al), - B(static_cast(col_ws), bl), - C(static_cast(dev_filter_grad), cl); + auto bmatmul_opr = args.handle->create_operator(); + if (args.opr->execution_policy().algo.valid()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + bmatmul_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } - size_t bmm_ws_size = bundle.get_size(2); - auto&& bmm_opr = args.handle->create_operator(); + auto&& config = sub_opr_config(args.filter_grad_meta, args.im_layout, + args.out_grad_layout); + bmatmul_opr->param() = config.second; - bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; - bmm_opr->param().transposeB = true; + TensorND A(static_cast(out_grad_ws), config.first[0]), + B(static_cast(col_ws), config.first[1]), + C(static_cast(dev_filter_grad), config.first[2]); - bmm_opr->exec( + size_t bmm_ws_size = bundle.get_size(2); + bmatmul_opr->exec( A, B, C, Workspace(static_cast(bmm_ws), bmm_ws_size)); } diff --git a/dnn/src/cuda/deformable_conv/fwd/algo.h b/dnn/src/cuda/deformable_conv/fwd/algo.h index 52cf5ff75..0dc85f789 100644 --- a/dnn/src/cuda/deformable_conv/fwd/algo.h +++ b/dnn/src/cuda/deformable_conv/fwd/algo.h @@ -87,20 +87,20 @@ public: class DeformableConvForwardImpl::AlgoMatmul final : public AlgoBase { private: - static void get_matmul_layout(const SizeArgs& args, TensorLayout& al, - TensorLayout& bl, TensorLayout& cl); static WorkspaceBundle get_bundle(const SizeArgs& args); public: - AlgoMatmul(){}; - bool is_available(const SizeArgs& args) const override; size_t get_workspace_in_bytes(const SizeArgs& args) const override; void exec(const ExecArgs& args) const override; bool is_reproducible() const override { return true; } - const char* name() const override { return "AlgoMatmul"; } + std::vector get_subopr_list( + const TensorLayoutArray& layouts, + const OperatorBase* opr) const override; + + const char* name() const override { return "MATMUL"; } MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) }; diff --git a/dnn/src/cuda/deformable_conv/fwd/algo_matmul.cpp b/dnn/src/cuda/deformable_conv/fwd/algo_matmul.cpp index ab209e685..d59fdb300 100644 --- a/dnn/src/cuda/deformable_conv/fwd/algo_matmul.cpp +++ b/dnn/src/cuda/deformable_conv/fwd/algo_matmul.cpp @@ -57,24 +57,47 @@ deformable_conv::Param create_param(const Algo::SizeArgs& args, return p; } + +std::pair sub_opr_config( + const DeformableConvForwardImpl::CanonizedFilterMeta& fm, + const TensorLayout& im, + const TensorLayout& dst) { + auto&& dt = im.dtype; + size_t batch_sz = im[0], OH = dst[2], + OW = dst[3], FH = fm.spatial[0], FW = fm.spatial[1]; + + size_t M = fm.ocpg, N = OH * OW * batch_sz, K = fm.icpg * FH * FW, + batch = fm.group; + TensorLayout al = {{batch, M, K}, dt}; + TensorLayout bl = {{batch, K, N}, dt}; + TensorLayout cl = {{batch, M, N}, dt}; + + BatchedMatrixMulForward::Param param; + param.compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + + return {{al, bl, cl}, param}; +} + }; // anonymous namespace -bool Algo::is_available(const SizeArgs&) const { - return true; +std::vector +Algo::get_subopr_list( + const TensorLayoutArray& layouts, const OperatorBase* opr) const { + const DeformableConvForwardImpl* deformable_conv = + static_cast(opr); + CanonizedFilterMeta fm = deformable_conv->make_canonized_filter_meta( + layouts[0].ndim, layouts[1], layouts[2]); + auto&& config = sub_opr_config(fm, layouts[0], layouts[4]); + + std::string param_str; + Algorithm::serialize_write_pod(config.second, param_str); + return {{Algorithm::OprType::BATCHED_MATRIX_MUL_FORWARD, param_str, + config.first}}; } -void Algo::get_matmul_layout(const SizeArgs& args, TensorLayout& al, - TensorLayout& bl, TensorLayout& cl) { - auto&& dt = args.im_layout.dtype; - auto&& fm = args.filter_meta; - size_t batch_sz = args.im_layout[0], OH = args.dst_layout[2], - OW = args.dst_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; - size_t M = fm.ocpg, N = OH * OW * batch_sz, K = fm.icpg * FH * FW, - batch = fm.group; - al = {{batch, M, K}, dt}; - bl = {{batch, K, N}, dt}; - cl = {{batch, M, N}, dt}; +bool Algo::is_available(const SizeArgs&) const { + return true; } WorkspaceBundle Algo::get_bundle(const SizeArgs& args) { @@ -83,17 +106,24 @@ WorkspaceBundle Algo::get_bundle(const SizeArgs& args) { OC = args.dst_layout[1], OH = args.dst_layout[2], OW = args.dst_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; - auto&& bmm_opr = args.handle->create_operator(); - TensorLayout al, bl, cl; + auto bmatmul_opr = args.handle->create_operator(); + if (args.opr->execution_policy().algo.valid() && + !args.opr->execution_policy().sub_policy.empty()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + bmatmul_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } - get_matmul_layout(args, al, bl, cl); - bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + auto&& config = + sub_opr_config(args.filter_meta, args.im_layout, args.dst_layout); + bmatmul_opr->param() = config.second; size_t col_ws = batch_sz * IC * FH * FW * OH * OW * sizeof(float); - size_t bmm_ws = bmm_opr->get_workspace_in_bytes(al, bl, cl); + size_t bmm_ws = bmatmul_opr->get_workspace_in_bytes( + config.first[0], config.first[1], config.first[2]); size_t result_ws = batch_sz * OC * OH * OW * sizeof(float); - return {nullptr, {col_ws, bmm_ws, result_ws}}; + return WorkspaceBundle{nullptr, {col_ws, bmm_ws, result_ws}}; } size_t Algo::get_workspace_in_bytes(const SizeArgs& args) const { @@ -123,18 +153,25 @@ void Algo::exec(const ExecArgs& args) const { // im2col deformable_conv::im2col(dev_im, dev_offset, dev_mask, static_cast(col_ws), p); - // matmul - TensorLayout al, bl, cl; - get_matmul_layout(args, al, bl, cl); - TensorND A(static_cast(dev_filter), al), - B(static_cast(col_ws), bl), - C(static_cast(result_ws), cl); + auto bmatmul_opr = args.handle->create_operator(); + if (args.opr->execution_policy().algo.valid()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + bmatmul_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } + + auto&& config = + sub_opr_config(args.filter_meta, args.im_layout, args.dst_layout); + bmatmul_opr->param() = config.second; + + // matmul + TensorND A(static_cast(dev_filter), config.first[0]), + B(static_cast(col_ws), config.first[1]), + C(static_cast(result_ws), config.first[2]); size_t bmm_ws_size = bundle.get_size(1); - auto&& bmm_opr = args.handle->create_operator(); - bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; - bmm_opr->exec( + bmatmul_opr->exec( A, B, C, Workspace(static_cast(bmm_ws), bmm_ws_size)); // relayout -- GitLab