From 25089e520e6a012594452454a79945a8ed2752f2 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 26 Jan 2021 00:42:31 +0800 Subject: [PATCH] refactor(megdnn): refactor matmul algo in conv backward data GitOrigin-RevId: 8de601df6e7ddcdb48cd38d6ff05d132c1a014fe --- dnn/src/cuda/convolution/backward_data/algo.h | 4 + .../cuda/convolution/backward_data/matmul.cpp | 179 +++++++++++------- .../convolution/backward_filter/matmul.cpp | 8 +- dnn/src/cuda/convolution/helper.cpp | 4 +- dnn/src/cuda/convolution/helper.h | 3 +- dnn/test/cuda/convolution.cpp | 33 +++- 6 files changed, 159 insertions(+), 72 deletions(-) diff --git a/dnn/src/cuda/convolution/backward_data/algo.h b/dnn/src/cuda/convolution/backward_data/algo.h index e6a3f6b5b..1fa0e5082 100644 --- a/dnn/src/cuda/convolution/backward_data/algo.h +++ b/dnn/src/cuda/convolution/backward_data/algo.h @@ -141,6 +141,10 @@ public: size_t get_workspace_in_bytes(const SizeArgs& args) const override; void exec(const ExecArgs& args) const override; + std::vector get_subopr_list( + const TensorLayoutArray& layouts, + const OperatorBase* opr) const override; + const char* name() const override { return "MATMUL"; } bool is_reproducible() const override { return true; } MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) diff --git a/dnn/src/cuda/convolution/backward_data/matmul.cpp b/dnn/src/cuda/convolution/backward_data/matmul.cpp index 6fd374aa1..d2cfea0ad 100644 --- a/dnn/src/cuda/convolution/backward_data/matmul.cpp +++ b/dnn/src/cuda/convolution/backward_data/matmul.cpp @@ -6,42 +6,101 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "./algo.h" -#include "src/cuda/utils.h" #include "src/cuda/convolution/helper.h" #include "src/cuda/convolution/im2col.cuh" #include "src/cuda/matrix_mul/opr_impl.h" +#include "src/cuda/utils.h" using namespace megdnn; using namespace cuda; +namespace { +std::pair sub_opr_config( + const ConvolutionBackwardDataImpl::CanonizedFilterMeta& fm, + const TensorLayout& filter_layout, const TensorLayout& diff_layout, + const TensorLayout& grad_layout, + const ConvolutionBackwardDataImpl* opr) { + size_t N = grad_layout.shape[0], IC = fm.icpg, + OC = fm.ocpg, OH = diff_layout.shape[2], + OW = diff_layout.shape[3], FH = fm.spatial[0], + FW = fm.spatial[1]; + + megdnn_assert(filter_layout.dtype.enumv() == diff_layout.dtype.enumv()); + TensorLayout Al({OC, IC * FH * FW}, filter_layout.dtype), + Bl({IC * FH * FW, OH * OW * N}, filter_layout.dtype), + Cl({OC, OH * OW * N}, filter_layout.dtype); + MatrixMulForward::Param param; + if (opr->param().compute_mode == + param::Convolution::ComputeMode::FLOAT32) { + param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32; + } + + param.transposeA = true; + return {{Al, Cl, Bl}, param}; +} +} // namespace + +std::vector +ConvolutionBackwardDataImpl::AlgoMatmul::get_subopr_list( + const TensorLayoutArray& layouts, const OperatorBase* opr) const { + const ConvolutionBackwardDataImpl* conv_backward_data_opr = + static_cast(opr); + CanonizedFilterMeta fm = conv_backward_data_opr->check_layout_fwd( + layouts[2], layouts[0], layouts[1]); + auto&& config = sub_opr_config(fm, layouts[0], layouts[1], layouts[2], + conv_backward_data_opr); + + std::string param_str; + Algorithm::serialize_write_pod(config.second, param_str); + return {{Algorithm::OprType::MATRIX_MUL_FORWARD, param_str, + config.first}}; +} + bool ConvolutionBackwardDataImpl::AlgoMatmul::is_available( - const SizeArgs &args) const { + const SizeArgs& args) const { if (args.diff_layout->dtype == args.filter_layout->dtype && args.diff_layout->dtype == dtype::BFloat16()) { return false; } - auto &&fm = args.filter_meta; + auto&& fm = args.filter_meta; return args.filter_meta.format == Param::Format::NCHW && args.diff_layout->dtype.category() == DTypeCategory::FLOAT && fm.group == 1 && fm.spatial_ndim == 2; } size_t ConvolutionBackwardDataImpl::AlgoMatmul::get_workspace_in_bytes( - const SizeArgs &args) const { - return matmul_get_workspace_bundle( - args.as_fwd_args()).total_size_in_bytes(); + const SizeArgs& args) const { + auto matmul_opr = + args.handle->create_operator(); + if (args.opr->execution_policy().algo.valid() && + !args.opr->execution_policy().sub_policy.empty()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + matmul_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } + + auto&& config = + sub_opr_config(args.filter_meta, *args.filter_layout, + *args.diff_layout, *args.grad_layout, args.opr); + matmul_opr->param() = config.second; + + auto&& sizes = matmul_get_workspace_bundle(args.as_fwd_args()); + sizes.push_back(matmul_opr->get_workspace_in_bytes( + config.first[0], config.first[1], config.first[2])); + return WorkspaceBundle(nullptr, sizes).total_size_in_bytes(); } -void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs &args) const { -#define cb(DType) \ - if (args.diff_layout->dtype == DType()) { \ +void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs& args) const { +#define cb(DType) \ + if (args.diff_layout->dtype == DType()) { \ using ctype = typename DTypeTrait::ctype; \ - exec_internal(args); \ - return; \ + exec_internal(args); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) #undef cb @@ -49,77 +108,67 @@ void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs &args) const { megdnn_assert_internal(0); } -template +template void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal( - const ExecArgs &args) { - auto &&fm = args.filter_meta; - size_t N = args.grad_layout->shape[0], - IC = fm.icpg, - IH = args.grad_layout->shape[2], - IW = args.grad_layout->shape[3], - OC = fm.ocpg, - OH = args.diff_layout->shape[2], - OW = args.diff_layout->shape[3], - FH = fm.spatial[0], - FW = fm.spatial[1], - PH = fm.padding[0], - PW = fm.padding[1], - SH = fm.stride[0], - SW = fm.stride[1], - DH = fm.dilation[0], + const ExecArgs& args) { + auto&& fm = args.filter_meta; + size_t N = args.grad_layout->shape[0], IC = fm.icpg, + IH = args.grad_layout->shape[2], IW = args.grad_layout->shape[3], + OC = fm.ocpg, OH = args.diff_layout->shape[2], + OW = args.diff_layout->shape[3], FH = fm.spatial[0], + FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1], + SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0], DW = fm.dilation[1]; auto stream = cuda_stream(args.handle); - auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args()); - wbundle.set(args.workspace.raw_ptr); - T *diff_t = static_cast(wbundle.get(0)); - T *col = static_cast(wbundle.get(1)); + + auto matmul_opr = args.handle->create_operator(); + if (args.opr->execution_policy().algo.valid()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + matmul_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } + auto&& config = + sub_opr_config(args.filter_meta, *args.filter_layout, + *args.diff_layout, *args.grad_layout, args.opr); + matmul_opr->param() = config.second; + + auto&& sizes = matmul_get_workspace_bundle(args.as_fwd_args()); + sizes.push_back(matmul_opr->get_workspace_in_bytes( + config.first[0], config.first[1], config.first[2])); + auto wbundle = WorkspaceBundle(args.workspace.raw_ptr, sizes); + + T* diff_t = static_cast(wbundle.get(0)); + T* col = static_cast(wbundle.get(1)); { // transpose diff - TensorLayout froml({N, OC*OH*OW}, typename DTypeTrait::dtype()), - tol(froml); + TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait::dtype()), + tol(froml); froml.stride[0] = args.diff_layout->stride[0]; tol.stride[0] = 1; tol.stride[1] = N; - TensorND from(args.diff_tensor->ptr(), froml), - to(diff_t, tol); + TensorND from(args.diff_tensor->ptr(), froml), to(diff_t, tol); args.handle->relayout_opr()->exec(from, to); } { // take gemm grad - TensorLayout Al({OC, IC*FH*FW}, typename DTypeTrait::dtype()), - Bl({IC*FH*FW, OH*OW*N}, typename DTypeTrait::dtype()), - Cl({OC, OH*OW*N}, typename DTypeTrait::dtype()); - TensorND A(args.filter_tensor->ptr(), Al), - B(col, Bl), - C(diff_t, Cl); + TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait::dtype()), + Bl({IC * FH * FW, OH * OW * N}, + typename DTypeTrait::dtype()), + Cl({OC, OH * OW * N}, typename DTypeTrait::dtype()); + TensorND A(args.filter_tensor->ptr(), Al), B(col, Bl), C(diff_t, Cl); if (fm.should_flip) { convolution::flip_filter(args.as_fwd_args(), - wbundle.get_workspace(2), A.raw_ptr); - } - auto&& matmul_opr = args.handle->create_operator(); - if (args.opr->param().compute_mode == - param::Convolution::ComputeMode::FLOAT32) { - matmul_opr->param().compute_mode = - param::MatrixMul::ComputeMode::FLOAT32; + wbundle.get_workspace(2), A.raw_ptr); + matmul_opr->exec(A, C, B, wbundle.get_workspace(3)); + } else { + matmul_opr->exec(A, C, B, wbundle.get_workspace(2)); } - matmul_opr->param().transposeA = true; - megdnn_assert(matmul_opr->get_workspace_in_bytes(A.layout, C.layout, - B.layout) == 0_z, - "Assume matmul opr in algo MATMUL doesn't need extra " - "workspace"); - matmul_opr->exec(A, C, B, Workspace()); } { // col2im - convolution::col2im(col, args.grad_tensor->ptr(), - N, args.grad_layout->stride[0], - IC, IH, IW, - FH, FW, - OH, OW, - PH, PW, - SH, SW, - DH, DW, - stream); + convolution::col2im(col, args.grad_tensor->ptr(), N, + args.grad_layout->stride[0], IC, IH, IW, FH, FW, + OH, OW, PH, PW, SH, SW, DH, DW, stream); } } diff --git a/dnn/src/cuda/convolution/backward_filter/matmul.cpp b/dnn/src/cuda/convolution/backward_filter/matmul.cpp index 0ae5871a4..9ba38d195 100644 --- a/dnn/src/cuda/convolution/backward_filter/matmul.cpp +++ b/dnn/src/cuda/convolution/backward_filter/matmul.cpp @@ -31,8 +31,9 @@ bool ConvolutionBackwardFilterImpl::AlgoMatmul::is_available( size_t ConvolutionBackwardFilterImpl::AlgoMatmul::get_workspace_in_bytes( const SizeArgs &args) const { - return matmul_get_workspace_bundle( - args.as_fwd_args()).total_size_in_bytes(); + return WorkspaceBundle(nullptr, + matmul_get_workspace_bundle(args.as_fwd_args())) + .total_size_in_bytes(); } void ConvolutionBackwardFilterImpl::AlgoMatmul::exec( @@ -69,7 +70,8 @@ void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal( DH = fm.dilation[0], DW = fm.dilation[1]; auto stream = cuda_stream(args.handle); - auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args()); + auto wbundle = WorkspaceBundle( + nullptr, matmul_get_workspace_bundle(args.as_fwd_args())); wbundle.set(args.workspace.raw_ptr); T *diff_t = static_cast(wbundle.get(0)); T *col = static_cast(wbundle.get(1)); diff --git a/dnn/src/cuda/convolution/helper.cpp b/dnn/src/cuda/convolution/helper.cpp index 7c7280b55..2cfb99420 100644 --- a/dnn/src/cuda/convolution/helper.cpp +++ b/dnn/src/cuda/convolution/helper.cpp @@ -48,7 +48,7 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) { return supported; } -WorkspaceBundle convolution::matmul_get_workspace_bundle( +SmallVector convolution::matmul_get_workspace_bundle( const ForwardSizeArgs &args) { auto dtype = args.src_layout->dtype; auto &&fm = args.filter_meta; @@ -67,7 +67,7 @@ WorkspaceBundle convolution::matmul_get_workspace_bundle( if (args.filter_meta.should_flip) { sizes.push_back(dtype.size() * OC * IC * FH * FW); } - return {nullptr, std::move(sizes)}; + return sizes; } void convolution::flip_filter(const ForwardSizeArgs &args, diff --git a/dnn/src/cuda/convolution/helper.h b/dnn/src/cuda/convolution/helper.h index 0528f5e22..a8bc57c8e 100644 --- a/dnn/src/cuda/convolution/helper.h +++ b/dnn/src/cuda/convolution/helper.h @@ -34,7 +34,8 @@ namespace convolution { bool is_cudnn_supported(const ForwardSizeArgs &args); //! get workspace bundle for matmul algo - WorkspaceBundle matmul_get_workspace_bundle(const ForwardSizeArgs &args); + SmallVector matmul_get_workspace_bundle( + const ForwardSizeArgs& args); struct CUDNNForwardDescs { TensorDesc src_desc, dst_desc; diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp index 64da4422e..dae59a25a 100644 --- a/dnn/test/cuda/convolution.cpp +++ b/dnn/test/cuda/convolution.cpp @@ -230,7 +230,7 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) } checker.set_before_exec_callback(AlgoChecker( ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16", - {{"MATMUL", {}}}})); + {{"MATMUL", {{"CUBLAS", {}}}}}})); src.dtype = dst.dtype = filter.dtype = dtype::BFloat16(); arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; checker.set_rng(0, &rng) @@ -243,6 +243,37 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) } } +TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) +{ + using namespace convolution; + std::vector args = get_args_cuda_conv_bwd_data(); + Checker checker(handle_cuda()); + + checker.set_before_exec_callback(AlgoChecker( + ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}})); + NormalRNG default_rng; + for (auto &&arg: args) { + float scale = + 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); + UniformFloatRNG rng(scale, 2 * scale); + auto src = TensorLayout(arg.src, dtype::Float32()); + auto filter = TensorLayout(arg.filter, dtype::Float32()); + TensorLayout dst; + { + auto opr = handle_cuda()->create_operator(); + opr->param() = arg.param; + opr->deduce_layout(src, filter, dst); + } + src.dtype = dst.dtype = filter.dtype = dtype::Float32(); + checker.set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .exec(TensorLayoutArray{filter, dst, src}); + } +} + + TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) { // BRAIN-481 failed on architectures 7.0, remove the following if statement, -- GitLab