diff --git a/dnn/src/cuda/convolution/forward/algos.cpp b/dnn/src/cuda/convolution/forward/algos.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3891bb7e13c04b2f986acd598b8fe2a03f23d7d6 --- /dev/null +++ b/dnn/src/cuda/convolution/forward/algos.cpp @@ -0,0 +1,172 @@ +/** + * \file dnn/src/cuda/convolution/forward/algos.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/convolution/forward/algos.h" +#include "src/cuda/conv_bias/opr_impl.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/common/algo_base.h" +#include "src/common/algo_chooser.h" + +using namespace megdnn; +using namespace cuda; + +namespace { +std::pair sub_opr_config( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, const ConvolutionForwardImpl* opr) { + auto conv_param = opr->param(); + DType bias_type; + if (src.dtype.enumv() == DTypeEnum::QuantizedS8) { + bias_type = dtype::QuantizedS32( + src.dtype.param().scale * + + filter.dtype.param().scale); + } else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) { + bias_type = dtype::QuantizedS32( + src.dtype.param().scale * + + filter.dtype.param().scale); + } else if (src.dtype.enumv() == DTypeEnum::Uint8 || + src.dtype.enumv() == DTypeEnum::Int8) { + bias_type = dtype::Int32{}; + } else if (src.dtype.enumv() == DTypeEnum::Quantized4Asymm) { + bias_type = dtype::QuantizedS32( + src.dtype.param().scale * + + filter.dtype.param().scale); + } else { + megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); + bias_type = src.dtype; + } + + std::pair ret; + ret.second = {param::ConvBias::NonlineMode::IDENTITY, + conv_param.mode, + 
conv_param.sparse, + conv_param.format, + conv_param.pad_h, + conv_param.pad_w, + conv_param.stride_h, + conv_param.stride_w, + conv_param.dilate_h, + conv_param.dilate_w, + conv_param.compute_mode}; + ret.first.push_back(TensorLayout({}, bias_type)); + ret.first.push_back(TensorLayout({}, dst.dtype)); + return ret; +} + +} // namespace + +ConvolutionForwardImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&algo_default); + + for (auto&& algo : all_algos) { + m_all_algos_map.emplace(algo->info().desc, algo); + } +} + +ConvolutionForwardImpl::AlgoPack ConvolutionForwardImpl::sm_algo_pack; + +MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionForwardImpl) + +ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvolutionForwardImpl* o, + const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) + : opr{o}, layout_src{&src}, layout_filter{&filter}, layout_dst{&dst} {} + +ConvolutionForwardImpl::AlgoBase::ExecArgs::ExecArgs( + ConvolutionForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in filter, _megdnn_tensor_out dst, + _megdnn_workspace workspace) + : SizeArgs(opr, src.layout, filter.layout, dst.layout), + tensor_src{src}, + tensor_filter{filter}, + tensor_dst{dst}, + workspace{workspace} {} + +std::string ConvolutionForwardImpl::AlgoBase::SizeArgs::to_string() const { + return megdnn_mangle(ssprintf("src=%s, filter=%s, dst=%s", + layout_src->to_string().c_str(), + layout_filter->to_string().c_str(), + layout_dst->to_string().c_str())); +} + +/* ===================== default algo ===================== */ +std::vector +ConvolutionForwardImpl::AlgoDefault::get_subopr_list( + const TensorLayoutArray& layouts, const OperatorBase* opr) const { + auto&& config = + sub_opr_config(layouts[0], layouts[1], layouts[2], + static_cast(opr)); + + TensorLayoutArray conv_bias_layouts = {layouts[0], layouts[1], + config.first[0], config.first[1], + layouts[2]}; + std::string param_str; + Algorithm::serialize_write_pod(config.second, param_str); + return 
{{Algorithm::OprType::CONVBIAS_FORWARD, param_str, + conv_bias_layouts}}; +} + +bool ConvolutionForwardImpl::AlgoDefault::is_available( + const SizeArgs& args) const { + auto conv_bias_opr = + args.opr->handle()->create_operator(); + auto&& config = sub_opr_config( + *args.layout_src, *args.layout_filter, *args.layout_dst, + args.opr); + conv_bias_opr->param() = config.second; + return get_algorithm(static_cast(conv_bias_opr.get()), + *args.layout_src, *args.layout_filter, config.first[0], + config.first[1], *args.layout_dst); +} + +size_t ConvolutionForwardImpl::AlgoDefault::get_workspace_in_bytes( + const SizeArgs& args) const { + auto conv_bias_opr = args.opr->handle()->create_operator(); + if (args.opr->execution_policy().algo.valid() && + !args.opr->execution_policy().sub_policy.empty()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + conv_bias_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } + + auto&& config = sub_opr_config( + *args.layout_src, *args.layout_filter, *args.layout_dst, + args.opr); + conv_bias_opr->param() = config.second; + return conv_bias_opr->get_workspace_in_bytes( + *args.layout_src, *args.layout_filter, config.first[0], + config.first[1], *args.layout_dst, nullptr); +} + +void ConvolutionForwardImpl::AlgoDefault::exec(const ExecArgs& args) const { + auto conv_bias_opr = args.opr->handle()->create_operator(); + /* mirror get_workspace_in_bytes(): use sub_policy only when present */ + const auto& policy = args.opr->execution_policy(); + if (policy.algo.valid() && !policy.sub_policy.empty()) { + megdnn_assert(policy.sub_policy.size() == 1); + conv_bias_opr->execution_policy() = policy.sub_policy[0]; + } + + auto&& config = sub_opr_config( + *args.layout_src, *args.layout_filter, *args.layout_dst, + args.opr); + conv_bias_opr->param() = config.second; + conv_bias_opr->exec(args.tensor_src, args.tensor_filter, + {nullptr, config.first[0]}, {nullptr, config.first[1]}, + args.tensor_dst, nullptr, args.workspace); +} + +// vim: syntax=cpp.doxygen diff --git 
a/dnn/src/cuda/convolution/forward/algos.h b/dnn/src/cuda/convolution/forward/algos.h new file mode 100644 index 0000000000000000000000000000000000000000..b06b6ee8a9528eaec2e691b4ac70003f59ac1305 --- /dev/null +++ b/dnn/src/cuda/convolution/forward/algos.h @@ -0,0 +1,111 @@ +/** + * \file dnn/src/cuda/convolution/forward/algos.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#pragma once +#include "megdnn/oprs.h" +#include "src/common/algo_base.h" +#include "src/common/metahelper.h" +#include "src/common/utils.h" +#include "src/cuda/convolution/opr_impl.h" + +#include + +namespace megdnn { +namespace cuda { + +/*! + * \brief base class for convolutionForward algos + * + */ +class ConvolutionForwardImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + enum class AlgoType : uint32_t { + CUDA_DEFAULT, + }; + using Mapper = std::unordered_map; + + AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; } + + struct SizeArgs { + ConvolutionForwardImpl* opr; + const TensorLayout *layout_src, *layout_filter, *layout_dst; + + std::string to_string() const; + SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src, + const TensorLayout& filter, const TensorLayout& dst); + }; + struct ExecArgs : public SizeArgs { + TensorND tensor_src, tensor_filter, tensor_dst; + Workspace workspace; + + ExecArgs(ConvolutionForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in filter, _megdnn_tensor_out dst, + _megdnn_workspace workspace); + }; + + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const 
ExecArgs&) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) const { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) const { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "convolution fwd algo %s: required workspace %zu bytes, " + "got %zu", + name(), req, workspace.size); + return *this; + } +}; + +class ConvolutionForwardImpl::AlgoDefault final : public AlgoBase { +public: + AlgoDefault() = default; + bool is_available(const SizeArgs&) const override; + size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override; + const char* name() const override { return "DEFAULT"; } + void exec(const ExecArgs&) const override; + bool is_reproducible() const override { return true; } + std::vector get_subopr_list( + const TensorLayoutArray& layouts, + const OperatorBase* opr) const override; + MEGDNN_DECL_ALGO_TYPE(CUDA_DEFAULT) +}; + +class ConvolutionForwardImpl::AlgoPack : NonCopyableObj { +private: + AlgoBase::Mapper m_all_algos_map; + +public: + AlgoPack(); + AlgoDefault algo_default; + std::vector all_algos; + + const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/opr_impl.cpp b/dnn/src/cuda/convolution/opr_impl.cpp index 288fdb34d7ad4e8a5d1b763caa23881e723ed95c..badbde22d90fa348bf2a5ea4e5f2a5d358d6afa0 100644 --- a/dnn/src/cuda/convolution/opr_impl.cpp +++ b/dnn/src/cuda/convolution/opr_impl.cpp @@ -12,6 +12,7 @@ #include "src/cuda/convolution/opr_impl.h" #include "megdnn/dtype.h" #include "src/cuda/convolution/helper.h" +#include 
"src/cuda/convolution/forward/algos.h" #include "src/cuda/convolution/backward_data/algo.h" #include "src/cuda/convolution/backward_filter/algo.h" #include "src/cuda/conv_bias/opr_impl.h" @@ -28,108 +29,34 @@ using namespace convolution; TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL) /* ============== ConvolutionForwardImpl ============== */ -ConvolutionForwardImpl::ConvBiasExtraData -ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src, - const TensorLayout& filter, - const TensorLayout& dst) { - auto conv_param = param(); - DType bias_type; - if (src.dtype.enumv() == DTypeEnum::QuantizedS8) { - bias_type = dtype::QuantizedS32( - src.dtype.param().scale * - - filter.dtype.param().scale); - } else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) { - bias_type = dtype::QuantizedS32( - src.dtype.param().scale * - - filter.dtype.param().scale); - } else if (src.dtype.enumv() == DTypeEnum::Uint8 || - src.dtype.enumv() == DTypeEnum::Int8) { - bias_type = dtype::Int32{}; - } else if (src.dtype.enumv() == DTypeEnum::Quantized4Asymm) { - bias_type = dtype::QuantizedS32( - src.dtype.param().scale * - - filter.dtype.param().scale); - } else { - megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); - bias_type = src.dtype; - } - ConvBiasExtraData ret = {this->handle()->create_operator(), - TensorLayout(bias_type), TensorLayout(dst.dtype)}; - ret.convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY, - conv_param.mode, - conv_param.sparse, - conv_param.format, - conv_param.pad_h, - conv_param.pad_w, - conv_param.stride_h, - conv_param.stride_w, - conv_param.dilate_h, - conv_param.dilate_w, - conv_param.compute_mode}; - ret.convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; - return ret; -} - ConvolutionForwardImpl::Algorithm* ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst, size_t workspace_limit_in_bytes, bool reproducible) { - auto 
extra_data = conv_bias_extra_data(src, filter, dst); - return static_cast(extra_data.convbias_opr.get()) - ->get_algorithm_heuristic(src, filter, extra_data.bias_layout, - extra_data.z_layout, dst, - workspace_limit_in_bytes, reproducible); -} - -ConvolutionForwardImpl::Algorithm* -ConvolutionForwardImpl::get_algorithm_from_desc( - const ConvolutionForward::AlgorithmDesc& desc) { - auto conv_param = param(); - auto convbias_opr = this->handle()->create_operator(); - convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY, - conv_param.mode, - conv_param.sparse, - conv_param.format, - conv_param.pad_h, - conv_param.pad_w, - conv_param.stride_h, - conv_param.stride_w, - conv_param.dilate_h, - conv_param.dilate_w, - conv_param.compute_mode}; - convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; - - return static_cast(convbias_opr.get()) - ->get_algorithm_from_desc(desc); + AlgoBase::SizeArgs args{this, src, filter, dst}; + MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes); + MEGDNN_MARK_USED_VAR(reproducible); + return &sm_algo_pack.algo_default; } std::vector ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst) { - auto extra_data = conv_bias_extra_data(src, filter, dst); - return static_cast(extra_data.convbias_opr.get()) - ->get_all_algorithms(src, filter, extra_data.bias_layout, - extra_data.z_layout, dst); + AlgoBase::SizeArgs args{this, src, filter, dst}; + return megdnn::get_all_algorithms(args); } size_t ConvolutionForwardImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) { - auto extra_data = conv_bias_extra_data(src, filter, dst); - return static_cast(extra_data.convbias_opr.get()) - ->get_workspace_in_bytes( - src, filter, extra_data.bias_layout, extra_data.z_layout, - dst, - reinterpret_cast::PreprocessedFilter*>( - preprocessed_filter)); + 
MEGDNN_MARK_USED_VAR(preprocessed_filter); + AlgoBase::SizeArgs args{this, src, filter, dst}; + return megdnn::get_algorithm(this, src, filter, dst) + ->get_workspace_in_bytes(args); } void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, @@ -137,20 +64,15 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { - auto extra_data = - conv_bias_extra_data(src.layout, filter.layout, dst.layout); - TensorND bias(nullptr, extra_data.bias_layout); - TensorND z(nullptr, extra_data.z_layout); - return static_cast(extra_data.convbias_opr.get()) - ->exec(src, filter, bias, z, dst, - reinterpret_cast::PreprocessedFilter*>( - preprocessed_filter), - workspace); + check_exec(src.layout, filter.layout, dst.layout, workspace.size, + preprocessed_filter); + AlgoBase::ExecArgs args(this, src, filter, dst, workspace); + auto&& algo = get_algorithm(this, src.layout, filter.layout, dst.layout); + algo->check_workspace(args, workspace).exec(args); } const char* ConvolutionForwardImpl::get_algorithm_set_name() const { - return "CUDACONV0+CUDNN" CUDNN_VERSION_STR; + return "CUDA CONVOLUTION_FORWARD" ; } /* ============== ConvolutionBackwardDataImpl ============== */ diff --git a/dnn/src/cuda/convolution/opr_impl.h b/dnn/src/cuda/convolution/opr_impl.h index 8610325c6628e0878f736e19bb9a539b5ad17868..1ca8db092b8e30de1ffc80520915dbde2c94339f 100644 --- a/dnn/src/cuda/convolution/opr_impl.h +++ b/dnn/src/cuda/convolution/opr_impl.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
*/ #pragma once @@ -16,58 +17,56 @@ namespace megdnn { namespace cuda { -class ConvolutionForwardImpl: public ConvolutionForward { - public: - using ConvolutionForward::ConvolutionForward; - void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, - _megdnn_tensor_out dst, - const PreprocessedFilter* preprocessed_filter, - _megdnn_workspace workspace) override; - - size_t get_workspace_in_bytes( - const TensorLayout& src, const TensorLayout& filter, - const TensorLayout& dst, - const PreprocessedFilter* preprocessed_filter) override; - const char* get_algorithm_set_name() const override; - - SmallVector deduce_preprocessed_filter_layout( - const TensorLayout&, const TensorLayout&, - const TensorLayout&) override { - return {}; - } - size_t get_preprocess_workspace_in_bytes( - const TensorLayout& , const TensorLayout& , - const TensorLayout& ) override{ - return 0; - } - void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, - const TensorLayout&, PreprocessedFilter*, - _megdnn_workspace) override { - megdnn_throw("cuda exec_preprocess has not implemeted yet"); - } - - Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override; - - protected: - struct ConvBiasExtraData{ - std::unique_ptr convbias_opr; - TensorLayout bias_layout; - TensorLayout z_layout; - }; - - std::vector get_all_algorithms( - const TensorLayout& src, const TensorLayout& filter, - const TensorLayout& dst) override; - Algorithm* get_algorithm_heuristic(const TensorLayout& src, - const TensorLayout& filter, - const TensorLayout& dst, - size_t workspace_limit_in_bytes, - bool reproducible) override; - - private: - ConvBiasExtraData conv_bias_extra_data(const TensorLayout&, - const TensorLayout&, - const TensorLayout&); +class ConvolutionForwardImpl : public ConvolutionForward { +public: + using ConvolutionForward::ConvolutionForward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + const PreprocessedFilter* preprocessed_filter, + 
_megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, + const PreprocessedFilter* preprocessed_filter) override; + const char* get_algorithm_set_name() const override; + + SmallVector deduce_preprocessed_filter_layout( + const TensorLayout&, const TensorLayout&, + const TensorLayout&) override { + return {}; + } + size_t get_preprocess_workspace_in_bytes(const TensorLayout&, + const TensorLayout&, + const TensorLayout&) override { + return 0; + } + void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, + const TensorLayout&, PreprocessedFilter*, + _megdnn_workspace) override { + megdnn_throw("cuda exec_preprocess has not implemeted yet"); + } + + Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override; + + class AlgoBase; + class AlgoDefault; + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +protected: + std::vector get_all_algorithms( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) override; + +private: + static AlgoPack sm_algo_pack; }; class ConvolutionBackwardDataImpl : public ConvolutionBackwardData { @@ -122,6 +121,7 @@ protected: const TensorLayout& grad, size_t workspace_limit_in_bytes, bool reproducible) override; + private: Algorithm* get_algorithm_heuristic(const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, @@ -141,12 +141,10 @@ public: size_t get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& diff, const TensorLayout& grad) override; - AlgorithmInfo get_algorithm_info_heuristic(const TensorLayout& src, - const TensorLayout& diff, - const TensorLayout& grad, - const CanonizedFilterMeta& grad_meta, - size_t workspace_limit_in_bytes, - bool 
reproducible) { + AlgorithmInfo get_algorithm_info_heuristic( + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad, const CanonizedFilterMeta& grad_meta, + size_t workspace_limit_in_bytes, bool reproducible) { return get_algorithm_heuristic(src, diff, grad, grad_meta, workspace_limit_in_bytes, reproducible) ->info(); @@ -162,7 +160,6 @@ public: ->info(); } - const char* get_algorithm_set_name() const override; class AlgoBase; @@ -187,6 +184,7 @@ protected: const TensorLayout& grad, size_t workspace_limit_in_bytes, bool reproducible) override; + private: Algorithm* get_algorithm_heuristic(const TensorLayout& src, const TensorLayout& diff, diff --git a/dnn/test/common/checker.h b/dnn/test/common/checker.h index 18eeb95ffe86dc4b8d9e4df4b8ec17626698d8ce..640b4ad6279da769638ae4fa446276b7e98ef949 100644 --- a/dnn/test/common/checker.h +++ b/dnn/test/common/checker.h @@ -532,6 +532,30 @@ private: bool* m_require_algo; }; +template +void construct_sub_execution_policy_heuristic(ExecutionPolicy& policy, + const TensorLayoutArray& layouts, + const std::string& param, + Handle* handle) { + megdnn_assert(layouts.size() == OprTrait::arity); + auto opr = handle->create_operator(); + opr->param() = Algorithm::deserialize_read_pod(param); + if (!policy.algo.valid()) { + policy.algo = AlgoProxy::arity>:: + get_algorithm_info_heuristic(opr.get(), layouts).desc; + } + + Algorithm* algo = opr->get_algorithm_from_desc(policy.algo); + std::vector&& sub_items = + algo->get_subopr_list(layouts, opr.get()); + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + policy.sub_policy.push_back(ExecutionPolicy{}); + construct_sub_execution_policy_heuristic<_Opr>( + policy.sub_policy.back(), _item.layouts, _item.param, + handle); + }); +} + } // namespace test } // namespace megdnn diff --git a/dnn/test/common/convolution.cpp b/dnn/test/common/convolution.cpp index 75cfaae377848755648b8685cbed9019ba1e994f..1a8ae04b3791a2a5d17f44d304aae8f1456d8f65 100644 --- 
a/dnn/test/common/convolution.cpp +++ b/dnn/test/common/convolution.cpp @@ -570,6 +570,8 @@ void convolution::test_conv_config_combinations(int k_size, .set_param(param); auto opr = checker.opr(); opr->param() = param; + std::string param_str; + Algorithm::serialize_write_pod(opr->param(), param_str); TensorLayout ily{ishp, inp_type}, fly{fshp, inp_type}, oly; oly.dtype = out_type; opr->deduce_layout(ily, fly, oly); @@ -581,10 +583,14 @@ void convolution::test_conv_config_combinations(int k_size, for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) { used_algos.insert(algo.desc); opr->execution_policy().algo = algo.desc; + + construct_sub_execution_policy_heuristic( + opr->execution_policy(), {ily, fly, oly}, param_str, + opr->handle()); checker .set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str())) .execs({ishp, fshp, {}}); - opr->execution_policy().algo.reset(); + opr->execution_policy() = {}; ASSERT_TRUE(checker.prev_succ()) << errmsg(algo.name.c_str()); } @@ -597,13 +603,19 @@ void convolution::test_conv_config_combinations(int k_size, auto opr = checker_bwd_data.opr(); opr->param() = param; + std::string param_str; + Algorithm::serialize_write_pod(opr->param(), param_str); for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) { used_algos_bwd_data.insert(algo.desc); opr->execution_policy().algo = algo.desc; + construct_sub_execution_policy_heuristic< + ConvolutionBackwardData>(opr->execution_policy(), + {fly, oly, ily}, param_str, + opr->handle()); checker_bwd_data .set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str())) .execl({fly, oly, ily}); - opr->execution_policy().algo.reset(); + opr->execution_policy() = {}; ASSERT_TRUE(checker_bwd_data.prev_succ()) << errmsg(algo.name.c_str()); } @@ -618,13 +630,19 @@ void convolution::test_conv_config_combinations(int k_size, auto opr = checker_bwd_filter.opr(); opr->param() = param; + std::string param_str; + Algorithm::serialize_write_pod(opr->param(), param_str); for (auto algo: 
opr->get_all_algorithms_info(ily, oly, fly)) { used_algos_bwd_flt.insert(algo.desc); opr->execution_policy().algo = algo.desc; + construct_sub_execution_policy_heuristic< + ConvolutionBackwardFilter>(opr->execution_policy(), + {ily, oly, fly}, param_str, + opr->handle()); checker_bwd_filter .set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str())) .execl({ily, oly, fly}); - opr->execution_policy().algo.reset(); + opr->execution_policy() = {}; ASSERT_TRUE(checker_bwd_filter.prev_succ()) << errmsg(algo.name.c_str()); } diff --git a/dnn/test/common/opr_proxy.h b/dnn/test/common/opr_proxy.h index 0aafd518a9675b1f14eebb7e9e8b7fdd9cc54b29..388c990f403e907db680f99642d0f7b159df832f 100644 --- a/dnn/test/common/opr_proxy.h +++ b/dnn/test/common/opr_proxy.h @@ -338,6 +338,7 @@ struct OprProxyProfilingBase FastRunCache& cache) { megdnn_assert(layouts.size() == arity); auto opr = handle->create_operator(); + opr->param() = Algorithm::deserialize_read_pod(param); SmallVector sizes_in_bytes; @@ -427,9 +428,9 @@ struct OprProxyProfilingBase auto&& search_items = flatten_search_space(layouts, param_str, opr->handle()); FOREACH_OPR_TYPE_DISPATCH(search_items, { - OprProxyProfilingBase<_Opr>::search(_item.layouts, param_str, W, - opr->handle(), warmup_times, - exec_times, cache); + OprProxyProfilingBase<_Opr>::search( + _item.layouts, _item.param, W, opr->handle(), + warmup_times, exec_times, cache); }); construct_execution_policy(layouts, param_str, opr->handle(), cache, diff --git a/dnn/test/cuda/chanwise_convolution.cpp b/dnn/test/cuda/chanwise_convolution.cpp index 8a41d72655b72d8e204fc1e2ef3ff1938113205e..3575c08ac2a5be423ba48c9414eb3e59c8afb7f4 100644 --- a/dnn/test/cuda/chanwise_convolution.cpp +++ b/dnn/test/cuda/chanwise_convolution.cpp @@ -273,10 +273,14 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) { Checker checker(handle_cuda()); bool require_algo = false; checker.set_before_exec_callback(AlgoChecker( - ConvBiasForward::algo_name( - "CHANNEL_WISE", {}) - .c_str(), + 
ExecutionPolicyAlgoName{ + "DEFAULT", + {{ConvBiasForward::algo_name( + "CHANNEL_WISE", {}) + .c_str(), + {}}}}, &require_algo)); + for (auto dtype : std::vector{dtype::Float32(), dtype::Float16()}) { checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype); if (dtype.enumv() == DTypeEnum::Float16) @@ -306,8 +310,12 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) { Checker checker(handle_cuda()); bool require_algo = false; checker.set_before_exec_callback(AlgoChecker( - ConvBiasForward::algo_name( - "CHANNEL_WISE_SMALL", {}).c_str(), + ExecutionPolicyAlgoName{ + "DEFAULT", + {{ConvBiasForward::algo_name( + "CHANNEL_WISE_SMALL", {}) + .c_str(), + {}}}}, &require_algo)); for (auto dtype : std::vector { dtype::Float32(), @@ -338,6 +346,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) { bool require_algo = false; checker.set_before_exec_callback(AlgoChecker( "CHANNEL_WISE", &require_algo)); + for (auto dtype : std::vector{dtype::Float32(), dtype::Float16()}) { checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype); if (dtype.enumv() == DTypeEnum::Float16) @@ -368,9 +377,8 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) { TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) { Checker checker(handle_cuda()); bool require_algo = false; - checker.set_before_exec_callback( - AlgoChecker( - "CHANNEL_WISE_SMALL", &require_algo)); + checker.set_before_exec_callback(AlgoChecker( + "CHANNEL_WISE_SMALL", &require_algo)); for (auto dtype : std::vector { dtype::Float32(), #if CUDA_VERSION >= 9000 @@ -396,10 +404,14 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) { Checker checker(handle_cuda()); bool require_algo = false; checker.set_before_exec_callback(AlgoChecker( - "CHANNEL_WISE", &require_algo)); + "CHANNEL_WISE", &require_algo)); UniformFloatRNG rng(-0.1, 0.1); for (auto dtype : std::vector{dtype::Float32(), dtype::Float16()}) { - checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype).set_rng(0, &rng).set_rng(1, &rng); 
+ checker.set_dtype(0, dtype) + .set_dtype(1, dtype) + .set_dtype(2, dtype) + .set_rng(0, &rng) + .set_rng(1, &rng); if (dtype.enumv() == DTypeEnum::Float16) checker.set_epsilon(2e-1); // simple case @@ -514,7 +526,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) { auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) { - checker.proxy()->target_execution_policy.algo.reset(); + checker.proxy()->target_execution_policy = {}; checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}}); }; @@ -614,7 +626,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { .set_dtype(2, dtype::Float32()) .set_rng(0, &rng) .set_rng(1, &rng); - bencher.proxy()->target_execution_policy.algo.reset(); + bencher.proxy()->target_execution_policy = {}; auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS; bencher.set_param(param) @@ -623,7 +635,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { .set_dtype(2, dtype::Float16()) .set_rng(0, &rng) .set_rng(1, &rng); - bencher.proxy()->target_execution_policy.algo.reset(); + bencher.proxy()->target_execution_policy = {}; auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS; bencher.proxy()->target_execution_policy.algo.reset(); @@ -677,10 +689,13 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) { CUBenchmarker bencher(handle_cuda()); size_t RUNS = 1; bencher.set_display(false).set_times(RUNS); - bencher.set_before_exec_callback(AlgoChecker( - ConvBiasForward::algo_name( - "CHANNEL_WISE", {}) - .c_str())); + bencher.set_before_exec_callback( + AlgoChecker(ExecutionPolicyAlgoName{ + "DEFAULT", + {{ConvBiasForward::algo_name( + "CHANNEL_WISE", {}) + .c_str(), + {}}}})); Convolution::Param param; param.format = ConvBias::Param::Format::NCHW; @@ -783,17 +798,24 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) { .set_dtype(2, dtype::Float32()) .set_rng(0, &rng) .set_rng(1, &rng) - .set_before_exec_callback(AlgoChecker( - ConvBiasForward::algo_name< - 
ConvBiasForward::DirectParam>("CHANNEL_WISE", - {}) - .c_str())); + .set_before_exec_callback( + AlgoChecker(ExecutionPolicyAlgoName{ + "DEFAULT", + {{ConvBiasForward::algo_name< + ConvBiasForward::DirectParam>( + "CHANNEL_WISE", {}) + .c_str(), + {}}}})); auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS; bencher.set_before_exec_callback(AlgoChecker( - ConvBiasForward::algo_name( - "CHANNEL_WISE", {}) - .c_str())); + ExecutionPolicyAlgoName{"DEFAULT", + {{ConvBiasForward::algo_name< + ConvBiasForward::DirectParam>( + "CHANNEL_WISE", {}) + .c_str(), + {}}}})); + auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS; bencher.set_param(param) diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp index e2a82ba59ced7520311ef10b14337906546c8d95..64da4422e8b08735b9e70baad9e7db9255416360 100644 --- a/dnn/test/cuda/convolution.cpp +++ b/dnn/test/cuda/convolution.cpp @@ -135,10 +135,13 @@ TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) { .set_rng(1, &int_rng) .set_param(param); - checker.set_before_exec_callback(AlgoChecker( - ConvBiasForward::algo_name( - "MATMUL8X8X32", {}) - .c_str())); + checker.set_before_exec_callback( + AlgoChecker(ExecutionPolicyAlgoName{ + "DEFAULT", + {{ConvBiasForward::algo_name( + "MATMUL8X8X32", {}) + .c_str(), + {}}}})); param.sparse = Convolution::Param::Sparse::DENSE; param.pad_h = param.pad_w = 1; diff --git a/dnn/test/cuda/dilated_convolution.cpp b/dnn/test/cuda/dilated_convolution.cpp index 505b90defd271cc0425c8e46888412eafc140e4f..14979bf687b9cef1ee8656db0272ccce369e2b70 100644 --- a/dnn/test/cuda/dilated_convolution.cpp +++ b/dnn/test/cuda/dilated_convolution.cpp @@ -30,19 +30,26 @@ TEST_F(CUDA, DILATED_CONVOLUTION_FORWARD) auto args = get_dilated_args(); Checker checker(handle_cuda()); #if CUDNN_VERSION >= 7500 - checker.set_before_exec_callback(AlgoChecker( - ConvBiasForward::algo_name( - "CUDNN:Convolution:CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_" - "PRECOMP_" - "GEMM" CUDNN_VERSION_STRING, 
- {}) - .c_str())); + checker.set_before_exec_callback( + AlgoChecker(ExecutionPolicyAlgoName{ + "DEFAULT", + {{ConvBiasForward::algo_name( + "CUDNN:Convolution:CUDNN_CONVOLUTION_FWD_ALGO_" + "IMPLICIT_" + "PRECOMP_" + "GEMM" CUDNN_VERSION_STRING, + {}) + .c_str(), + {}}}})); printf("cudnn version >= 7.5, use cudnn impl for dilated convolution\n"); #else - checker.set_before_exec_callback(AlgoChecker( - ConvBiasForward::algo_name("MATMUL", - {}) - .c_str())); + checker.set_before_exec_callback( + AlgoChecker(ExecutionPolicyAlgoName{ + "DEFAULT", + {{ConvBiasForward::algo_name( + "MATMUL", {}) + .c_str(), + {}}}})); #endif NormalRNG default_rng; for (auto &&arg: args) { diff --git a/dnn/test/cuda/group_conv.cpp b/dnn/test/cuda/group_conv.cpp index 881530e4208339a881f005a920cfe2405590d972..16f47b7f069007ff41891cc62d547daab1ca7e30 100644 --- a/dnn/test/cuda/group_conv.cpp +++ b/dnn/test/cuda/group_conv.cpp @@ -116,12 +116,17 @@ TEST_F(CUDA, GROUP_CONV_FORWARD_1x1) { std::string conv1x1_name = ConvBiasForward::algo_name( "MATMUL1X1", {}); - checker.set_before_exec_callback(AlgoChecker( - ConvBiasForward::algo_name( - ssprintf("%s:%s", "CUDA:GROUP_CONV", - conv1x1_name.c_str()), - {}) - .c_str())); + checker.set_before_exec_callback( + AlgoChecker(ExecutionPolicyAlgoName{ + "DEFAULT", + {{ConvBiasForward::algo_name< + ConvBiasForward::DirectParam>( + ssprintf("%s:%s", "CUDA:GROUP_CONV", + conv1x1_name.c_str()) + .c_str(), + {}) + .c_str(), + {}}}})); #endif Convolution::Param param; param.sparse = Convolution::Param::Sparse::GROUP; diff --git a/src/opr/impl/search_policy/algo_chooser.cpp b/src/opr/impl/search_policy/algo_chooser.cpp index c28001a3d3eac3714509377765baaa1f4e71af47..d12ff3196ce810729a695368ab37f84884ecca18 100644 --- a/src/opr/impl/search_policy/algo_chooser.cpp +++ b/src/opr/impl/search_policy/algo_chooser.cpp @@ -231,7 +231,7 @@ void AlgoChooser::profile(ExeContext& ctx, bool require_reproducible) { algo.name.c_str(), str_on_inp_shape.c_str()); 
ImplExecutionPolicy policy; policy.algo = algo.desc; - ctx.construct_execution_policy_from_cache(require_reproducible, policy); + ctx.construct_execution_policy(require_reproducible, policy); if (ctx.get_workspace_size_bytes(policy) >= workspace_limit) continue; @@ -302,7 +302,7 @@ AlgoChooser::choose_by_profile(ExeContext& ctx, bool require_reproducible, }); } typename AlgoChooser::ImplExecutionPolicy policy; - ctx.construct_execution_policy_from_cache(require_reproducible, policy); + ctx.construct_execution_policy(require_reproducible, policy); return policy; MIDOUT_E } @@ -324,6 +324,11 @@ size_t AlgoChooser::setup_algo(const FixedTensorLayouts& layouts, ImplExecutionPolicy policy; if (auto algo_choose_hook = mgb_opr->algo_chooser()) { policy = algo_choose_hook(mgb_opr); + ctx.construct_execution_policy( + mgb_opr->execution_policy().strategy == + mixin::AlgoChooserHelper::ExecutionPolicy::Strategy:: + HEURISTIC_REPRODUCIBLE, + policy, false); } if (!policy.algo.valid()) { policy = get_policy(ctx); @@ -520,13 +525,26 @@ AlgoChooser::ExeContext::get_all_candidates() const { } template -void AlgoChooser::ExeContext::construct_execution_policy_from_cache( +void AlgoChooser::ExeContext::construct_execution_policy( bool require_reproducible, - typename AlgoChooser::ImplExecutionPolicy& policy) const { + typename AlgoChooser::ImplExecutionPolicy& policy, + bool retrive_from_cache) const { if (!policy.algo.valid()) { - policy.algo = get_profile_result_from_cache(require_reproducible).desc; + if (retrive_from_cache) { + policy.algo = + get_profile_result_from_cache(require_reproducible).desc; + } else { + auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( + owner_graph(), m_cn, m_execution_policy.workspace_limit); + policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic( + args..., workspace_limit, + require_reproducible), + m_layouts) + .desc; + } mgb_assert(policy.algo.valid(), - "No cache found, maybe some error occured"); + "No algo found from 
cache or heuristic, maybe some error " + "occured"); } Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); @@ -544,8 +562,9 @@ void AlgoChooser::ExeContext::construct_execution_policy_from_cache( _item.param, m_base_mgb_opr, m_cn, m_execution_policy, m_allow_weight_preprocess); policy.sub_policy.push_back({}); - sub_ctx.construct_execution_policy_from_cache(require_reproducible, - policy.sub_policy.back()); + sub_ctx.construct_execution_policy(require_reproducible, + policy.sub_policy.back(), + retrive_from_cache); }); return; @@ -672,11 +691,11 @@ AlgoChooser::ExeContext::construct_fake_preprocess_filter() const { AlgoChooser::ExeContext::get_workspace_size_bytes( \ const typename AlgoChooser::ImplExecutionPolicy& \ policy) const; \ - template void AlgoChooser::ExeContext:: \ - construct_execution_policy_from_cache( \ - bool require_reproducible, \ - typename AlgoChooser::ImplExecutionPolicy& \ - policy) const; \ + template void \ + AlgoChooser::ExeContext::construct_execution_policy( \ + bool require_reproducible, \ + typename AlgoChooser::ImplExecutionPolicy& policy, \ + bool retrive_from_cache) const; \ template Maybe \ AlgoChooser::ExeContext::profile_single_algo( \ const typename AlgoChooser::ImplExecutionPolicy& \ diff --git a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h index eb4f390b3d63a4ba2995fc682113f3bcb36ed6b8..a619f9bcaee92f562485d4c3786961b26d461702 100644 --- a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h +++ b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h @@ -129,13 +129,16 @@ public: ImplAlgo get_profile_result_from_cache(bool require_reproducible) const; /** - * \brief construct execution policy from cache. + * \brief construct execution policy from cache or heuristic. 
* * \param require_reproducible select algo which is reproducible * \param policy execution policy + * \param retrive_from_cache retrieve the algo from cache if set to true, get + * from heuristic otherwise. */ - void construct_execution_policy_from_cache( - bool require_reproducible, ImplExecutionPolicy& policy) const; + void construct_execution_policy( + bool require_reproducible, ImplExecutionPolicy& policy, + bool retrive_from_cache = true) const; private: Maybe> construct_fake_preprocess_filter() const;