Commit 0d720653 authored by Megvii Engine Team

refactor(megdnn): add default algo for convolution forward

GitOrigin-RevId: a12a7d399ac9f365ca7770e9b8c50cd4e88cddce
Parent 659217ac
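In brief: the old path built a ConvBiasForward operator ad hoc inside every ConvolutionForwardImpl entry point; this commit moves that plumbing into a single DEFAULT algorithm that forwards the plain convolution to ConvBiasForward with an identity nonlinearity and empty bias/z tensors. A minimal sketch of the delegation, condensed from the code below (workspace and policy handling omitted; handle, conv_param and the layouts stand in for the real arguments):

// Sketch: how AlgoDefault drives ConvBiasForward (simplified, not verbatim).
auto conv_bias = handle->create_operator<ConvBiasForward>();
conv_bias->param() = {param::ConvBias::NonlineMode::IDENTITY,
                      conv_param.mode,     conv_param.sparse,
                      conv_param.format,   conv_param.pad_h,
                      conv_param.pad_w,    conv_param.stride_h,
                      conv_param.stride_w, conv_param.dilate_h,
                      conv_param.dilate_w, conv_param.compute_mode};
// bias and z are null TensorNDs whose layouts carry only the deduced dtypes.
conv_bias->exec(src, filter, {nullptr, bias_layout}, {nullptr, z_layout},
                dst, /*preprocessed_filter=*/nullptr, workspace);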
/**
* \file dnn/src/cuda/convolution/forward/algos.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/cuda/convolution/forward/algos.h"
#include "src/cuda/conv_bias/opr_impl.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/common/algo_base.h"
#include "src/common/algo_chooser.h"
using namespace megdnn;
using namespace cuda;
namespace {
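//! Derive the config for the ConvBiasForward sub-operator: pick a bias dtype
//! matching src/filter (quantized inputs get a rescaled QuantizedS32 bias,
//! plain integers get Int32, floats keep their dtype) and copy the
//! convolution param with an IDENTITY nonlinearity.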
std::pair<TensorLayoutArray, ConvBiasForward::Param> sub_opr_config(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, const ConvolutionForwardImpl* opr) {
auto conv_param = opr->param();
DType bias_type;
if (src.dtype.enumv() == DTypeEnum::QuantizedS8) {
bias_type = dtype::QuantizedS32(
src.dtype.param<dtype::QuantizedS8>().scale *
filter.dtype.param<dtype::QuantizedS8>().scale);
} else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) {
bias_type = dtype::QuantizedS32(
src.dtype.param<dtype::Quantized8Asymm>().scale *
filter.dtype.param<dtype::Quantized8Asymm>().scale);
} else if (src.dtype.enumv() == DTypeEnum::Uint8 ||
src.dtype.enumv() == DTypeEnum::Int8) {
bias_type = dtype::Int32{};
} else if (src.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
bias_type = dtype::QuantizedS32(
src.dtype.param<dtype::Quantized4Asymm>().scale *
filter.dtype.param<dtype::Quantized4Asymm>().scale);
} else {
megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT);
bias_type = src.dtype;
}
std::pair<TensorLayoutArray, ConvBiasForward::Param> ret;
ret.second = {param::ConvBias::NonlineMode::IDENTITY,
conv_param.mode,
conv_param.sparse,
conv_param.format,
conv_param.pad_h,
conv_param.pad_w,
conv_param.stride_h,
conv_param.stride_w,
conv_param.dilate_h,
conv_param.dilate_w,
conv_param.compute_mode};
ret.first.push_back(TensorLayout({}, bias_type));
ret.first.push_back(TensorLayout({}, dst.dtype));
return ret;
}
} // namespace
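// Register all forward algos (currently only DEFAULT) and build the
// desc -> algo map consumed by MEGDNN_DEF_GET_ALGO_FROM_DESC below.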
ConvolutionForwardImpl::AlgoPack::AlgoPack() {
all_algos.push_back(&algo_default);
for (auto&& algo : all_algos) {
m_all_algos_map.emplace(algo->info().desc, algo);
}
}
ConvolutionForwardImpl::AlgoPack ConvolutionForwardImpl::sm_algo_pack;
MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionForwardImpl)
ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvolutionForwardImpl* o,
const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst)
: opr{o}, layout_src{&src}, layout_filter{&filter}, layout_dst{&dst} {}
ConvolutionForwardImpl::AlgoBase::ExecArgs::ExecArgs(
ConvolutionForwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in filter, _megdnn_tensor_out dst,
_megdnn_workspace workspace)
: SizeArgs(opr, src.layout, filter.layout, dst.layout),
tensor_src{src},
tensor_filter{filter},
tensor_dst{dst},
workspace{workspace} {}
std::string ConvolutionForwardImpl::AlgoBase::SizeArgs::to_string() const {
return megdnn_mangle(ssprintf("src=%s, filter=%s, dst=%s",
layout_src->to_string().c_str(),
layout_filter->to_string().c_str(),
layout_dst->to_string().c_str()));
}
/* ===================== default algo ===================== */
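// AlgoDefault owns no kernel of its own: it forwards the plain convolution to
// a ConvBiasForward sub-operator (identity nonlinearity, null bias/z), and
// get_subopr_list() exposes that sub-operator so the dispatcher can search
// and profile it recursively.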
std::vector<Algorithm::SearchItem>
ConvolutionForwardImpl::AlgoDefault::get_subopr_list(
const TensorLayoutArray& layouts, const OperatorBase* opr) const {
auto&& config =
sub_opr_config(layouts[0], layouts[1], layouts[2],
static_cast<const ConvolutionForwardImpl*>(opr));
TensorLayoutArray conv_bias_layouts = {layouts[0], layouts[1],
config.first[0], config.first[1],
layouts[2]};
std::string param_str;
Algorithm::serialize_write_pod(config.second, param_str);
return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str,
conv_bias_layouts}};
}
bool ConvolutionForwardImpl::AlgoDefault::is_available(
const SizeArgs& args) const {
auto conv_bias_opr =
args.opr->handle()->create_operator<ConvBiasForward>();
auto&& config = sub_opr_config(
*args.layout_src, *args.layout_filter, *args.layout_dst,
args.opr);
conv_bias_opr->param() = config.second;
return get_algorithm(static_cast<ConvBiasForwardImpl*>(conv_bias_opr.get()),
*args.layout_src, *args.layout_filter, config.first[0],
config.first[1], *args.layout_dst);
}
size_t ConvolutionForwardImpl::AlgoDefault::get_workspace_in_bytes(
const SizeArgs& args) const {
auto conv_bias_opr = args.opr->handle()->create_operator<ConvBiasForward>();
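// If the caller pinned an execution policy with a sub-policy, forward it to
// the ConvBias sub-operator so the reported workspace matches the algo that
// will actually run.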
if (args.opr->execution_policy().algo.valid() &&
!args.opr->execution_policy().sub_policy.empty()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
conv_bias_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
auto&& config = sub_opr_config(
*args.layout_src, *args.layout_filter, *args.layout_dst,
args.opr);
conv_bias_opr->param() = config.second;
return conv_bias_opr->get_workspace_in_bytes(
*args.layout_src, *args.layout_filter, config.first[0],
config.first[1], *args.layout_dst, nullptr);
}
void ConvolutionForwardImpl::AlgoDefault::exec(const ExecArgs& args) const {
auto conv_bias_opr = args.opr->handle()->create_operator<ConvBiasForward>();
if (args.opr->execution_policy().algo.valid()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
conv_bias_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
auto&& config = sub_opr_config(
*args.layout_src, *args.layout_filter, *args.layout_dst,
args.opr);
conv_bias_opr->param() = config.second;
conv_bias_opr->exec(args.tensor_src, args.tensor_filter,
{nullptr, config.first[0]}, {nullptr, config.first[1]},
args.tensor_dst, nullptr, args.workspace);
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/cuda/convolution/forward/algos.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
#include "src/common/utils.h"
#include "src/cuda/convolution/opr_impl.h"
#include <unordered_map>
namespace megdnn {
namespace cuda {
/*!
* \brief base class for ConvolutionForward algos
*/
class ConvolutionForwardImpl::AlgoBase : public Algorithm {
protected:
~AlgoBase() = default;
public:
enum class AlgoType : uint32_t {
CUDA_DEFAULT,
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
struct SizeArgs {
ConvolutionForwardImpl* opr;
const TensorLayout *layout_src, *layout_filter, *layout_dst;
std::string to_string() const;
SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src,
const TensorLayout& filter, const TensorLayout& dst);
};
struct ExecArgs : public SizeArgs {
TensorND tensor_src, tensor_filter, tensor_dst;
Workspace workspace;
ExecArgs(ConvolutionForwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in filter, _megdnn_tensor_out dst,
_megdnn_workspace workspace);
};
virtual bool is_available(const SizeArgs& args) const = 0;
virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
virtual void exec(const ExecArgs&) const = 0;
bool is_available_wk(const SizeArgs& args, size_t limit) const {
return is_available(args) && get_workspace_in_bytes(args) <= limit;
}
bool is_available_reproducible(
const SizeArgs& args, bool reproducible = true,
size_t limit = std::numeric_limits<size_t>::max()) const {
return (!reproducible || is_reproducible()) &&
is_available_wk(args, limit);
}
AlgoBase& check_workspace(const SizeArgs& args,
const Workspace& workspace) {
auto req = get_workspace_in_bytes(args);
megdnn_assert(req <= workspace.size,
"convolution fwd algo %s: required workspace %zu bytes, "
"got %zu",
name(), req, workspace.size);
return *this;
}
};
class ConvolutionForwardImpl::AlgoDefault final : public AlgoBase {
public:
AlgoDefault() = default;
bool is_available(const SizeArgs&) const override;
size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override;
const char* name() const override { return "DEFAULT"; }
void exec(const ExecArgs&) const override;
bool is_reproducible() const override { return true; }
std::vector<SearchItem> get_subopr_list(
const TensorLayoutArray& layouts,
const OperatorBase* opr) const override;
MEGDNN_DECL_ALGO_TYPE(CUDA_DEFAULT)
};
class ConvolutionForwardImpl::AlgoPack : NonCopyableObj {
private:
AlgoBase::Mapper m_all_algos_map;
public:
AlgoPack();
AlgoDefault algo_default;
std::vector<AlgoBase*> all_algos;
const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -12,6 +12,7 @@
#include "src/cuda/convolution/opr_impl.h"
#include "megdnn/dtype.h"
#include "src/cuda/convolution/helper.h"
#include "src/cuda/convolution/forward/algos.h"
#include "src/cuda/convolution/backward_data/algo.h"
#include "src/cuda/convolution/backward_filter/algo.h"
#include "src/cuda/conv_bias/opr_impl.h"
......@@ -28,108 +29,34 @@ using namespace convolution;
TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL)
/* ============== ConvolutionForwardImpl ============== */
ConvolutionForwardImpl::ConvBiasExtraData
ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) {
auto conv_param = param();
DType bias_type;
if (src.dtype.enumv() == DTypeEnum::QuantizedS8) {
bias_type = dtype::QuantizedS32(
src.dtype.param<dtype::QuantizedS8>().scale *
filter.dtype.param<dtype::QuantizedS8>().scale);
} else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) {
bias_type = dtype::QuantizedS32(
src.dtype.param<dtype::Quantized8Asymm>().scale *
filter.dtype.param<dtype::Quantized8Asymm>().scale);
} else if (src.dtype.enumv() == DTypeEnum::Uint8 ||
src.dtype.enumv() == DTypeEnum::Int8) {
bias_type = dtype::Int32{};
} else if (src.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
bias_type = dtype::QuantizedS32(
src.dtype.param<dtype::Quantized4Asymm>().scale *
filter.dtype.param<dtype::Quantized4Asymm>().scale);
} else {
megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT);
bias_type = src.dtype;
}
ConvBiasExtraData ret = {this->handle()->create_operator<ConvBiasForward>(),
TensorLayout(bias_type), TensorLayout(dst.dtype)};
ret.convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY,
conv_param.mode,
conv_param.sparse,
conv_param.format,
conv_param.pad_h,
conv_param.pad_w,
conv_param.stride_h,
conv_param.stride_w,
conv_param.dilate_h,
conv_param.dilate_w,
conv_param.compute_mode};
ret.convbias_opr->execution_policy() = {this->execution_policy().algo, {}};
return ret;
}
ConvolutionForwardImpl::Algorithm*
ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst,
size_t workspace_limit_in_bytes,
bool reproducible) {
auto extra_data = conv_bias_extra_data(src, filter, dst);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->get_algorithm_heuristic(src, filter, extra_data.bias_layout,
extra_data.z_layout, dst,
workspace_limit_in_bytes, reproducible);
}
ConvolutionForwardImpl::Algorithm*
ConvolutionForwardImpl::get_algorithm_from_desc(
const ConvolutionForward::AlgorithmDesc& desc) {
auto conv_param = param();
auto convbias_opr = this->handle()->create_operator<ConvBiasForward>();
convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY,
conv_param.mode,
conv_param.sparse,
conv_param.format,
conv_param.pad_h,
conv_param.pad_w,
conv_param.stride_h,
conv_param.stride_w,
conv_param.dilate_h,
conv_param.dilate_w,
conv_param.compute_mode};
convbias_opr->execution_policy() = {this->execution_policy().algo, {}};
return static_cast<ConvBiasForwardImpl*>(convbias_opr.get())
->get_algorithm_from_desc(desc);
AlgoBase::SizeArgs args{this, src, filter, dst};
MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
MEGDNN_MARK_USED_VAR(reproducible);
return &sm_algo_pack.algo_default;
}
std::vector<ConvolutionForwardImpl::Algorithm*>
ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) {
auto extra_data = conv_bias_extra_data(src, filter, dst);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->get_all_algorithms(src, filter, extra_data.bias_layout,
extra_data.z_layout, dst);
AlgoBase::SizeArgs args{this, src, filter, dst};
return megdnn::get_all_algorithms<ConvolutionForwardImpl>(args);
}
size_t ConvolutionForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto extra_data = conv_bias_extra_data(src, filter, dst);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->get_workspace_in_bytes(
src, filter, extra_data.bias_layout, extra_data.z_layout,
dst,
reinterpret_cast<const ConvolutionBase<
param::ConvBias>::PreprocessedFilter*>(
preprocessed_filter));
MEGDNN_MARK_USED_VAR(preprocessed_filter);
AlgoBase::SizeArgs args{this, src, filter, dst};
return megdnn::get_algorithm(this, src, filter, dst)
->get_workspace_in_bytes(args);
}
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
......@@ -137,20 +64,15 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
auto extra_data =
conv_bias_extra_data(src.layout, filter.layout, dst.layout);
TensorND bias(nullptr, extra_data.bias_layout);
TensorND z(nullptr, extra_data.z_layout);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->exec(src, filter, bias, z, dst,
reinterpret_cast<const ConvolutionBase<
param::ConvBias>::PreprocessedFilter*>(
preprocessed_filter),
workspace);
check_exec(src.layout, filter.layout, dst.layout, workspace.size,
preprocessed_filter);
AlgoBase::ExecArgs args(this, src, filter, dst, workspace);
auto&& algo = get_algorithm(this, src.layout, filter.layout, dst.layout);
algo->check_workspace(args, workspace).exec(args);
}
const char* ConvolutionForwardImpl::get_algorithm_set_name() const {
return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
return "CUDA CONVOLUTION_FORWARD" ;
}
/* ============== ConvolutionBackwardDataImpl ============== */
......
......@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
......@@ -16,58 +17,56 @@
namespace megdnn {
namespace cuda {
class ConvolutionForwardImpl: public ConvolutionForward {
public:
using ConvolutionForward::ConvolutionForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) override;
const char* get_algorithm_set_name() const override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override {
return {};
}
size_t get_preprocess_workspace_in_bytes(
const TensorLayout& , const TensorLayout& ,
const TensorLayout& ) override{
return 0;
}
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override {
megdnn_throw("cuda exec_preprocess has not implemeted yet");
}
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
protected:
struct ConvBiasExtraData{
std::unique_ptr<ConvBiasForward> convbias_opr;
TensorLayout bias_layout;
TensorLayout z_layout;
};
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) override;
Algorithm* get_algorithm_heuristic(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst,
size_t workspace_limit_in_bytes,
bool reproducible) override;
private:
ConvBiasExtraData conv_bias_extra_data(const TensorLayout&,
const TensorLayout&,
const TensorLayout&);
class ConvolutionForwardImpl : public ConvolutionForward {
public:
using ConvolutionForward::ConvolutionForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) override;
const char* get_algorithm_set_name() const override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override {
return {};
}
size_t get_preprocess_workspace_in_bytes(const TensorLayout&,
const TensorLayout&,
const TensorLayout&) override {
return 0;
}
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override {
megdnn_throw("cuda exec_preprocess has not implemeted yet");
}
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
class AlgoBase;
class AlgoDefault;
class AlgoPack;
static const AlgoPack& algo_pack() { return sm_algo_pack; }
protected:
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) override;
Algorithm* get_algorithm_heuristic(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst,
size_t workspace_limit_in_bytes,
bool reproducible) override;
private:
static AlgoPack sm_algo_pack;
};
class ConvolutionBackwardDataImpl : public ConvolutionBackwardData {
......@@ -122,6 +121,7 @@ protected:
const TensorLayout& grad,
size_t workspace_limit_in_bytes,
bool reproducible) override;
private:
Algorithm* get_algorithm_heuristic(const TensorLayout& filter,
const CanonizedFilterMeta& filter_meta,
......@@ -141,12 +141,10 @@ public:
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& diff,
const TensorLayout& grad) override;
AlgorithmInfo get_algorithm_info_heuristic(const TensorLayout& src,
const TensorLayout& diff,
const TensorLayout& grad,
const CanonizedFilterMeta& grad_meta,
size_t workspace_limit_in_bytes,
bool reproducible) {
AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& src, const TensorLayout& diff,
const TensorLayout& grad, const CanonizedFilterMeta& grad_meta,
size_t workspace_limit_in_bytes, bool reproducible) {
return get_algorithm_heuristic(src, diff, grad, grad_meta,
workspace_limit_in_bytes, reproducible)
->info();
......@@ -162,7 +160,6 @@ public:
->info();
}
const char* get_algorithm_set_name() const override;
class AlgoBase;
......@@ -187,6 +184,7 @@ protected:
const TensorLayout& grad,
size_t workspace_limit_in_bytes,
bool reproducible) override;
private:
Algorithm* get_algorithm_heuristic(const TensorLayout& src,
const TensorLayout& diff,
......
......@@ -532,6 +532,30 @@ private:
bool* m_require_algo;
};
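//! Recursively build an execution policy for \p Opr by heuristic: choose an
//! algo for the current operator if none is set, then descend into every
//! sub-opr reported by Algorithm::get_subopr_list.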
template <typename Opr>
void construct_sub_execution_policy_heuristic(ExecutionPolicy& policy,
const TensorLayoutArray& layouts,
const std::string& param,
Handle* handle) {
megdnn_assert(layouts.size() == OprTrait<Opr>::arity);
auto opr = handle->create_operator<Opr>();
opr->param() = Algorithm::deserialize_read_pod<typename Opr::Param>(param);
if (!policy.algo.valid()) {
policy.algo = AlgoProxy<Opr, OprTrait<Opr>::arity>::
get_algorithm_info_heuristic(opr.get(), layouts).desc;
}
Algorithm* algo = opr->get_algorithm_from_desc(policy.algo);
std::vector<Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(layouts, opr.get());
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
policy.sub_policy.push_back(ExecutionPolicy{});
construct_sub_execution_policy_heuristic<_Opr>(
policy.sub_policy.back(), _item.layouts, _item.param,
handle);
});
}
} // namespace test
} // namespace megdnn
......
......@@ -570,6 +570,8 @@ void convolution::test_conv_config_combinations(int k_size,
.set_param(param);
auto opr = checker.opr();
opr->param() = param;
std::string param_str;
Algorithm::serialize_write_pod(opr->param(), param_str);
TensorLayout ily{ishp, inp_type}, fly{fshp, inp_type}, oly;
oly.dtype = out_type;
opr->deduce_layout(ily, fly, oly);
......@@ -581,10 +583,14 @@ void convolution::test_conv_config_combinations(int k_size,
for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) {
used_algos.insert(algo.desc);
opr->execution_policy().algo = algo.desc;
construct_sub_execution_policy_heuristic<ConvolutionForward>(
opr->execution_policy(), {ily, fly, oly}, param_str,
opr->handle());
checker
.set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str()))
.execs({ishp, fshp, {}});
opr->execution_policy().algo.reset();
opr->execution_policy() = {};
ASSERT_TRUE(checker.prev_succ()) << errmsg(algo.name.c_str());
}
......@@ -597,13 +603,19 @@ void convolution::test_conv_config_combinations(int k_size,
auto opr = checker_bwd_data.opr();
opr->param() = param;
std::string param_str;
Algorithm::serialize_write_pod(opr->param(), param_str);
for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) {
used_algos_bwd_data.insert(algo.desc);
opr->execution_policy().algo = algo.desc;
construct_sub_execution_policy_heuristic<
ConvolutionBackwardData>(opr->execution_policy(),
{fly, oly, ily}, param_str,
opr->handle());
checker_bwd_data
.set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str()))
.execl({fly, oly, ily});
opr->execution_policy().algo.reset();
opr->execution_policy() = {};
ASSERT_TRUE(checker_bwd_data.prev_succ()) <<
errmsg(algo.name.c_str());
}
......@@ -618,13 +630,19 @@ void convolution::test_conv_config_combinations(int k_size,
auto opr = checker_bwd_filter.opr();
opr->param() = param;
std::string param_str;
Algorithm::serialize_write_pod(opr->param(), param_str);
for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) {
used_algos_bwd_flt.insert(algo.desc);
opr->execution_policy().algo = algo.desc;
construct_sub_execution_policy_heuristic<
ConvolutionBackwardFilter>(opr->execution_policy(),
{ily, oly, fly}, param_str,
opr->handle());
checker_bwd_filter
.set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str()))
.execl({ily, oly, fly});
opr->execution_policy().algo.reset();
opr->execution_policy() = {};
ASSERT_TRUE(checker_bwd_filter.prev_succ()) <<
errmsg(algo.name.c_str());
}
......
......@@ -338,6 +338,7 @@ struct OprProxyProfilingBase
FastRunCache& cache) {
megdnn_assert(layouts.size() == arity);
auto opr = handle->create_operator<Opr>();
opr->param() =
Algorithm::deserialize_read_pod<typename Opr::Param>(param);
SmallVector<size_t> sizes_in_bytes;
......@@ -427,9 +428,9 @@ struct OprProxyProfilingBase
auto&& search_items =
flatten_search_space(layouts, param_str, opr->handle());
FOREACH_OPR_TYPE_DISPATCH(search_items, {
OprProxyProfilingBase<_Opr>::search(_item.layouts, param_str, W,
opr->handle(), warmup_times,
exec_times, cache);
OprProxyProfilingBase<_Opr>::search(
_item.layouts, _item.param, W, opr->handle(),
warmup_times, exec_times, cache);
});
construct_execution_policy(layouts, param_str, opr->handle(), cache,
......
......@@ -273,10 +273,14 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) {
Checker<Convolution> checker(handle_cuda());
bool require_algo = false;
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"CHANNEL_WISE", {})
.c_str(),
ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"CHANNEL_WISE", {})
.c_str(),
{}}}},
&require_algo));
for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
if (dtype.enumv() == DTypeEnum::Float16)
......@@ -306,8 +310,12 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) {
Checker<Convolution> checker(handle_cuda());
bool require_algo = false;
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"CHANNEL_WISE_SMALL", {}).c_str(),
ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"CHANNEL_WISE_SMALL", {})
.c_str(),
{}}}},
&require_algo));
for (auto dtype : std::vector<DType> {
dtype::Float32(),
......@@ -338,6 +346,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
bool require_algo = false;
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
"CHANNEL_WISE", &require_algo));
for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
if (dtype.enumv() == DTypeEnum::Float16)
......@@ -368,9 +377,8 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) {
Checker<ConvolutionBackwardData> checker(handle_cuda());
bool require_algo = false;
checker.set_before_exec_callback(
AlgoChecker<ConvolutionBackwardData>(
"CHANNEL_WISE_SMALL", &require_algo));
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
"CHANNEL_WISE_SMALL", &require_algo));
for (auto dtype : std::vector<DType> {
dtype::Float32(),
#if CUDA_VERSION >= 9000
......@@ -396,10 +404,14 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
Checker<ConvolutionBackwardFilter> checker(handle_cuda());
bool require_algo = false;
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
"CHANNEL_WISE", &require_algo));
"CHANNEL_WISE", &require_algo));
UniformFloatRNG rng(-0.1, 0.1);
for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype).set_rng(0, &rng).set_rng(1, &rng);
checker.set_dtype(0, dtype)
.set_dtype(1, dtype)
.set_dtype(2, dtype)
.set_rng(0, &rng)
.set_rng(1, &rng);
if (dtype.enumv() == DTypeEnum::Float16)
checker.set_epsilon(2e-1);
// simple case
......@@ -514,7 +526,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
size_t FW) {
checker.proxy()->target_execution_policy.algo.reset();
checker.proxy()->target_execution_policy = {};
checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
};
......@@ -614,7 +626,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy.algo.reset();
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
bencher.set_param(param)
......@@ -623,7 +635,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
.set_dtype(2, dtype::Float16())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy.algo.reset();
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
bencher.proxy()->target_execution_policy.algo.reset();
......@@ -677,10 +689,13 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) {
CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
size_t RUNS = 1;
bencher.set_display(false).set_times(RUNS);
bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"CHANNEL_WISE", {})
.c_str()));
bencher.set_before_exec_callback(
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"CHANNEL_WISE", {})
.c_str(),
{}}}}));
Convolution::Param param;
param.format = ConvBias::Param::Format::NCHW;
......@@ -783,17 +798,24 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng)
.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
ConvBiasForward::algo_name<
ConvBiasForward::DirectParam>("CHANNEL_WISE",
{})
.c_str()));
.set_before_exec_callback(
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<
ConvBiasForward::DirectParam>(
"CHANNEL_WISE", {})
.c_str(),
{}}}}));
auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS;
bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"CHANNEL_WISE", {})
.c_str()));
ExecutionPolicyAlgoName{"DEFAULT",
{{ConvBiasForward::algo_name<
ConvBiasForward::DirectParam>(
"CHANNEL_WISE", {})
.c_str(),
{}}}}));
auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS;
bencher.set_param(param)
......
......@@ -135,10 +135,13 @@ TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
.set_rng(1, &int_rng)
.set_param(param);
checker.set_before_exec_callback(AlgoChecker<Convolution>(
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
"MATMUL8X8X32", {})
.c_str()));
checker.set_before_exec_callback(
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
"MATMUL8X8X32", {})
.c_str(),
{}}}}));
param.sparse = Convolution::Param::Sparse::DENSE;
param.pad_h = param.pad_w = 1;
......
......@@ -30,19 +30,26 @@ TEST_F(CUDA, DILATED_CONVOLUTION_FORWARD)
auto args = get_dilated_args();
Checker<ConvolutionForward> checker(handle_cuda());
#if CUDNN_VERSION >= 7500
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
"CUDNN:Convolution:CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_"
"PRECOMP_"
"GEMM" CUDNN_VERSION_STRING,
{})
.c_str()));
checker.set_before_exec_callback(
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
"CUDNN:Convolution:CUDNN_CONVOLUTION_FWD_ALGO_"
"IMPLICIT_"
"PRECOMP_"
"GEMM" CUDNN_VERSION_STRING,
{})
.c_str(),
{}}}}));
printf("cudnn version >= 7.5, use cudnn impl for dilated convolution\n");
#else
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>("MATMUL",
{})
.c_str()));
checker.set_before_exec_callback(
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
"MATMUL", {})
.c_str(),
{}}}}));
#endif
NormalRNG default_rng;
for (auto &&arg: args) {
......
......@@ -116,12 +116,17 @@ TEST_F(CUDA, GROUP_CONV_FORWARD_1x1) {
std::string conv1x1_name =
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
"MATMUL1X1", {});
checker.set_before_exec_callback(AlgoChecker<Convolution>(
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
ssprintf("%s:%s", "CUDA:GROUP_CONV",
conv1x1_name.c_str()),
{})
.c_str()));
checker.set_before_exec_callback(
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<
ConvBiasForward::DirectParam>(
ssprintf("%s:%s", "CUDA:GROUP_CONV",
conv1x1_name.c_str())
.c_str(),
{})
.c_str(),
{}}}}));
#endif
Convolution::Param param;
param.sparse = Convolution::Param::Sparse::GROUP;
......
......@@ -231,7 +231,7 @@ void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) {
algo.name.c_str(), str_on_inp_shape.c_str());
ImplExecutionPolicy policy;
policy.algo = algo.desc;
ctx.construct_execution_policy_from_cache(require_reproducible, policy);
ctx.construct_execution_policy(require_reproducible, policy);
if (ctx.get_workspace_size_bytes(policy) >= workspace_limit)
continue;
......@@ -302,7 +302,7 @@ AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible,
});
}
typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
ctx.construct_execution_policy_from_cache(require_reproducible, policy);
ctx.construct_execution_policy(require_reproducible, policy);
return policy;
MIDOUT_E
}
......@@ -324,6 +324,11 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
ImplExecutionPolicy policy;
if (auto algo_choose_hook = mgb_opr->algo_chooser()) {
policy = algo_choose_hook(mgb_opr);
ctx.construct_execution_policy(
mgb_opr->execution_policy().strategy ==
mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::
HEURISTIC_REPRODUCIBLE,
policy, false);
}
if (!policy.algo.valid()) {
policy = get_policy(ctx);
......@@ -520,13 +525,26 @@ AlgoChooser<Opr>::ExeContext::get_all_candidates() const {
}
template <typename Opr>
void AlgoChooser<Opr>::ExeContext::construct_execution_policy_from_cache(
void AlgoChooser<Opr>::ExeContext::construct_execution_policy(
bool require_reproducible,
typename AlgoChooser<Opr>::ImplExecutionPolicy& policy) const {
typename AlgoChooser<Opr>::ImplExecutionPolicy& policy,
bool retrive_from_cache) const {
if (!policy.algo.valid()) {
policy.algo = get_profile_result_from_cache(require_reproducible).desc;
if (retrive_from_cache) {
policy.algo =
get_profile_result_from_cache(require_reproducible).desc;
} else {
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
owner_graph(), m_cn, m_execution_policy.workspace_limit);
policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic(
args..., workspace_limit,
require_reproducible),
m_layouts)
.desc;
}
mgb_assert(policy.algo.valid(),
"No cache found, maybe some error occured");
"No algo found from cache or heuristic, maybe some error "
"occured");
}
Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo);
......@@ -544,8 +562,9 @@ void AlgoChooser<Opr>::ExeContext::construct_execution_policy_from_cache(
_item.param, m_base_mgb_opr, m_cn, m_execution_policy,
m_allow_weight_preprocess);
policy.sub_policy.push_back({});
sub_ctx.construct_execution_policy_from_cache(require_reproducible,
policy.sub_policy.back());
sub_ctx.construct_execution_policy(require_reproducible,
policy.sub_policy.back(),
retrive_from_cache);
});
return;
......@@ -672,11 +691,11 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const {
AlgoChooser<megdnn::Opr>::ExeContext::get_workspace_size_bytes( \
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \
policy) const; \
template void AlgoChooser<megdnn::Opr>::ExeContext:: \
construct_execution_policy_from_cache( \
bool require_reproducible, \
typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \
policy) const; \
template void \
AlgoChooser<megdnn::Opr>::ExeContext::construct_execution_policy( \
bool require_reproducible, \
typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& policy, \
bool retrive_from_cache) const; \
template Maybe<AlgoChooserProfileCache::ResultEntry> \
AlgoChooser<megdnn::Opr>::ExeContext::profile_single_algo( \
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \
......
......@@ -129,13 +129,16 @@ public:
ImplAlgo get_profile_result_from_cache(bool require_reproducible) const;
/**
* \brief construct execution policy from cache.
* \brief construct execution policy from cache or heuristic.
*
* \param require_reproducible select algo which is reproducible
* \param policy execution policy
* \param retrive_from_cache retrieve the algo from the cache if set to
* true, get it from the heuristic otherwise.
*/
void construct_execution_policy_from_cache(
bool require_reproducible, ImplExecutionPolicy& policy) const;
void construct_execution_policy(
bool require_reproducible, ImplExecutionPolicy& policy,
bool retrive_from_cache = true) const;
private:
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const;
......