Commit 75eebb7c authored by Megvii Engine Team

feat(opr): use weight preprocess feature of MegDNN

GitOrigin-RevId: 779041f8a87051e58d5e0ca289773b05a261a8a0
Parent 66509a54
...@@ -51,6 +51,17 @@ protected:
};
using SeparableConv = SeparableConvForward;
namespace detail {
struct PreprocessedFilter {
//! user data; its lifetime should be bound to MegDNN Convolution
//! operator
void* algorithm_id;
TensorNDArray tensors;
};
} // namespace detail
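// A minimal usage sketch, inferred from this commit (the operator methods
// referenced here are introduced further down in the diff):
//
//   PreprocessedFilter pf;
//   pf.algorithm_id = nullptr;   // to be filled by the chosen algorithm
//   pf.tensors = {...};          // caller-owned storage for transformed weights
//   opr->exec_preprocess(..., &pf, workspace);    // transform weights once
//   opr->exec(src, filter, dst, &pf, workspace);  // reuse on every later run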
/**
* \brief base class for convolution operation
*
...@@ -131,13 +142,7 @@ public:
return flag;
}
};
using PreprocessedFilter = detail::PreprocessedFilter;
struct PreprocessedFilter {
//! user data; its lifetime should be bound to MegDNN Convolution
//! operator
void* algorithm_id;
TensorNDArray tensors;
};
protected:
// Check or deduce output DType
...
...@@ -10,6 +10,7 @@
*/
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/io.h"
#include "megbrain/graph/grad_impl.h" #include "megbrain/graph/grad_impl.h"
#include "megbrain/system.h" #include "megbrain/system.h"
...@@ -95,67 +96,14 @@ MGB_FOREACH_FASTRUN_OPR(cb) ...@@ -95,67 +96,14 @@ MGB_FOREACH_FASTRUN_OPR(cb)
#undef cb #undef cb
template <class MGBOpr>
struct OprAttributeTrait {
static bool is_weights_persistent(const MGBOpr*) { return false; }
};
template <>
struct OprAttributeTrait<opr::ConvBias> {
//! return true if the flag of weights is PERSISTENT_DEVICE_VALUE, false
//! otherwise. True means weights can be transformed in the first run.
static bool is_weights_persistent(const opr::ConvBias* opr) {
return opr->input()[1]->contain_flag(
VarNode::Flag::PERSISTENT_DEVICE_VALUE);
}
};
template <typename Opr>
constexpr bool opr_supports_preprocess() {
return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
std::is_same<Opr, megdnn::ConvBias>::value;
}
template <typename Opr>
struct OprArityTrait;
#define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
template <typename Opr, int _arity_in, int _arity_out>
struct OprArityTraitTmpl {
static constexpr int arity_in = _arity_in;
static constexpr int arity_out = _arity_out;
static constexpr int arity = arity_in + arity_out;
using Algorithm = typename Opr::Algorithm;
using TensorLayoutArray = std::array<TensorLayout, arity>;
static size_t get_workspace_in_bytes(Opr* opr, Algorithm* algo,
const TensorLayoutArray& layouts) {
opr->execution_policy() = {algo};
size_t workspace_size;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto) {
workspace_size = APPLY(
opr->get_workspace_in_bytes(args..., nullptr), layouts);
}, /* else */ [&](auto) {
workspace_size =
APPLY(opr->get_workspace_in_bytes(args...), layouts);
});
return workspace_size;
}
static void exec(Opr* opr,
const std::array<DeviceTensorND, arity_in>& inp_val,
const std::array<DeviceTensorND, arity_out>& out_val,
megdnn::Workspace& workspace) {
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto) {
APPLY(opr->exec(args.as_megdnn()..., nullptr, workspace), inp_val,
out_val);
}, /* else */ [&](auto) {
APPLY(opr->exec(args.as_megdnn()..., workspace), inp_val, out_val);
});
}
};
#define INST_ARITY(_Opr, _in, _out) \
...@@ -179,6 +127,26 @@ INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3);
#undef INST_ARITY
template <typename Opr>
constexpr bool opr_supports_preprocess() {
return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
std::is_same<Opr, megdnn::ConvBias>::value;
}
template <typename Opr, bool has_prep>
struct PreprocessFilterImpl {
using T = union {};
};
template <typename Opr>
struct PreprocessFilterImpl<Opr, true> {
using T = typename Opr::PreprocessedFilter;
};
template <typename Opr>
using PreprocessFilter =
typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T;
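// PreprocessFilterImpl falls back to an empty union type for operators that
// do not support preprocessing, so PreprocessFilter<Opr> stays well-formed
// for every Opr and the if_constexpr branches below can mention it freely.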
// timeout delta to be added with fastest known algorithm for new algos
constexpr double TIMEOUT_TOLERANCE = 2;
...@@ -225,6 +193,7 @@ public:
CompNode::Locator comp_node_loc;
ConvTensorShapes shapes;
typename Opr::Param opr_param;
bool allow_weight_preprocess;
//! filled by profile()
mutable double actual_timeout;
...@@ -277,6 +246,10 @@ double TimedProfiler<Opr>::init_timeout_setting() {
return 0;
}
#define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
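// APPLY expands a tuple of arguments into a variadic call; for example,
// APPLY(opr->exec(args.as_megdnn()..., ws), inp_val, out_val) becomes
// opr->exec(inp_val[0].as_megdnn(), ..., out_val[N-1].as_megdnn(), ws).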
template <typename Opr>
typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
const TParam& raw_param) {
...@@ -324,6 +297,16 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
megdnn_opr->execution_policy() = {algo};
}
// Allocate preprocessed weight buffers.
TensorLayoutArray preprocessed_layout;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (param.allow_weight_preprocess) {
preprocessed_layout = APPLY(
_(megdnn_opr)->deduce_preprocessed_filter_layout(args...),
layouts);
}
});
{
// first allocate a whole chunk to avoid memory fragmentation (here we
// rely on memory allocator to reuse memory)
...@@ -332,6 +315,9 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
DeviceTensorStorage storage{cn};
storage.ensure_size(tot_size);
...@@ -362,15 +348,46 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
mdn_workspace.raw_ptr = workspace.raw_ptr();
}
// allocate storage for preprocessed filter
SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size());
for (size_t i = 0; i < preprocessed_layout.size(); i++) {
flt_val[i] = {cn, preprocessed_layout[i], preprocessed_layout[i].dtype,
preprocessed_layout[i].format};
}
for (int i = 0; i < arity_in; ++i) {
fill_zero_dev_tensor(inp_val[i]);
}
PreprocessFilter<Opr> prep_flt;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (!preprocessed_layout.empty()) {
auto&& pf = _(prep_flt);
pf.algorithm_id = nullptr;
pf.tensors.resize(flt_val.size());
for (size_t i = 0; i < flt_val.size(); i++) {
pf.tensors[i] = flt_val[i].as_megdnn();
}
APPLY(_(megdnn_opr)->exec_preprocess(args..., &pf, mdn_workspace),
std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()),
array_skip<2>(layouts));
}
});
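// Note the argument mix above: exec_preprocess() takes the src/dst layouts
// but the filter as a value tensor, hence the forward_as_tuple of layouts[0]
// and inp_val[1] followed by the remaining layouts.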
RealTimer timer;
auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER),
ev_end = cn.create_event(CompNode::Event::NEED_TIMER);
ev_start->record();
OprArityTrait<Opr>::exec(megdnn_opr.get(), inp_val, out_val, mdn_workspace);
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
auto&& opr = _(megdnn_opr);
PreprocessFilter<Opr>* pf =
preprocessed_layout.empty() ? nullptr : &prep_flt;
APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val,
out_val);
}, /* else */ [&](auto _) {
APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val,
out_val);
});
ev_end->record();
double next_report_time = 0.5;
...@@ -425,13 +442,15 @@ class AlgoChooser {
const ConvTensorLayouts& m_layouts;
Opr* m_megdnn_opr;
const MGBOpr* m_mgb_opr;
bool m_allow_weight_preprocess;
public:
ExeContext(const ConvTensorLayouts& layouts, Opr* megdnn_opr,
const MGBOpr* mgb_opr)
const MGBOpr* mgb_opr, bool allow_weight_preprocess)
: m_layouts{layouts},
m_megdnn_opr{megdnn_opr},
m_mgb_opr{mgb_opr} {
m_mgb_opr{mgb_opr},
m_allow_weight_preprocess{allow_weight_preprocess} {
mgb_assert(m_layouts.size() == layouts.size());
static_assert(
std::tuple_size<ConvTensorLayouts>::value == 3 ||
...@@ -499,8 +518,23 @@ class AlgoChooser {
//! get workspace size required for specific algo
size_t get_workspace_size_bytes(ImplAlgo algo) const {
return OprArityTrait<Opr>::get_workspace_in_bytes(m_megdnn_opr,
algo, m_layouts);
m_megdnn_opr->execution_policy() = {algo};
size_t result;
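// For preprocess-capable operators, report the max of the preprocess-time
// and exec-time workspace requirements: a single allocation serves both
// phases.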
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
auto&& opr = _(m_megdnn_opr);
auto prep = construct_fake_preprocess_filter();
PreprocessFilter<Opr>* prep_ptr =
prep.valid() ? &prep.val() : nullptr;
result = std::max(
APPLY(opr->get_preprocess_workspace_in_bytes(args...),
m_layouts),
APPLY(opr->get_workspace_in_bytes(args..., prep_ptr),
m_layouts));
}, /* else */ [&](auto _) {
result = APPLY(_(m_megdnn_opr)->get_workspace_in_bytes(args...),
m_layouts);
});
return result;
}
/*!
...@@ -525,6 +559,28 @@
*/
void modify_param_with_weights_preprocessed( void modify_param_with_weights_preprocessed(
typename TimedProfiler<Opr>::Param& param) const {} typename TimedProfiler<Opr>::Param& param) const {}
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const {
Maybe<PreprocessFilter<Opr>> result = None;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (!m_allow_weight_preprocess)
return;
auto opr = _(m_megdnn_opr);
auto layout =
APPLY(opr->deduce_preprocessed_filter_layout(args...),
m_layouts);
if (layout.empty())
return;
result = PreprocessFilter<Opr>{};
auto& res = result.val();
res.algorithm_id = nullptr;
res.tensors.resize(layout.size());
for (size_t i = 0; i < layout.size(); i++) {
res.tensors[i] = megdnn::TensorND(nullptr, layout[i]);
}
});
return result;
}
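// The filter built above is "fake" in that its tensors carry layouts but a
// null raw_ptr: it lets workspace queries see the shapes of a real
// preprocessed filter without allocating device memory.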
};
//! entrance for getting algorithm according to execution strategy
...@@ -571,12 +627,13 @@ public:
* \brief setup algorithm and return workspace size
*/
static size_t setup_algo(const ConvTensorLayouts& layouts, Opr* megdnn_opr,
const MGBOpr* mgb_opr) {
const MGBOpr* mgb_opr,
bool allow_weight_preprocess = false) {
if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) {
return 0;
}
ExeContext ctx(layouts, megdnn_opr, mgb_opr);
ExeContext ctx(layouts, megdnn_opr, mgb_opr, allow_weight_preprocess);
auto algo = get_algo(ctx);
size_t workspace = ctx.get_workspace_size_bytes(algo);
...@@ -780,9 +837,6 @@ Maybe<AlgoChooserProfileCache::ResultEntry>
AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo,
double& timeout) const {
typename TimedProfiler<Opr>::Param param;
bool is_weights_persistent =
OprAttributeTrait<typename MegDNNOpr2MGBOpr<Opr>::MGBOpr>::
is_weights_persistent(m_mgb_opr);
auto name = algo->name();
// force check copy size <= dest len-1 from gcc8 for safe
auto len = sizeof(param.algo_name);
...@@ -806,8 +860,9 @@ AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo,
for (size_t i = 0; i < param.shapes.size(); ++i)
param.shapes[i] = m_layouts[i];
param.opr_param = m_megdnn_opr->param();
param.allow_weight_preprocess = m_allow_weight_preprocess;
if (is_weights_persistent) {
if (m_allow_weight_preprocess) {
modify_param_with_weights_preprocessed(param);
}
...@@ -911,6 +966,78 @@ AlgoChooserProfileCache& mixin::Convolution::profile_cache() const {
return *m_profile_cache;
}
class mixin::WeightPreprocessExecutor::PreprocessedFilterExecDep final
: public cg::GraphExecutable::ExecDependency {
std::unique_ptr<PreprocessedFilter> m_pf;
SmallVector<DeviceTensorND> m_filter_storage;
public:
explicit PreprocessedFilterExecDep(
std::unique_ptr<PreprocessedFilter> preprocessed_filter,
SmallVector<DeviceTensorND> filter_storage)
: m_pf(std::move(preprocessed_filter)),
m_filter_storage(std::move(filter_storage)) {}
};
void mixin::WeightPreprocessExecutor::mixin_update_preprocessed_filter(
cg::OperatorNodeBase& opr) {
if (!mixin_allow_weight_preprocess(opr)) return;
auto new_layout = deduce_preprocessed_filter_layout();
if (new_layout.empty()) {
// Weight preprocess was needed before, but no longer needed.
if (m_preprocessed_filter) {
m_preprocessed_filter.reset();
m_filter_storage.clear();
}
return;
}
bool should_update = false;
size_t new_size = new_layout.size();
if (!m_preprocessed_filter ||
m_preprocessed_filter->tensors.size() != new_size) {
should_update = true;
} else {
for (size_t i = 0; i < new_size; i++) {
if (!new_layout[i].eq_layout(
m_preprocessed_filter->tensors[i].layout)) {
should_update = true;
break;
}
}
}
if (!should_update) return;
if (!m_preprocessed_filter) {
m_preprocessed_filter.reset(new PreprocessedFilter{});
}
m_preprocessed_filter->tensors.resize(new_size);
m_filter_storage.resize(new_size);
m_preprocessed_filter->algorithm_id = nullptr;
for (size_t i = 0; i < new_size; i++) {
m_filter_storage[i] = {opr.output(0)->comp_node(), new_layout[i],
new_layout[i].dtype, new_layout[i].format};
m_preprocessed_filter->tensors[i] = m_filter_storage[i].as_megdnn();
}
scn_do_execute_preprocess();
}
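// Net effect: the filter is re-preprocessed only when the deduced layouts
// change; steady-state executions reuse the cached transformed weights.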
void mixin::WeightPreprocessExecutor::record_preprocessed_weight(
cg::GraphExecutable::ExecDependencyArray& deps) {
deps.emplace_back(new PreprocessedFilterExecDep{
std::move(m_preprocessed_filter), std::move(m_filter_storage)});
}
bool mixin::WeightPreprocessExecutor::mixin_allow_weight_preprocess(
const cg::OperatorNodeBase& opr) const {
bool param_merged = opr.input(1)
->owner_opr()
->same_type<opr::MultipleDeviceTensorHolder>();
return opr.input(1)->contain_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE) &&
(cg::is_const_var_value(opr.input(1)) || param_merged);
}
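// Preprocessing is allowed only for a filter that is persistent on the
// device and whose value is constant, either directly or because parameters
// were merged into a MultipleDeviceTensorHolder; anything else could leave
// the cached transformed weights stale after the first run.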
/* ==================== ConvolutionForward ==================== */
IMPL_CONV(ConvolutionForward, "conv_fwd");
...@@ -971,7 +1098,7 @@ size_t ConvolutionForward::get_workspace_size_bytes(
input(0)->format()},
{input_shapes[1], input(1)->dtype(), input(1)->format()},
{output_shapes[0], output(0)->dtype(), output(0)->format()}},
megdnn_opr(), this);
megdnn_opr(), this, allow_weight_preprocess());
}
void ConvolutionForward::init_output_format() {
...@@ -980,9 +1107,14 @@ void ConvolutionForward::init_output_format() {
}
void ConvolutionForward::scn_do_execute() {
if (input(1)->contain_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE) &&
cg::is_const_var_value(input(1))) {
update_preprocessed_filter();
}
megdnn_opr()->exec(input(0)->dev_tensor().as_megdnn(),
input(1)->dev_tensor().as_megdnn(),
output(0)->dev_tensor().as_megdnn(), nullptr,
output(0)->dev_tensor().as_megdnn(),
preprocessed_filter(),
intl::get_megdnn_workspace_from_var(output().back()));
}
...@@ -1012,6 +1144,20 @@ void ConvolutionForward::get_output_var_shape(
void ConvolutionForward::record_execute_deps(
cg::GraphExecutable::ExecDependencyArray& deps) {
record_megdnn_opr(deps);
record_preprocessed_weight(deps);
}
SmallVector<TensorLayout>
ConvolutionForward::deduce_preprocessed_filter_layout() {
return megdnn_opr()->deduce_preprocessed_filter_layout(
input(0)->layout(), input(1)->layout(), output(0)->layout());
}
void ConvolutionForward::scn_do_execute_preprocess() {
megdnn_opr()->exec_preprocess(
input(0)->layout(), input(1)->dev_tensor().as_megdnn(),
output(0)->layout(), preprocessed_filter(),
intl::get_megdnn_workspace_from_var(output().back()));
}
/* ==================== ConvolutionBackwardData ==================== */
...@@ -1504,10 +1650,12 @@ size_t ConvBiasForward::get_workspace_size_bytes(
i2,
i3,
{output_shapes[0], output(0)->dtype(), output(0)->format()}},
mo, this);
mo, this, allow_weight_preprocess());
}
void ConvBiasForward::scn_do_execute() {
update_preprocessed_filter();
auto&& inp = input();
auto mo = megdnn_opr();
if (inp.size() == 2) {
...@@ -1621,6 +1769,33 @@ megdnn::param::MatrixMul::Format ConvBiasForward::get_matmul_format(
}
}
SmallVector<TensorLayout> ConvBiasForward::deduce_preprocessed_filter_layout() {
TensorLayout i2, i3;
if (input().size() > 2) {
i2 = input(2)->layout();
}
if (input().size() > 3) {
i3 = input(3)->layout();
}
return megdnn_opr()->deduce_preprocessed_filter_layout(
input(0)->layout(), input(1)->layout(), i2, i3,
output(0)->layout());
}
void ConvBiasForward::scn_do_execute_preprocess() {
TensorLayout bias_layout(output(0)->dtype()), z_layout(output(0)->dtype());
if (input().size() > 2) {
bias_layout = input(2)->layout();
}
if (input().size() > 3) {
z_layout = input(3)->layout();
}
megdnn_opr()->exec_preprocess(
input(0)->layout(), input(1)->dev_tensor().as_megdnn(), bias_layout,
z_layout, output(0)->layout(), preprocessed_filter(),
intl::get_megdnn_workspace_from_var(output().back()));
}
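// Presumably, passing default-constructed layouts (carrying only the output
// dtype) for absent bias/z inputs lets the MegDNN side distinguish the
// 2- and 3-input forms; this mirrors the empty i2/i3 handling above.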
/* ===================== LocalShareForward ==================== */
IMPL_CONV(LocalShareForward, "local_share");
...
...@@ -72,13 +72,52 @@ class Convolution {
cg::OperatorNodeBase* self);
};
class WeightPreprocessExecutor : public cg::OperatorNodeMixinBase {
class PreprocessedFilterExecDep;
using PreprocessedFilter = megdnn::detail::PreprocessedFilter;
std::unique_ptr<PreprocessedFilter> m_preprocessed_filter;
SmallVector<DeviceTensorND> m_filter_storage;
protected:
//! this should only be called in scn_do_execute or similar functions (i.e.
//! post dispatch-to-ExecEnv)
void mixin_update_preprocessed_filter(OperatorNodeBase& opr);
void record_preprocessed_weight(
cg::GraphExecutable::ExecDependencyArray& deps);
PreprocessedFilter* preprocessed_filter() const {
return m_preprocessed_filter.get();
}
bool mixin_allow_weight_preprocess(const OperatorNodeBase& opr) const;
virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout() = 0;
virtual void scn_do_execute_preprocess() = 0;
};
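// The mixin owns both the megdnn-facing PreprocessedFilter and the backing
// DeviceTensorND storage, keeping transformed weights alive until they are
// handed off to the executable via record_preprocessed_weight().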
} // namespace mixin
namespace intl {
//! glue class to apply mixin::WeightPreprocessExecutor
template<class Base = cg::OperatorNodeBase,
class MixinImpl = mixin::WeightPreprocessExecutor>
class OprWithWeightPreprocess: public mixin::CheckBase<Base>::Base,
public MixinImpl {
protected:
using Base::Base;
void update_preprocessed_filter() {
this->mixin_update_preprocessed_filter(*this);
}
bool allow_weight_preprocess() const {
return this->mixin_allow_weight_preprocess(*this);
}
};
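// The glue class forwards the mixin entry points with *this as the operator
// node, so concrete oprs can call update_preprocessed_filter() and
// allow_weight_preprocess() without further plumbing.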
using ConvBiasBase = cg::SingleCNOperatorNode<
cg::OutshapePureByInshapeOpr<>,
mixin::MegDNNOprHolderImpl<megdnn::ConvBiasForward>>;
using ConvBiasForwardBase = WorkspaceSizeInfer<ConvBiasBase>;
using ConvBiasForwardBase =
OprWithWeightPreprocess<WorkspaceSizeInfer<ConvBiasBase>>;
using DeformableConvBackwardDataT = cg::SingleCNOperatorNode<
cg::OutshapePureByInshapeOpr<>,
...@@ -90,12 +129,20 @@ namespace intl {
mixin::MegDNNOprHolderImpl<megdnn::BatchConvBiasForward>>;
using BatchConvBiasForwardBase = WorkspaceSizeInfer<BatchConvBiasBase>;
using ConvolutionForwardBase = WorkspaceSizeInfer<
typename MegDNNOprWrapperFwdBase<megdnn::ConvolutionForward>::Base>;
using ConvolutionForwardBase = OprWithWeightPreprocess<
WorkspaceSizeInfer<typename MegDNNOprWrapperFwdBase<
megdnn::ConvolutionForward>::Base>>;
} // namespace intl
namespace testing {
class ConvolutionTestingPeer;
} // namespace testing
MGB_DEFINE_OPR_CLASS(ConvolutionForward,
intl::ConvolutionForwardBase, public mixin::Convolution) // {
void init_profile_cache() override;
void init_output_dtype() override;
size_t get_workspace_size_bytes(
...@@ -109,6 +156,10 @@ MGB_DEFINE_OPR_CLASS(ConvolutionForward,
TensorShapeArray& out_shape) const override final;
void record_execute_deps(
cg::GraphExecutable::ExecDependencyArray& deps) override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout() override;
void scn_do_execute_preprocess() override;
friend testing::ConvolutionTestingPeer;
public:
ConvolutionForward(VarNode *src, VarNode *filter,
...@@ -142,7 +193,10 @@ MGB_DEFINE_OPR_CLASS(ConvBiasForward, intl::ConvBiasForwardBase,
void record_execute_deps(
cg::GraphExecutable::ExecDependencyArray& deps) override {
this->record_megdnn_opr(deps);
this->record_preprocessed_weight(deps);
}
SmallVector<TensorLayout> deduce_preprocessed_filter_layout() override;
void scn_do_execute_preprocess() override;
public:
//! src * filter
...
...@@ -21,6 +21,8 @@
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/tensor_manip.h"
#include <gmock/gmock.h>
#include <cmath>
#include <random>
...@@ -244,7 +246,6 @@ opr::Convolution::Param convert_to_conv_param(
param.dilate_w, param.sparse, param.format};
};
#endif
} // anonymous namespace
TEST(TestOprDNN, ConvolutionForward) {
uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
...@@ -1172,6 +1173,7 @@ TEST(TestOprDNN, ConvBiasForward) {
{1, OC, 1, 1}},
opt3);
};
run(1, 1, 1, 5, 5, 1, 1);
run(1, 1, 1, 5, 5, 3, 3);
run(2, 3, 4, 5, 5, 3, 3);
run(3, 3, 4, 224, 223, 3, 3);
...@@ -2124,4 +2126,225 @@ TEST(TestOprDNN, ConvolutionMultiCompNode) {
#endif
} // anonymous namespace
namespace mgb {
namespace opr {
namespace testing {
class ConvolutionTestingPeer {
opr::ConvolutionForward& m_conv_opr;
public:
explicit ConvolutionTestingPeer(cg::OperatorNodeBase* opr)
: m_conv_opr(opr->cast_final_safe<opr::ConvolutionForward>()) {}
void set_megdnn_opr(
std::unique_ptr<megdnn::ConvolutionForward> megdnn_opr) {
m_conv_opr.set_megdnn_opr(std::move(megdnn_opr));
}
};
} // namespace testing
} // namespace opr
} // namespace mgb
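// ConvolutionTestingPeer lives in opr::testing so it can reach the private
// set_megdnn_opr() of ConvolutionForward (befriended in the header), letting
// the tests below swap in a gmock-based operator.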
namespace {
using megdnn::TensorND;
using megdnn::Workspace;
using opr::testing::ConvolutionTestingPeer;
class MockConvolutionForward : public megdnn::ConvolutionForward {
const char* m_algorithm_set_name;
public:
MockConvolutionForward(megdnn::ConvolutionForward* orig,
const char* algo_set_name)
: megdnn::ConvolutionForward(orig->handle()),
m_algorithm_set_name(algo_set_name) {}
MOCK_METHOD5(exec, void(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace));
MOCK_METHOD5(exec_preprocess,
void(const TensorLayout& src_layout, _megdnn_tensor_in filter,
const TensorLayout& dst_layout,
PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace));
MOCK_METHOD4(get_workspace_in_bytes,
size_t(const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter));
MOCK_METHOD3(deduce_preprocessed_filter_layout,
SmallVector<TensorLayout>(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst));
MOCK_METHOD3(get_preprocess_workspace_in_bytes,
size_t(const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst));
MOCK_METHOD3(get_all_algorithms,
std::vector<Algorithm*>(const TensorLayout& p0,
const TensorLayout& p1,
const TensorLayout& p2));
MOCK_METHOD5(get_algorithm_heuristic,
Algorithm*(const TensorLayout& p0, const TensorLayout& p1,
const TensorLayout& p2,
size_t workspace_limit_in_bytes,
bool reproducible));
const char* get_algorithm_set_name() const override {
return m_algorithm_set_name;
}
};
class MockAlgorithm : public megdnn::detail::Algorithm {
const char* m_name;
public:
MockAlgorithm(const char* name = "NotImportant") : m_name(name) {}
bool is_reproducible() const override { return true; }
const char* name() const override { return m_name; }
virtual ~MockAlgorithm() = default;
};
class TestWeightPreprocess : public ::testing::Test {
protected:
CompNode comp_node;
std::shared_ptr<ComputingGraph> graph;
std::shared_ptr<HostTensorND> x_host;
MockConvolutionForward* mock_conv_ptr;
SymbolVar y;
HostTensorND y_host;
std::unique_ptr<cg::AsyncExecutable> func;
MockConvolutionForward& mock_conv() { return *mock_conv_ptr; }
void SetUp() override {
constexpr uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2,
iw = ih;
comp_node = CompNode::load("cpux");
graph = ComputingGraph::make();
TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
auto x = opr::Host2DeviceCopy::make(*graph, x_host);
auto w = opr::ImmutableTensor::make(*graph, {comp_node, w_shape});
Param param;
param.pad_h = param.pad_w = ph;
param.stride_h = param.stride_w = sh;
param.format = Param::Format::NCHW;
y = opr::ConvolutionForward::make(x, w, param);
auto& opr =
y.node()->owner_opr()->cast_final<opr::ConvolutionForward>();
auto mock = std::make_unique<MockConvolutionForward>(
opr.megdnn_opr(), ::testing::UnitTest::GetInstance()
->current_test_info()
->name());
mock_conv_ptr = mock.get();
ConvolutionTestingPeer{&opr}.set_megdnn_opr(std::move(mock));
func = graph->compile({make_callback_copy(y, y_host)});
}
void run() { func->execute().wait(); }
void TearDown() override {
func.reset();
// Triggers mock check
graph.reset();
x_host.reset();
}
};
TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
using ::testing::_;
using ::testing::Return;
auto& mock = mock_conv();
MockAlgorithm algo;
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
.WillRepeatedly(Return(0));
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
.WillRepeatedly(Return(0));
{
::testing::InSequence seq;
// Return empty preprocess filters, indicating no need to preprocess
EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
.WillRepeatedly(Return(SmallVector<TensorLayout>{}));
EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
run();
}
}
TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
using ::testing::_;
using ::testing::Return;
using ::testing::Field;
using ::testing::Invoke;
using ::testing::Expectation;
using PF = MockConvolutionForward::PreprocessedFilter;
auto& mock = mock_conv();
MockAlgorithm algo;
SmallVector<TensorLayout> filter_layout{{{1, 2, 3, 4}, dtype::Float32()},
{{5, 6, 7, 8}, dtype::Float32()}};
EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
.WillRepeatedly(Return(filter_layout));
Expectation algo_call =
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillOnce(Return(&algo));
Expectation ws_call = EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
.After(algo_call)
.WillOnce(Return(0));
Expectation pre_ws_call =
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
.After(algo_call)
.WillOnce(Return(233));
{
::testing::InSequence seq;
// exec_preprocess should be called only once, with workspace allocated
int salt = 0;
EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _))
.After(ws_call, pre_ws_call)
.WillOnce(Invoke([&](const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, PF* pf,
_megdnn_workspace workspace) {
ASSERT_EQ(workspace.size, 233);
ASSERT_NE(pf, nullptr);
pf->algorithm_id = &salt;
ASSERT_EQ(pf->tensors.size(), 2);
ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
ASSERT_NE(pf->tensors[0].raw_ptr, nullptr);
ASSERT_NE(pf->tensors[1].raw_ptr, nullptr);
pf->tensors[0].ptr<float>()[0] = 114.514f;
pf->tensors[1].ptr<float>()[0] = 1926.0817f;
}));
// Run the graph multiple times.
for (int i = 0; i < 3; i++) {
if (i > 0) {
EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
}
EXPECT_CALL(mock, exec(_, _, _, _, _))
.WillOnce(Invoke([&](_megdnn_tensor_in, _megdnn_tensor_in,
_megdnn_tensor_out, const PF* pf,
_megdnn_workspace) {
ASSERT_NE(pf, nullptr);
ASSERT_EQ(pf->algorithm_id, &salt);
ASSERT_EQ(pf->tensors[0].ptr<float>()[0], 114.514f);
ASSERT_EQ(pf->tensors[1].ptr<float>()[0], 1926.0817f);
}));
run();
}
}
}
} // anonymous namespace
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}