Commit 633016a9 authored by Megvii Engine Team, committed by huangxinda

fix(dnn/cuda): fix AlgoFallbackNCHWQS8 to support Float32 dst

GitOrigin-RevId: 06f90f5cf384bc4ddb2f97860e4f530ee9a85705
Parent e6caa9ff
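Background note (added for context, not part of the commit): the fallback algorithm runs the convolution in the blocked NCHW4 layout, in which channels are padded up to a multiple of 4 and split into a (C/4, 4) pair. A minimal sketch of that shape mapping, using a hypothetical standalone helper rather than MegDNN's RelayoutFormat machinery:

    #include <array>
    #include <cstddef>

    // Hypothetical illustration of the NCHW -> NCHW4 shape mapping; MegDNN
    // derives the real layout via RelayoutFormat (see make_inner_layout below).
    std::array<std::size_t, 5> nchw_to_nchw4(std::array<std::size_t, 4> nchw) {
        std::size_t n = nchw[0], c = nchw[1], h = nchw[2], w = nchw[3];
        std::size_t c_blk = (c + 3) / 4;  // channels padded to a multiple of 4
        return {n, c_blk, h, w, 4};       // N, C/4, H, W, 4
    }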
@@ -50,15 +50,23 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::make_inner_layout(
     deduce_reformat_layout(relayout_src, *args.filter_layout,
                            inner_weight_layout,
                            RelayoutFormat::Param::Mode::NCHW_NCHW4_WEIGHT);
-    deduce_reformat_layout(relayout_src, *args.dst_layout, inner_dst_layout,
-                           RelayoutFormat::Param::Mode::NCHW_NCHW4, 0,
-                           args.filter_meta.group);
-    deduce_reformat_layout(relayout_src, *args.bias_layout, inner_bias_layout,
-                           RelayoutFormat::Param::Mode::NCHW_NCHW4, 0,
-                           args.filter_meta.group);
-    deduce_reformat_layout(relayout_src, *args.z_layout, inner_z_layout,
-                           RelayoutFormat::Param::Mode::NCHW_NCHW4, 0,
-                           args.filter_meta.group);
+    bool dst_float = args.dst_layout->dtype.enumv() == DTypeEnum::Float32;
+    if (dst_float) {
+        inner_dst_layout = *args.dst_layout;
+        inner_bias_layout = *args.bias_layout;
+        inner_z_layout = *args.z_layout;
+    } else {
+        deduce_reformat_layout(relayout_src, *args.dst_layout, inner_dst_layout,
+                               RelayoutFormat::Param::Mode::NCHW_NCHW4, 0,
+                               args.filter_meta.group);
+        deduce_reformat_layout(relayout_src, *args.bias_layout,
+                               inner_bias_layout,
+                               RelayoutFormat::Param::Mode::NCHW_NCHW4, 0,
+                               args.filter_meta.group);
+        deduce_reformat_layout(relayout_src, *args.z_layout, inner_z_layout,
+                               RelayoutFormat::Param::Mode::NCHW_NCHW4, 0,
+                               args.filter_meta.group);
+    }
 };

 bool ConvBiasForwardImpl::AlgoFallbackNCHWQS8::is_available(
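Why the new branch above can skip the relayout for a Float32 dst: the NCHW4_NCHW format consumes NCHW4 activations but writes an NCHW result directly, so dst, bias, and z keep their caller-provided layouts. A self-contained sketch restating that choice (InnerFormat is an invented local enum, not MegDNN API):

    enum class InnerFormat { NCHW4, NCHW4_NCHW };

    // Sketch of the selection this patch introduces: NCHW4_NCHW lets the
    // inner kernel write the Float32 NCHW dst in place, so that branch
    // needs no extra relayout passes for dst/bias/z.
    InnerFormat pick_inner_format(bool dst_float) {
        return dst_float ? InnerFormat::NCHW4_NCHW : InnerFormat::NCHW4;
    }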
@@ -70,8 +78,7 @@ bool ConvBiasForwardImpl::AlgoFallbackNCHWQS8::is_available(
     auto&& param = args.opr->param();
     bool is_format_ok = param.format == param::ConvBias::Format::NCHW;
     bool is_version_ok = CUDNN_VERSION >= 7500;
-    bool is_dtype_ok =
-            args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8;
+    bool is_dtype_ok = args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8;
     bool is_bias_ok =
             args.bias_layout->ndim == 0 ||
             (args.bias_layout->ndim == 4 && args.bias_layout->shape[0] == 1 &&
@@ -90,17 +97,23 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS8::get_workspace_bundle(
     TensorLayout inner_z_layout;
     make_inner_layout(args, inner_src_layout, inner_weight_layout,
                       inner_dst_layout, inner_bias_layout, inner_z_layout);
-    auto opr = args.handle->create_operator<ConvBiasForward>();
     Param inner_conv_param = args.opr->param();
-    inner_conv_param.format = Param::Format::NCHW4;
+    size_t ws_dst = 0, ws_bias = 0, ws_z = 0;
+    if (args.dst_layout->dtype.enumv() == DTypeEnum::Float32) {
+        inner_conv_param.format = Param::Format::NCHW4_NCHW;
+    } else {
+        inner_conv_param.format = Param::Format::NCHW4;
+        ws_dst = inner_dst_layout.span().dist_byte();
+        ws_bias = inner_bias_layout.span().dist_byte();
+        ws_z = inner_z_layout.span().dist_byte();
+    }
+    auto opr = args.handle->create_operator<ConvBiasForward>();
     opr->param() = inner_conv_param;
-    return WorkspaceBundle(ptr, {inner_src_layout.span().dist_byte(),
-                                 inner_weight_layout.span().dist_byte(),
-                                 inner_dst_layout.span().dist_byte(),
-                                 inner_bias_layout.span().dist_byte(),
-                                 inner_z_layout.span().dist_byte(),
-                                 opr->get_workspace_in_bytes(
-                                         inner_src_layout, inner_weight_layout,
-                                         inner_bias_layout, inner_z_layout,
-                                         inner_dst_layout, nullptr)});
+    return WorkspaceBundle(
+            ptr,
+            {inner_src_layout.span().dist_byte(),
+             inner_weight_layout.span().dist_byte(), ws_dst, ws_bias, ws_z,
+             opr->get_workspace_in_bytes(inner_src_layout, inner_weight_layout,
+                                         inner_bias_layout, inner_z_layout,
+                                         inner_dst_layout, nullptr)});
 }
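The bundle above carries six slots: relayout buffers for src, weight, dst, bias, and z, plus the inner operator's own workspace. After this change the dst/bias/z slots collapse to zero bytes on the Float32 path, since those tensors stay in the caller's NCHW storage. A self-contained sketch of that accounting (sizes modeled as plain byte counts; in MegDNN each comes from TensorLayout::span().dist_byte()):

    #include <cstddef>
    #include <vector>

    // Sketch of the six workspace slots after this change.
    std::vector<std::size_t> fallback_ws_slots(
            std::size_t src_bytes, std::size_t weight_bytes,
            std::size_t dst_bytes, std::size_t bias_bytes, std::size_t z_bytes,
            std::size_t inner_ws_bytes, bool dst_float) {
        if (dst_float) {
            // Float32 dst path: dst/bias/z are used in place in NCHW
            // storage, so no relayout buffers are reserved for them.
            dst_bytes = bias_bytes = z_bytes = 0;
        }
        return {src_bytes, weight_bytes, dst_bytes, bias_bytes, z_bytes,
                inner_ws_bytes};
    }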
@@ -145,22 +158,33 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::exec(
     TensorND inner_bias(bundle.get(3), inner_bias_layout);
     TensorND inner_z(bundle.get(4), inner_z_layout);
+    bool dst_float = args.dst_layout->dtype.enumv() == DTypeEnum::Float32;
     Param inner_conv_param = args.opr->param();
-    inner_conv_param.format = Param::Format::NCHW4;
+    inner_conv_param.format =
+            dst_float ? Param::Format::NCHW4_NCHW : Param::Format::NCHW4;
     auto inner_opr = args.handle->create_operator<ConvBiasForward>();
     inner_opr->param() = inner_conv_param;
     relayout_nchw_nchw4->exec(*args.src_tensor, inner_src, {});
     relayout_weight->exec(*args.filter_tensor, inner_weight, {});
-    if (inner_bias_layout.ndim > 0) {
-        relayout_nchw_nchw4->exec(*args.bias_tensor, inner_bias, {});
-    }
-    if (inner_z_layout.ndim > 0) {
-        relayout_nchw_nchw4->exec(*args.z_tensor, inner_z, {});
-    }
+    if (dst_float) {
+        inner_opr->exec(inner_src, inner_weight, *args.bias_tensor,
+                        *args.z_tensor, *args.dst_tensor, nullptr,
+                        Workspace((dt_byte*)bundle.get(5), bundle.get_size(5)));
+    } else {
+        if (inner_bias_layout.ndim > 0) {
+            relayout_nchw_nchw4->exec(*args.bias_tensor, inner_bias, {});
+        }
+        if (inner_z_layout.ndim > 0) {
+            relayout_nchw_nchw4->exec(*args.z_tensor, inner_z, {});
+        }
+        inner_opr->exec(inner_src, inner_weight, inner_bias, inner_z, inner_dst,
+                        nullptr,
+                        Workspace((dt_byte*)bundle.get(5), bundle.get_size(5)));
+        relayout_nchw4_nchw->exec(inner_dst, *args.dst_tensor, {});
+    }
-    inner_opr->exec(inner_src, inner_weight, inner_bias, inner_z, inner_dst,
-                    nullptr,
-                    Workspace((dt_byte*)bundle.get(5), bundle.get_size(5)));
-    relayout_nchw4_nchw->exec(inner_dst, *args.dst_tensor, {});
 }

 // vim: syntax=cpp.doxygen
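A condensed restatement of the two exec() paths above, as a runnable mock with invented stub names (the real calls are the RelayoutFormat and ConvBiasForward operators in the diff):

    #include <cstdio>

    // Invented stubs for illustration; they only document the control flow.
    static void relayout(const char* step) { std::printf("relayout: %s\n", step); }
    static void inner_conv(const char* fmt) { std::printf("inner conv: %s\n", fmt); }

    void run_fallback(bool dst_float) {
        relayout("src NCHW -> NCHW4");            // both paths
        relayout("filter NCHW -> NCHW4 weight");  // both paths
        if (dst_float) {
            // NCHW4_NCHW writes the Float32 NCHW dst in place; bias/z are
            // fed to the inner conv in their original NCHW storage.
            inner_conv("NCHW4_NCHW");
        } else {
            relayout("bias NCHW -> NCHW4");
            relayout("z NCHW -> NCHW4");
            inner_conv("NCHW4");
            relayout("dst NCHW4 -> NCHW");        // copy the result back
        }
    }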
@@ -192,8 +192,7 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec(
     dst_dtype = args.dst_layout->dtype;
     megdnn_assert(
             (src_dtype.category() == dst_dtype.category()) ||
-            (args.opr->param().format == param::ConvBias::Format::NCHW4_NCHW &&
-             src_dtype.enumv() == DTypeEnum::QuantizedS8 &&
+            (src_dtype.enumv() == DTypeEnum::QuantizedS8 &&
              dst_dtype.enumv() == DTypeEnum::Float32));
     megdnn_assert(src_dtype.category() == filter_dtype.category());
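This hunk loosens the assertion: the QuantizedS8-src-with-Float32-dst pairing is now accepted regardless of the operator's format, since the fallback presents an NCHW interface while running NCHW4_NCHW internally. A self-contained sketch of the relaxed predicate (local enums invented for illustration):

    enum class DT { QuantizedS8, QuantizedS32, Float32 };
    enum class Category { Quantized, Float };

    static Category category(DT d) {
        return d == DT::Float32 ? Category::Float : Category::Quantized;
    }

    // Relaxed check: same category, or the mixed QuantizedS8 -> Float32
    // pair, with no format restriction.
    bool dtype_pair_ok(DT src, DT dst) {
        return category(src) == category(dst) ||
               (src == DT::QuantizedS8 && dst == DT::Float32);
    }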
@@ -28,6 +28,15 @@ namespace megdnn {
 namespace test {
 namespace conv{

+TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CUDNN_CONVOLUTION) {
+    require_compute_capability(7, 5);
+    conv_bias::check_conv_bias(
+            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
+            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
+            handle_cuda(), "DEFAULT:CUDNN:ConvBiasActivation:",
+            param::ConvBias::Format::NCHW4);
+}
+
 TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
     require_compute_capability(6, 1);
     conv_bias::check_conv_bias(
@@ -689,6 +698,82 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
 }

+TEST_F(CUDA, FALLBACK_CONV_QS8) {
+    require_compute_capability_eq(7, 5);
+    Checker<ConvBiasForward> checker(handle_cuda());
+    auto check = [&checker](const std::string&& algo) {
+        checker.set_before_exec_callback(
+                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
+        UniformIntRNG rng{-3, 3};
+        UniformIntRNG bias_rng{-50, 50};
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_rng(2, &bias_rng)
+                .set_rng(3, &rng)
+                .set_dtype(0, dtype::QuantizedS8{1.2f})
+                .set_dtype(1, dtype::QuantizedS8{1.3f})
+                .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
+                .set_dtype(3, dtype::QuantizedS8{19.990229f})
+                .set_dtype(4, dtype::QuantizedS8{19.990228f})
+                .set_epsilon(1e-3)
+                .set_max_avg_error(1e-1)
+                .set_max_avg_biased_error(1e-3);
+        param::ConvBias param;
+        param.pad_h = param.pad_w = 1;
+        param.stride_h = param.stride_w = 2;
+        param.format = param::ConvBias::Format::NCHW;
+        checker.set_param(param).execs({{16, 15, 14, 14},
+                                        {28, 15, 3, 3},
+                                        {1, 28, 1, 1},
+                                        {16, 28, 7, 7},
+                                        {}});
+        checker.set_param(param).execs({{16, 32, 14, 14},
+                                        {32, 32, 3, 3},
+                                        {1, 32, 1, 1},
+                                        {},
+                                        {}});
+    };
+    check("FALLBACK_CONV_NCHW_QS8");
+}
+
+TEST_F(CUDA, FALLBACK_CONV_QS8_F32) {
+    require_compute_capability_eq(7, 5);
+    Checker<ConvBiasForward> checker(handle_cuda());
+    auto check = [&checker](const std::string&& algo) {
+        checker.set_before_exec_callback(
+                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
+        UniformIntRNG rng{-3, 3};
+        UniformFloatRNG bias_rng{-50.f, 50.f};
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_rng(2, &bias_rng)
+                .set_rng(3, &rng)
+                .set_dtype(0, dtype::QuantizedS8{1.2f})
+                .set_dtype(1, dtype::QuantizedS8{1.3f})
+                .set_dtype(2, dtype::Float32{})
+                .set_dtype(3, dtype::Float32{})
+                .set_dtype(4, dtype::Float32{})
+                .set_epsilon(1e-3)
+                .set_max_avg_error(1e-1)
+                .set_max_avg_biased_error(1e-3);
+        param::ConvBias param;
+        param.pad_h = param.pad_w = 1;
+        param.stride_h = param.stride_w = 2;
+        param.format = param::ConvBias::Format::NCHW;
+        checker.set_param(param).execs({{16, 15, 14, 14},
+                                        {28, 15, 3, 3},
+                                        {1, 28, 1, 1},
+                                        {16, 28, 7, 7},
+                                        {}});
+        checker.set_param(param).execs({{16, 32, 14, 14},
+                                        {32, 32, 3, 3},
+                                        {1, 32, 1, 1},
+                                        {},
+                                        {}});
+    };
+    check("FALLBACK_CONV_NCHW_QS8");
+}
+
 TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_WEIGHT_PREPROCESS) {
     require_compute_capability(6, 1);
     Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(