From d58b5ef74b91ba68ef89f9ce8be8725d0dfcaa0d Mon Sep 17 00:00:00 2001 From: Anna Khakimova Date: Mon, 29 Nov 2021 14:20:53 +0300 Subject: [PATCH] Merge pull request #21119 from anna-khakimova:ak/simd_addc * GAPI Fluid: SIMD for AddC kernel * Final version * Applied comments. --- modules/gapi/include/opencv2/gapi/core.hpp | 1 + .../gapi/perf/common/gapi_core_perf_tests.hpp | 2 +- .../perf/common/gapi_core_perf_tests_inl.hpp | 16 +- .../perf/cpu/gapi_core_perf_tests_cpu.cpp | 3 +- .../perf/cpu/gapi_core_perf_tests_fluid.cpp | 11 +- .../perf/gpu/gapi_core_perf_tests_gpu.cpp | 3 +- .../gapi/src/backends/fluid/gfluidcore.cpp | 150 ++++++---- .../fluid/gfluidcore_func.dispatch.cpp | 27 ++ .../src/backends/fluid/gfluidcore_func.hpp | 23 ++ .../backends/fluid/gfluidcore_func.simd.hpp | 265 +++++++++++++++++- 10 files changed, 424 insertions(+), 77 deletions(-) diff --git a/modules/gapi/include/opencv2/gapi/core.hpp b/modules/gapi/include/opencv2/gapi/core.hpp index 35f875a809..052c6a944c 100644 --- a/modules/gapi/include/opencv2/gapi/core.hpp +++ b/modules/gapi/include/opencv2/gapi/core.hpp @@ -57,6 +57,7 @@ namespace core { G_TYPED_KERNEL(GAddC, , "org.opencv.core.math.addC") { static GMatDesc outMeta(GMatDesc a, GScalarDesc, int ddepth) { + GAPI_Assert(a.chan <= 4); return a.withDepth(ddepth); } }; diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp index f5916a6aaf..39557f1acb 100644 --- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp +++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp @@ -28,7 +28,7 @@ namespace opencv_test //------------------------------------------------------------------------------ class AddPerfTest : public TestPerfParams> {}; - class AddCPerfTest : public TestPerfParams> {}; + class AddCPerfTest : public TestPerfParams> {}; class SubPerfTest : public TestPerfParams> {}; class SubCPerfTest : public TestPerfParams> {}; class SubRCPerfTest : public TestPerfParams> {}; diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp index fbbda1a31d..b0568f9bae 100644 --- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp +++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp @@ -61,10 +61,13 @@ PERF_TEST_P_(AddPerfTest, TestPerformance) PERF_TEST_P_(AddCPerfTest, TestPerformance) { - Size sz = get<0>(GetParam()); - MatType type = get<1>(GetParam()); - int dtype = get<2>(GetParam()); - cv::GCompileArgs compile_args = get<3>(GetParam()); + compare_f cmpF; + cv::Size sz; + MatType type = -1; + int dtype = -1; + cv::GCompileArgs compile_args; + + std::tie(cmpF, sz, type, dtype, compile_args) = GetParam(); initMatsRandU(type, sz, dtype, false); @@ -88,8 +91,9 @@ PERF_TEST_P_(AddCPerfTest, TestPerformance) } // Comparison //////////////////////////////////////////////////////////// - // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv)); - EXPECT_EQ(out_mat_gapi.size(), sz); + { + EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv)); + } SANITY_CHECK_NOTHING(); } diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp index 09196fd24f..d7ead88327 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp @@ -22,7 +22,8 @@ INSTANTIATE_TEST_CASE_P(AddPerfTestCPU, AddPerfTest, Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(AddCPerfTestCPU, AddCPerfTest, - Combine(Values(szSmall128, szVGA, sz720p, sz1080p), + Combine(Values(AbsExact().to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), Values(-1, CV_8U, CV_16U, CV_32F), Values(cv::compile_args(CORE_CPU)))); diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp index 6c80231f32..a367937520 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp @@ -18,11 +18,12 @@ INSTANTIATE_TEST_CASE_P(AddPerfTestFluid, AddPerfTest, Values(-1, CV_8U, CV_32F), Values(cv::compile_args(CORE_FLUID)))); -// INSTANTIATE_TEST_CASE_P(AddCPerfTestFluid, AddCPerfTest, -// Combine(Values(szSmall128, szVGA, sz720p, sz1080p), -// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), -// Values(-1, CV_8U, CV_16U, CV_32F), -// Values(cv::compile_args(CORE_FLUID)))); + INSTANTIATE_TEST_CASE_P(AddCPerfTestFluid, AddCPerfTest, + Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(-1, CV_8U, CV_16U, CV_16S, CV_32F), + Values(cv::compile_args(CORE_FLUID)))); INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest, Combine(Values(szSmall128, szVGA, sz720p, sz1080p), diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp index 0b260bf553..d1d6deff2d 100644 --- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp +++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp @@ -20,7 +20,8 @@ INSTANTIATE_TEST_CASE_P(AddPerfTestGPU, AddPerfTest, Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(AddCPerfTestGPU, AddCPerfTest, - Combine(Values( szSmall128, szVGA, sz720p, sz1080p ), + Combine(Values(AbsExact().to_compare_f()), + Values( szSmall128, szVGA, sz720p, sz1080p ), Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), Values( -1, CV_8U, CV_16U, CV_32F ), Values(cv::compile_args(CORE_GPU)))); diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index 7a3d90acc7..16a87ea314 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -645,8 +645,8 @@ CV_ALWAYS_INLINE int sub_simd(const SRC in1[], const SRC in2[], DST out[], int l #endif // CV_SIMD template -static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm arithm, - double scale=1) +static CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2, + Arithm arithm, double scale=1) { static_assert(std::is_same::value, "wrong types"); @@ -844,19 +844,15 @@ GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false) // //-------------------------------------- -static inline v_uint16x8 v_add_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x + y; } static inline v_uint16x8 v_sub_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x - y; } static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return y - x; } -static inline v_float32x4 v_add_32f(const v_float32x4 &x, const v_float32x4 &y) { return x + y; } static inline v_float32x4 v_sub_32f(const v_float32x4 &x, const v_float32x4 &y) { return x - y; } static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y) { return y - x; } -static inline int s_add_8u(uchar x, uchar y) { return x + y; } static inline int s_sub_8u(uchar x, uchar y) { return x - y; } static inline int s_subr_8u(uchar x, uchar y) { return y - x; } -static inline float s_add_32f(float x, float y) { return x + y; } static inline float s_sub_32f(float x, float y) { return x - y; } static inline float s_subr_32f(float x, float y) { return y - x; } @@ -946,11 +942,6 @@ static void run_arithm_s1(uchar out[], const float in[], int width, const float } } -static void run_arithm_s_add3(uchar out[], const uchar in[], int width, const uchar scalar[]) -{ - run_arithm_s3(out, in, width, scalar, v_add_16u, s_add_8u); -} - static void run_arithm_s_sub3(uchar out[], const uchar in[], int width, const uchar scalar[]) { run_arithm_s3(out, in, width, scalar, v_sub_16u, s_sub_8u); @@ -961,11 +952,6 @@ static void run_arithm_s_subr3(uchar out[], const uchar in[], int width, const u run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr } -static void run_arithm_s_add1(uchar out[], const float in[], int width, const float scalar[]) -{ - run_arithm_s1(out, in, width, scalar, v_add_32f, s_add_32f); -} - static void run_arithm_s_sub1(uchar out[], const float in[], int width, const float scalar[]) { run_arithm_s1(out, in, width, scalar, v_sub_32f, s_sub_32f); @@ -1279,8 +1265,8 @@ static void run_absdiffc(Buffer &dst, const View &src, const float scalar[]) } template -static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Arithm arithm, - float scale=1) +CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float scalar[], + Arithm arithm, float scale=1) { const auto *in = src.InLine(0); auto *out = dst.OutLine(); @@ -1288,48 +1274,45 @@ static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Ar int width = dst.length(); int chan = dst.meta().chan; - // What if we cast the scalar into the SRC type? - const SRC myscal[4] = { static_cast(scalar[0]), static_cast(scalar[1]), - static_cast(scalar[2]), static_cast(scalar[3]) }; - bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) && - (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]); - switch (arithm) { case ARITHM_ADD: - if (usemyscal) - { - if (std::is_same::value && - std::is_same::value && - chan == 3) - run_arithm_s_add3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal); - else if (std::is_same::value && - std::is_same::value && - chan == 1) - run_arithm_s_add1((uchar*)out, (const float*)in, width, (const float*)myscal); - else - run_arithm_s(out, in, width, chan, myscal, add); - } - else - run_arithm_s(out, in, width, chan, scalar, add); + { + int w = 0; +#if CV_SIMD + w = addc_simd(in, scalar, out, width, chan); +#endif + + for (; w < width * chan; ++w) + out[w] = add(in[w], scalar[w % chan]); + break; + } case ARITHM_SUBTRACT: + { + // What if we cast the scalar into the SRC type? + const SRC myscal[4] = { static_cast(scalar[0]), static_cast(scalar[1]), + static_cast(scalar[2]), static_cast(scalar[3]) }; + bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) && + (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]); + if (usemyscal) { - if (std::is_same::value && - std::is_same::value && + if (std::is_same::value && + std::is_same::value && chan == 3) run_arithm_s_sub3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal); - else if (std::is_same::value && - std::is_same::value && - chan == 1) + else if (std::is_same::value && + std::is_same::value && + chan == 1) run_arithm_s_sub1((uchar*)out, (const float*)in, width, (const float*)myscal); else - run_arithm_s(out, in, width, chan, myscal, sub); + run_arithm_s(out, in, width, chan, myscal, sub); } else - run_arithm_s(out, in, width, chan, scalar, sub); + run_arithm_s(out, in, width, chan, scalar, sub); break; + } // TODO: optimize miltiplication and division case ARITHM_MULTIPLY: for (int w=0; w < width; w++) @@ -1433,30 +1416,75 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true) } }; -GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, false) +GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true) { static const int Window = 1; - static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst) + static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst, Buffer &scratch) { - const float scalar[4] = { - static_cast(_scalar[0]), - static_cast(_scalar[1]), - static_cast(_scalar[2]), - static_cast(_scalar[3]) - }; + GAPI_Assert(src.meta().chan <= 4); + + if (dst.y() == 0) + { + const int chan = src.meta().chan; + float* sc = scratch.OutLine(); + + for (int i = 0; i < scratch.length(); ++i) + sc[i] = static_cast(_scalar[i % chan]); + } + + const float* scalar = scratch.OutLine(); // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ADD); - UNARY_(uchar , short, run_arithm_s, dst, src, scalar, ARITHM_ADD); - UNARY_(uchar , float, run_arithm_s, dst, src, scalar, ARITHM_ADD); - UNARY_( short, short, run_arithm_s, dst, src, scalar, ARITHM_ADD); - UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_ADD); - UNARY_( float, short, run_arithm_s, dst, src, scalar, ARITHM_ADD); - UNARY_( float, float, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(uchar, uchar, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(uchar, ushort, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(uchar, short, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(uchar, float, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(ushort, short, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(ushort, uchar, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(ushort, float, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(short, short, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(short, ushort, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(short, uchar, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(short, float, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(float, uchar, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(float, ushort, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(float, short, run_arithm_s, dst, src, scalar, ARITHM_ADD); + UNARY_(float, float, run_arithm_s, dst, src, scalar, ARITHM_ADD); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } + + static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch) + { +#if CV_SIMD + // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector. + constexpr int maxNlanes = 16; + + // +2 is offset for 3-channel case. + // Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case. + // Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...} + // The first scalar SIMD vector should looks like: + // C1 C2 C3 C1 + // The second: + // C2 C3 C1 C2 + // The third: + // C3 C1 C2 C3 + constexpr int offset = 2; + constexpr int buflen = maxNlanes + offset; +#else + constexpr int buflen = 4; +#endif + cv::Size bufsize(buflen, 1); + GMatDesc bufdesc = { CV_32F, 1, bufsize }; + Buffer buffer(bufdesc); + scratch = std::move(buffer); + } + + static void resetScratch(Buffer& /* scratch */) + { + } }; GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, false) diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp index 297c065427..b6842e2390 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -85,6 +85,33 @@ MUL_SIMD(float, float) #undef MUL_SIMD +#define ADDC_SIMD(SRC, DST) \ +int addc_simd(const SRC in[], const float scalar[], DST out[], \ + const int width, const int chan) \ +{ \ + CV_CPU_DISPATCH(addc_simd, (in, scalar, out, width, chan), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + +ADDC_SIMD(uchar, uchar) +ADDC_SIMD(ushort, uchar) +ADDC_SIMD(short, uchar) +ADDC_SIMD(float, uchar) +ADDC_SIMD(short, short) +ADDC_SIMD(ushort, short) +ADDC_SIMD(uchar, short) +ADDC_SIMD(float, short) +ADDC_SIMD(ushort, ushort) +ADDC_SIMD(uchar, ushort) +ADDC_SIMD(short, ushort) +ADDC_SIMD(float, ushort) +ADDC_SIMD(uchar, float) +ADDC_SIMD(ushort, float) +ADDC_SIMD(short, float) +ADDC_SIMD(float, float) + +#undef ADDC_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp index 3ae41c6aef..ba48f7a621 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -60,6 +60,29 @@ MUL_SIMD(float, float) #undef MUL_SIMD +#define ADDC_SIMD(SRC, DST) \ +int addc_simd(const SRC in[], const float scalar[], DST out[], \ + const int width, const int chan); + +ADDC_SIMD(uchar, uchar) +ADDC_SIMD(ushort, uchar) +ADDC_SIMD(short, uchar) +ADDC_SIMD(float, uchar) +ADDC_SIMD(short, short) +ADDC_SIMD(ushort, short) +ADDC_SIMD(uchar, short) +ADDC_SIMD(float, short) +ADDC_SIMD(ushort, ushort) +ADDC_SIMD(uchar, ushort) +ADDC_SIMD(short, ushort) +ADDC_SIMD(float, ushort) +ADDC_SIMD(uchar, float) +ADDC_SIMD(ushort, float) +ADDC_SIMD(short, float) +ADDC_SIMD(float, float) + +#undef ADDC_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp index 5139d54745..071c05633a 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp @@ -81,6 +81,29 @@ MUL_SIMD(float, float) #undef MUL_SIMD +#define ADDC_SIMD(SRC, DST) \ +int addc_simd(const SRC in[], const float scalar[], DST out[], \ + const int width, const int chan); + +ADDC_SIMD(uchar, uchar) +ADDC_SIMD(ushort, uchar) +ADDC_SIMD(short, uchar) +ADDC_SIMD(float, uchar) +ADDC_SIMD(short, short) +ADDC_SIMD(ushort, short) +ADDC_SIMD(uchar, short) +ADDC_SIMD(float, short) +ADDC_SIMD(ushort, ushort) +ADDC_SIMD(uchar, ushort) +ADDC_SIMD(short, ushort) +ADDC_SIMD(float, ushort) +ADDC_SIMD(uchar, float) +ADDC_SIMD(ushort, float) +ADDC_SIMD(short, float) +ADDC_SIMD(float, float) + +#undef ADDC_SIMD + #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY struct scale_tag {}; @@ -95,6 +118,7 @@ using vector_type_of_t = typename vector_type_of::type; template<> struct vector_type_of { using type = v_uint8; }; template<> struct vector_type_of { using type = v_uint16; }; template<> struct vector_type_of { using type = v_int16; }; +template<> struct vector_type_of { using type = v_float32; }; CV_ALWAYS_INLINE v_float32 vg_load_f32(const float* in) { @@ -136,12 +160,12 @@ CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_flo return a / div; } -CV_ALWAYS_INLINE void v_store_i16(short* dst, v_int32& res1, v_int32& res2) +CV_ALWAYS_INLINE void v_store_i16(short* dst, const v_int32& res1, const v_int32& res2) { vx_store(dst, v_pack(res1, res2)); } -CV_ALWAYS_INLINE void v_store_i16(ushort* dst, v_int32& res1, v_int32& res2) +CV_ALWAYS_INLINE void v_store_i16(ushort* dst, const v_int32& res1, const v_int32& res2) { vx_store(dst, v_pack_u(res1, res2)); } @@ -821,6 +845,243 @@ MUL_SIMD(float, float) #undef MUL_SIMD +//------------------------- +// +// Fluid kernels: AddC +// +//------------------------- + +CV_ALWAYS_INLINE void addc_pack_store_c3(short* outx, const v_int32& c1, + const v_int32& c2, const v_int32& c3, + const v_int32& c4, const v_int32& c5, + const v_int32& c6) +{ + constexpr int nlanes = v_int16::nlanes; + vx_store(outx, v_pack(c1, c2)); + vx_store(&outx[nlanes], v_pack(c3, c4)); + vx_store(&outx[2*nlanes], v_pack(c5, c6)); +} + +CV_ALWAYS_INLINE void addc_pack_store_c3(ushort* outx, const v_int32& c1, + const v_int32& c2, const v_int32& c3, + const v_int32& c4, const v_int32& c5, + const v_int32& c6) +{ + constexpr int nlanes = v_uint16::nlanes; + vx_store(outx, v_pack_u(c1, c2)); + vx_store(&outx[nlanes], v_pack_u(c3, c4)); + vx_store(&outx[2*nlanes], v_pack_u(c5, c6)); +} + +template +CV_ALWAYS_INLINE +typename std::enable_if<(std::is_same::value || + std::is_same::value), void>::type +addc_simd_common_impl(const SRC* inx, DST* outx, const v_float32& sc, const int nlanes) +{ + v_float32 a1 = vg_load_f32(inx); + v_float32 a2 = vg_load_f32(&inx[nlanes/2]); + + v_store_i16(outx, v_round(a1 + sc), v_round(a2 + sc)); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, uchar* outx, const v_float32& sc, const int nlanes) +{ + v_float32 a1 = vg_load_f32(inx); + v_float32 a2 = vg_load_f32(&inx[nlanes/4]); + v_float32 a3 = vg_load_f32(&inx[nlanes/2]); + v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]); + + vx_store(outx, v_pack_u(v_pack(v_round(a1 + sc), + v_round(a2 + sc)), + v_pack(v_round(a3 + sc), + v_round(a4 + sc)))); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, float* outx, const v_float32& sc, const int) +{ + v_float32 a1 = vg_load_f32(inx); + vx_store(outx, a1 + sc); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE +typename std::enable_if::value || + std::is_same::value, void>::type +addc_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2, + const v_float32& s3, const int nlanes) +{ + v_float32 a1 = vg_load_f32(inx); + v_float32 a2 = vg_load_f32(&inx[nlanes / 2]); + v_float32 a3 = vg_load_f32(&inx[nlanes]); + v_float32 a4 = vg_load_f32(&inx[3 * nlanes / 2]); + v_float32 a5 = vg_load_f32(&inx[2 * nlanes]); + v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]); + + addc_pack_store_c3(outx, v_round(a1 + s1), + v_round(a2 + s2), + v_round(a3 + s3), + v_round(a4 + s1), + v_round(a5 + s2), + v_round(a6 + s3)); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* inx, uchar* outx, + const v_float32& s1, const v_float32& s2, + const v_float32& s3, const int nlanes) +{ + vx_store(outx, + v_pack_u(v_pack(v_round(vg_load_f32(inx) + s1), + v_round(vg_load_f32(&inx[nlanes/4]) + s2)), + v_pack(v_round(vg_load_f32(&inx[nlanes/2]) + s3), + v_round(vg_load_f32(&inx[3*nlanes/4]) + s1)))); + + vx_store(&outx[nlanes], + v_pack_u(v_pack(v_round(vg_load_f32(&inx[nlanes]) + s2), + v_round(vg_load_f32(&inx[5*nlanes/4]) + s3)), + v_pack(v_round(vg_load_f32(&inx[3*nlanes/2]) + s1), + v_round(vg_load_f32(&inx[7*nlanes/4]) + s2)))); + + vx_store(&outx[2 * nlanes], + v_pack_u(v_pack(v_round(vg_load_f32(&inx[2*nlanes]) + s3), + v_round(vg_load_f32(&inx[9*nlanes/4]) + s1)), + v_pack(v_round(vg_load_f32(&inx[5*nlanes/2]) + s2), + v_round(vg_load_f32(&inx[11*nlanes/4]) + s3)))); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* in, float* out, + const v_float32& s1, const v_float32& s2, + const v_float32& s3, const int nlanes) +{ + v_float32 a1 = vg_load_f32(in); + v_float32 a2 = vg_load_f32(&in[nlanes]); + v_float32 a3 = vg_load_f32(&in[2*nlanes]); + + vx_store(out, a1 + s1); + vx_store(&out[nlanes], a2 + s2); + vx_store(&out[2*nlanes], a3 + s3); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[], const int length) +{ + constexpr int chan = 3; + constexpr int nlanes = vector_type_of_t::nlanes; + constexpr int lanes = chan * nlanes; + + if (length < lanes) + return 0; + + v_float32 s1 = vx_load(scalar); +#if CV_SIMD_WIDTH == 32 + v_float32 s2 = vx_load(&scalar[2]); + v_float32 s3 = vx_load(&scalar[1]); +#else + v_float32 s2 = vx_load(&scalar[1]); + v_float32 s3 = vx_load(&scalar[2]); +#endif + + int x = 0; + for (;;) + { + for (; x <= length - lanes; x += lanes) + { + addc_simd_c3_impl(&in[x], &out[x], s1, s2, s3, nlanes); + } + + if (x < length) + { + x = length - lanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +template +CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST out[], const int length) +{ + constexpr int nlanes = vector_type_of_t::nlanes; + + if (length < nlanes) + return 0; + + v_float32 sc = vx_load(scalar); + + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + addc_simd_common_impl(&in[x], &out[x], sc, nlanes); + } + + if (x < length) + { + x = length - nlanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +#define ADDC_SIMD(SRC, DST) \ +int addc_simd(const SRC in[], const float scalar[], DST out[], \ + const int width, const int chan) \ +{ \ + const int length = width * chan; \ + switch (chan) \ + { \ + case 1: \ + case 2: \ + case 4: \ + return addc_simd_common(in, scalar, out, length); \ + case 3: \ + return addc_simd_c3(in, scalar, out, length); \ + default: \ + GAPI_Assert(chan <= 4); \ + break; \ + } \ + return 0; \ +} + +ADDC_SIMD(uchar, uchar) +ADDC_SIMD(ushort, uchar) +ADDC_SIMD(short, uchar) +ADDC_SIMD(float, uchar) +ADDC_SIMD(short, short) +ADDC_SIMD(ushort, short) +ADDC_SIMD(uchar, short) +ADDC_SIMD(float, short) +ADDC_SIMD(ushort, ushort) +ADDC_SIMD(uchar, ushort) +ADDC_SIMD(short, ushort) +ADDC_SIMD(float, ushort) +ADDC_SIMD(uchar, float) +ADDC_SIMD(ushort, float) +ADDC_SIMD(short, float) +ADDC_SIMD(float, float) + +#undef ADDC_SIMD + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END -- GitLab