From 369b260e1208f6f182c6c1da4dac9c6ed30950fa Mon Sep 17 00:00:00 2001
From: Anna Khakimova
Date: Thu, 2 Dec 2021 00:58:30 +0300
Subject: [PATCH] Merge pull request #21158 from anna-khakimova:ak/simd_subC

* GAPI Fluid: SIMD for SubC kernel.

* Applied comments
---
 .../gapi/perf/common/gapi_core_perf_tests.hpp |   2 +-
 .../perf/common/gapi_core_perf_tests_inl.hpp  |  16 +-
 .../perf/cpu/gapi_core_perf_tests_cpu.cpp     |   3 +-
 .../perf/cpu/gapi_core_perf_tests_fluid.cpp   |  11 +-
 .../perf/gpu/gapi_core_perf_tests_gpu.cpp     |   3 +-
 .../gapi/src/backends/fluid/gfluidcore.cpp    | 152 +++++------
 .../fluid/gfluidcore_func.dispatch.cpp        |  32 ++-
 .../src/backends/fluid/gfluidcore_func.hpp    |  25 +-
 .../backends/fluid/gfluidcore_func.simd.hpp   | 237 ++++++++++++------
 9 files changed, 311 insertions(+), 170 deletions(-)

diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
index 39557f1acb..97b12f86b1 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -30,7 +30,7 @@ namespace opencv_test
     class AddPerfTest : public TestPerfParams> {};
     class AddCPerfTest : public TestPerfParams> {};
     class SubPerfTest : public TestPerfParams> {};
-    class SubCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class SubCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
     class SubRCPerfTest : public TestPerfParams> {};
     class MulPerfTest : public TestPerfParams> {};
     class MulDoublePerfTest : public TestPerfParams> {};
diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
index b0568f9bae..6c286a5ce2 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -138,10 +138,13 @@ PERF_TEST_P_(SubPerfTest, TestPerformance)
 
 PERF_TEST_P_(SubCPerfTest, TestPerformance)
 {
-    Size sz = get<0>(GetParam());
-    MatType type = get<1>(GetParam());
-    int dtype = get<2>(GetParam());
-    cv::GCompileArgs compile_args = get<3>(GetParam());
+    compare_f cmpF;
+    cv::Size sz;
+    MatType type = -1;
+    int dtype = -1;
+    cv::GCompileArgs compile_args;
+
+    std::tie(cmpF, sz, type, dtype, compile_args) = GetParam();
 
     initMatsRandU(type, sz, dtype, false);
 
@@ -165,8 +168,9 @@ PERF_TEST_P_(SubCPerfTest, TestPerformance)
     }
 
     // Comparison ////////////////////////////////////////////////////////////
-    // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
-    EXPECT_EQ(out_mat_gapi.size(), sz);
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }
 
     SANITY_CHECK_NOTHING();
 }
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
index d7ead88327..31e9d25610 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
@@ -35,7 +35,8 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestCPU, SubPerfTest,
         Values(cv::compile_args(CORE_CPU))));
 
 INSTANTIATE_TEST_CASE_P(SubCPerfTestCPU, SubCPerfTest,
-    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
         Values(-1, CV_8U, CV_16U, CV_32F),
         Values(cv::compile_args(CORE_CPU))));
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
index a367937520..6ebd92dc4a 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -31,11 +31,12 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
         Values(-1, CV_8U, CV_32F),
         Values(cv::compile_args(CORE_FLUID))));
 
-// INSTANTIATE_TEST_CASE_P(SubCPerfTestFluid, SubCPerfTest,
-//     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-//         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-//         Values(-1, CV_8U, CV_16U, CV_32F),
-//         Values(cv::compile_args(CORE_FLUID))));
+INSTANTIATE_TEST_CASE_P(SubCPerfTestFluid, SubCPerfTest,
+    Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
+        Values(cv::compile_args(CORE_FLUID))));
 
 // INSTANTIATE_TEST_CASE_P(SubRCPerfTestFluid, SubRCPerfTest,
 //     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
index d1d6deff2d..b4207c266d 100644
--- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
+++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
@@ -33,7 +33,8 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestGPU, SubPerfTest,
         Values(cv::compile_args(CORE_GPU))));
 
 INSTANTIATE_TEST_CASE_P(SubCPerfTestGPU, SubCPerfTest,
-    Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+    Combine(Values(AbsExact().to_compare_f()),
+        Values( szSmall128, szVGA, sz720p, sz1080p ),
         Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
         Values( -1, CV_8U, CV_16U, CV_32F ),
         Values(cv::compile_args(CORE_GPU))));
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index 16a87ea314..a737ad627b 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -844,16 +844,12 @@ GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false)
 //
 //--------------------------------------
 
-static inline v_uint16x8  v_sub_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x - y; }
 static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return y - x; }
 
-static inline v_float32x4  v_sub_32f(const v_float32x4 &x, const v_float32x4 &y) { return x - y; }
 static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y) { return y - x; }
 
-static inline int  s_sub_8u(uchar x, uchar y) { return x - y; }
 static inline int s_subr_8u(uchar x, uchar y) { return y - x; }
 
-static inline float  s_sub_32f(float x, float y) { return x - y; }
 static inline float s_subr_32f(float x, float y) { return y - x; }
 
 // manual SIMD if important case 8UC3
@@ -942,21 +938,11 @@ static void run_arithm_s1(uchar out[], const float in[], int width, const float
     }
 }
 
-static void run_arithm_s_sub3(uchar out[], const uchar in[], int width, const uchar scalar[])
-{
-    run_arithm_s3(out, in, width, scalar, v_sub_16u, s_sub_8u);
-}
-
 static void run_arithm_s_subr3(uchar out[], const uchar in[], int width, const uchar scalar[])
 {
     run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr
 }
 
-static void run_arithm_s_sub1(uchar out[], const float in[], int width, const float scalar[])
-{
-    run_arithm_s1(out, in, width, scalar, v_sub_32f, s_sub_32f);
-}
-
 static void run_arithm_s_subr1(uchar out[], const float in[], int width, const float scalar[])
 {
     run_arithm_s1(out, in, width, scalar, v_subr_32f, s_subr_32f); // reverse: subr
 }
@@ -1273,6 +1259,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
 
     int width = dst.length();
     int chan  = dst.meta().chan;
+    const int length = width * chan;
 
     switch (arithm)
     {
@@ -1280,37 +1267,21 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
     case ARITHM_ADD:
     {
         int w = 0;
 #if CV_SIMD
-        w = addc_simd(in, scalar, out, width, chan);
+        w = addc_simd(in, scalar, out, length, chan);
 #endif
-
-        for (; w < width * chan; ++w)
+        for (; w < length; ++w)
             out[w] = add<DST>(in[w], scalar[w % chan]);
 
         break;
     }
     case ARITHM_SUBTRACT:
     {
-        // What if we cast the scalar into the SRC type?
-        const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
-                                static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
-        bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
-                         (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
-
-        if (usemyscal)
-        {
-            if (std::is_same<DST, uchar>::value &&
-                std::is_same<SRC, uchar>::value &&
-                chan == 3)
-                run_arithm_s_sub3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
-            else if (std::is_same<DST, uchar>::value &&
-                     std::is_same<SRC, float>::value &&
-                     chan == 1)
-                run_arithm_s_sub1((uchar*)out, (const float*)in, width, (const float*)myscal);
-            else
-                run_arithm_s(out, in, width, chan, myscal, sub);
-        }
-        else
-            run_arithm_s(out, in, width, chan, scalar, sub);
+        int w = 0;
+#if CV_SIMD
+        w = subc_simd(in, scalar, out, length, chan);
+#endif
+        for (; w < length; ++w)
+            out[w] = sub<DST>(in[w], scalar[w % chan]);
 
         break;
     }
 
     // TODO: optimize miltiplication and division
@@ -1416,6 +1387,32 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
     }
 };
 
+CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
+{
+#if CV_SIMD
+    // 512 bits / 32 bits = 16 elements of float32 fit in an AVX-512 SIMD vector.
+    constexpr int maxNlanes = 16;
+
+    // +2 is an offset for the 3-channel case.
+    // The offset is needed to load the coefficients correctly from the scalar array into SIMD vectors in the 3-channel case.
+    // The scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
+    // The first scalar SIMD vector should look like:
+    // C1 C2 C3 C1
+    // The second:
+    // C2 C3 C1 C2
+    // The third:
+    // C3 C1 C2 C3
+    constexpr int offset = 2;
+    constexpr int buflen = maxNlanes + offset;
+#else
+    constexpr int buflen = 4;
+#endif
+    cv::Size bufsize(buflen, 1);
+    GMatDesc bufdesc = { CV_32F, 1, bufsize };
+    Buffer buffer(bufdesc);
+    scratch = std::move(buffer);
+}
+
 GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
 {
     static const int Window = 1;
@@ -1458,59 +1455,62 @@ GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
 
     static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch)
     {
-#if CV_SIMD
-        // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector.
-        constexpr int maxNlanes = 16;
-
-        // +2 is offset for 3-channel case.
-        // Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case.
-        // Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
-        // The first scalar SIMD vector should looks like:
-        // C1 C2 C3 C1
-        // The second:
-        // C2 C3 C1 C2
-        // The third:
-        // C3 C1 C2 C3
-        constexpr int offset = 2;
-        constexpr int buflen = maxNlanes + offset;
-#else
-        constexpr int buflen = 4;
-#endif
-        cv::Size bufsize(buflen, 1);
-        GMatDesc bufdesc = { CV_32F, 1, bufsize };
-        Buffer buffer(bufdesc);
-        scratch = std::move(buffer);
+        initScratchBuffer(scratch);
     }
 
-    static void resetScratch(Buffer& /* scratch */)
+    static void resetScratch(Buffer& /*scratch*/)
     {
     }
 };
 
-GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, false)
+GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, true)
 {
     static const int Window = 1;
 
-    static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
+    static void run(const View& src, const cv::Scalar& _scalar, int /*dtype*/, Buffer& dst, Buffer& scratch)
     {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
+        GAPI_Assert(src.meta().chan <= 4);
+
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float* sc = scratch.OutLine<float>();
+
+            for (int i = 0; i < scratch.length(); ++i)
+                sc[i] = static_cast<float>(_scalar[i % chan]);
+        }
+
+        const float* scalar = scratch.OutLine<float>();
 
         //     DST     SRC     OP            __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar , short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar , float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( short, short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, uchar, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, uchar, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, uchar, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
+
+    static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch)
+    {
+        initScratchBuffer(scratch);
+    }
+
+    static void resetScratch(Buffer& /*scratch*/)
+    {
+    }
 };
 
 GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, false)
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index b6842e2390..668ac3a4bb 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -65,7 +65,6 @@ int mul_simd(const SRC in1[], const SRC in2[], DST out[],                  \
                     CV_CPU_DISPATCH_MODES_ALL);                             \
 }
 
-
 MUL_SIMD(uchar, uchar)
 MUL_SIMD(ushort, uchar)
 MUL_SIMD(short, uchar)
@@ -87,9 +86,9 @@ MUL_SIMD(float, float)
 
 #define ADDC_SIMD(SRC, DST)                                                 \
 int addc_simd(const SRC in[], const float scalar[], DST out[],              \
-              const int width, const int chan)                              \
+              const int length, const int chan)                             \
 {                                                                           \
-    CV_CPU_DISPATCH(addc_simd, (in, scalar, out, width, chan),              \
+    CV_CPU_DISPATCH(addc_simd, (in, scalar, out, length, chan),             \
                     CV_CPU_DISPATCH_MODES_ALL);                             \
 }
 
@@ -112,6 +111,33 @@ ADDC_SIMD(float, float)
 
 #undef ADDC_SIMD
 
+#define SUBC_SIMD(SRC, DST)                                                 \
+int subc_simd(const SRC in[], const float scalar[], DST out[],              \
+              const int length, const int chan)                             \
+{                                                                           \
+    CV_CPU_DISPATCH(subc_simd, (in, scalar, out, length, chan),             \
+                    CV_CPU_DISPATCH_MODES_ALL);                             \
+}
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index ba48f7a621..e6c0d4fe9b 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -62,7 +62,7 @@ MUL_SIMD(float, float)
 
 #define ADDC_SIMD(SRC, DST)                                                 \
 int addc_simd(const SRC in[], const float scalar[], DST out[],              \
-              const int width, const int chan);
+              const int length, const int chan);
 
 ADDC_SIMD(uchar, uchar)
 ADDC_SIMD(ushort, uchar)
@@ -83,6 +83,29 @@ ADDC_SIMD(float, float)
 
 #undef ADDC_SIMD
 
+#define SUBC_SIMD(SRC, DST)                                                 \
+int subc_simd(const SRC in[], const float scalar[], DST out[],              \
+              const int length, const int chan);
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index 071c05633a..aed5359e7b 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -83,7 +83,7 @@ MUL_SIMD(float, float)
 
 #define ADDC_SIMD(SRC, DST)                                                 \
 int addc_simd(const SRC in[], const float scalar[], DST out[],              \
-              const int width, const int chan);
+              const int length, const int chan);
 
 ADDC_SIMD(uchar, uchar)
 ADDC_SIMD(ushort, uchar)
@@ -104,6 +104,29 @@ ADDC_SIMD(float, float)
 
 #undef ADDC_SIMD
 
+#define SUBC_SIMD(SRC, DST)                                                 \
+int subc_simd(const SRC in[], const float scalar[], DST out[],              \
+              const int length, const int chan);
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 struct scale_tag {};
@@ -851,10 +874,13 @@ MUL_SIMD(float, float)
 //
 //-------------------------
 
-CV_ALWAYS_INLINE void addc_pack_store_c3(short* outx, const v_int32& c1,
-                                         const v_int32& c2, const v_int32& c3,
-                                         const v_int32& c4, const v_int32& c5,
-                                         const v_int32& c6)
+struct add_tag {};
+struct sub_tag {};
+
+CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1,
+                                                   const v_int32& c2, const v_int32& c3,
+                                                   const v_int32& c4, const v_int32& c5,
+                                                   const v_int32& c6)
 {
     constexpr int nlanes = v_int16::nlanes;
     vx_store(outx, v_pack(c1, c2));
@@ -862,10 +888,10 @@ CV_ALWAYS_INLINE void addc_pack_store_c3(short* outx, const v_int32& c1,
     vx_store(&outx[nlanes], v_pack(c3, c4));
     vx_store(&outx[2*nlanes], v_pack(c5, c6));
 }
 
-CV_ALWAYS_INLINE void addc_pack_store_c3(ushort* outx, const v_int32& c1,
-                                         const v_int32& c2, const v_int32& c3,
-                                         const v_int32& c4, const v_int32& c5,
-                                         const v_int32& c6)
+CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx, const v_int32& c1,
+                                                   const v_int32& c2, const v_int32& c3,
+                                                   const v_int32& c4, const v_int32& c5,
+                                                   const v_int32& c6)
 {
     constexpr int nlanes = v_uint16::nlanes;
     vx_store(outx, v_pack_u(c1, c2));
@@ -873,50 +899,64 @@ CV_ALWAYS_INLINE void addc_pack_store_c3(ushort* outx, const v_int32& c1,
     vx_store(&outx[nlanes], v_pack_u(c3, c4));
     vx_store(&outx[2*nlanes], v_pack_u(c5, c6));
 }
 
-template<typename SRC, typename DST>
+CV_ALWAYS_INLINE v_float32 oper(add_tag, const v_float32& a, const v_float32& sc)
+{
+    return a + sc;
+}
+
+CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc)
+{
+    return a - sc;
+}
+
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<(std::is_same<DST, ushort>::value || std::is_same<DST, short>::value), void>::type
-addc_simd_common_impl(const SRC* inx, DST* outx, const v_float32& sc, const int nlanes)
+arithmOpScalar_simd_common_impl(oper_tag t, const SRC* inx, DST* outx,
+                                const v_float32& sc, const int nlanes)
 {
     v_float32 a1 = vg_load_f32(inx);
     v_float32 a2 = vg_load_f32(&inx[nlanes/2]);
 
-    v_store_i16(outx, v_round(a1 + sc), v_round(a2 + sc));
+    v_store_i16(outx, v_round(oper(t, a1, sc)), v_round(oper(t, a2, sc)));
 }
 
 //-------------------------------------------------------------------------------------------------
 
-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, uchar* outx, const v_float32& sc, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_common_impl(oper_tag t, const SRC* inx,
+                                                      uchar* outx, const v_float32& sc,
+                                                      const int nlanes)
 {
     v_float32 a1 = vg_load_f32(inx);
     v_float32 a2 = vg_load_f32(&inx[nlanes/4]);
     v_float32 a3 = vg_load_f32(&inx[nlanes/2]);
     v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]);
 
-    vx_store(outx, v_pack_u(v_pack(v_round(a1 + sc),
-                                   v_round(a2 + sc)),
-                            v_pack(v_round(a3 + sc),
-                                   v_round(a4 + sc))));
+    vx_store(outx, v_pack_u(v_pack(v_round(oper(t, a1, sc)),
+                                   v_round(oper(t, a2, sc))),
+                            v_pack(v_round(oper(t, a3, sc)),
+                                   v_round(oper(t, a4, sc)))));
 }
 
 //-------------------------------------------------------------------------------------------------
 
-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, float* outx, const v_float32& sc, const int)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_common_impl(oper_tag t, const SRC* inx,
+                                                      float* outx, const v_float32& sc, const int)
 {
     v_float32 a1 = vg_load_f32(inx);
 
-    vx_store(outx, a1 + sc);
+    vx_store(outx, oper(t, a1, sc));
 }
 
 //-------------------------------------------------------------------------------------------------
 
-template<typename SRC, typename DST>
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value || std::is_same<DST, ushort>::value, void>::type
-addc_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
-                  const v_float32& s3, const int nlanes)
+arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
+                            const v_float32& s3, const int nlanes)
 {
     v_float32 a1 = vg_load_f32(inx);
@@ -926,60 +966,62 @@ addc_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float3
     v_float32 a5 = vg_load_f32(&inx[2 * nlanes]);
     v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]);
 
-    addc_pack_store_c3(outx, v_round(a1 + s1),
-                       v_round(a2 + s2),
-                       v_round(a3 + s3),
-                       v_round(a4 + s1),
-                       v_round(a5 + s2),
-                       v_round(a6 + s3));
+    arithmOpScalar_pack_store_c3(outx, v_round(oper(t, a1, s1)),
+                                 v_round(oper(t, a2, s2)),
+                                 v_round(oper(t, a3, s3)),
+                                 v_round(oper(t, a4, s1)),
+                                 v_round(oper(t, a5, s2)),
+                                 v_round(oper(t, a6, s3)));
 }
 
 //-------------------------------------------------------------------------------------------------
 
-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* inx, uchar* outx,
-                                        const v_float32& s1, const v_float32& s2,
-                                        const v_float32& s3, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, uchar* outx,
+                                                  const v_float32& s1, const v_float32& s2,
+                                                  const v_float32& s3, const int nlanes)
 {
     vx_store(outx,
-             v_pack_u(v_pack(v_round(vg_load_f32(inx) + s1),
-                             v_round(vg_load_f32(&inx[nlanes/4]) + s2)),
-                      v_pack(v_round(vg_load_f32(&inx[nlanes/2]) + s3),
-                             v_round(vg_load_f32(&inx[3*nlanes/4]) + s1))));
+             v_pack_u(v_pack(v_round(oper(t, vg_load_f32(inx), s1)),
+                             v_round(oper(t, vg_load_f32(&inx[nlanes/4]), s2))),
+                      v_pack(v_round(oper(t, vg_load_f32(&inx[nlanes/2]), s3)),
+                             v_round(oper(t, vg_load_f32(&inx[3*nlanes/4]), s1)))));
 
     vx_store(&outx[nlanes],
-             v_pack_u(v_pack(v_round(vg_load_f32(&inx[nlanes]) + s2),
-                             v_round(vg_load_f32(&inx[5*nlanes/4]) + s3)),
-                      v_pack(v_round(vg_load_f32(&inx[3*nlanes/2]) + s1),
-                             v_round(vg_load_f32(&inx[7*nlanes/4]) + s2))));
+             v_pack_u(v_pack(v_round(oper(t, vg_load_f32(&inx[nlanes]), s2)),
+                             v_round(oper(t, vg_load_f32(&inx[5*nlanes/4]), s3))),
+                      v_pack(v_round(oper(t, vg_load_f32(&inx[3*nlanes/2]), s1)),
+                             v_round(oper(t, vg_load_f32(&inx[7*nlanes/4]), s2)))));
 
     vx_store(&outx[2 * nlanes],
-             v_pack_u(v_pack(v_round(vg_load_f32(&inx[2*nlanes]) + s3),
-                             v_round(vg_load_f32(&inx[9*nlanes/4]) + s1)),
-                      v_pack(v_round(vg_load_f32(&inx[5*nlanes/2]) + s2),
-                             v_round(vg_load_f32(&inx[11*nlanes/4]) + s3))));
+             v_pack_u(v_pack(v_round(oper(t, vg_load_f32(&inx[2*nlanes]), s3)),
+                             v_round(oper(t, vg_load_f32(&inx[9*nlanes/4]), s1))),
+                      v_pack(v_round(oper(t, vg_load_f32(&inx[5*nlanes/2]), s2)),
+                             v_round(oper(t, vg_load_f32(&inx[11*nlanes/4]), s3)))));
 }
 
 //-------------------------------------------------------------------------------------------------
 
-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* in, float* out,
-                                        const v_float32& s1, const v_float32& s2,
-                                        const v_float32& s3, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* in, float* out,
+                                                  const v_float32& s1, const v_float32& s2,
+                                                  const v_float32& s3, const int nlanes)
 {
     v_float32 a1 = vg_load_f32(in);
     v_float32 a2 = vg_load_f32(&in[nlanes]);
     v_float32 a3 = vg_load_f32(&in[2*nlanes]);
 
-    vx_store(out, a1 + s1);
-    vx_store(&out[nlanes], a2 + s2);
-    vx_store(&out[2*nlanes], a3 + s3);
+    vx_store(out, oper(t, a1, s1));
+    vx_store(&out[nlanes], oper(t, a2, s2));
+    vx_store(&out[2*nlanes], oper(t, a3, s3));
 }
 
 //-------------------------------------------------------------------------------------------------
 
-template<typename SRC, typename DST>
-CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[], const int length)
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOpScalar_simd_c3(oper_tag t, const SRC in[],
+                                            const float scalar[], DST out[],
+                                            const int length)
 {
     constexpr int chan = 3;
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;
@@ -1002,7 +1044,7 @@ CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[
     {
         for (; x <= length - lanes; x += lanes)
         {
-            addc_simd_c3_impl(&in[x], &out[x], s1, s2, s3, nlanes);
+            arithmOpScalar_simd_c3_impl(t, &in[x], &out[x], s1, s2, s3, nlanes);
         }
 
         if (x < length)
@@ -1015,8 +1057,12 @@ CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[
     return x;
 }
 
-template<typename SRC, typename DST>
-CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST out[], const int length)
+//-------------------------------------------------------------------------------------------------
+
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[],
+                                                const float scalar[], DST out[],
+                                                const int length)
 {
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;
@@ -1030,7 +1076,7 @@ CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST
     {
         for (; x <= length - nlanes; x += nlanes)
         {
-            addc_simd_common_impl(&in[x], &out[x], sc, nlanes);
+            arithmOpScalar_simd_common_impl(t, &in[x], &out[x], sc, nlanes);
         }
 
         if (x < length)
@@ -1043,24 +1089,25 @@ CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST
         {
             x = length - nlanes;
            continue;
         }
         break;
     }
 
     return x;
 }
 
-#define ADDC_SIMD(SRC, DST)                                                     \
-int addc_simd(const SRC in[], const float scalar[], DST out[],                  \
-              const int width, const int chan)                                  \
-{                                                                               \
-    const int length = width * chan;                                            \
-    switch (chan)                                                               \
-    {                                                                           \
-    case 1:                                                                     \
-    case 2:                                                                     \
-    case 4:                                                                     \
-        return addc_simd_common(in, scalar, out, length);                       \
-    case 3:                                                                     \
-        return addc_simd_c3(in, scalar, out, length);                           \
-    default:                                                                    \
-        GAPI_Assert(chan <= 4);                                                 \
-        break;                                                                  \
-    }                                                                           \
-    return 0;                                                                   \
+
+
+#define ADDC_SIMD(SRC, DST)                                                           \
+int addc_simd(const SRC in[], const float scalar[], DST out[],                        \
+              const int length, const int chan)                                       \
+{                                                                                     \
+    switch (chan)                                                                     \
+    {                                                                                 \
+    case 1:                                                                           \
+    case 2:                                                                           \
+    case 4:                                                                           \
+        return arithmOpScalar_simd_common(add_tag{}, in, scalar, out, length);        \
+    case 3:                                                                           \
+        return arithmOpScalar_simd_c3(add_tag{}, in, scalar, out, length);            \
+    default:                                                                          \
+        GAPI_Assert(chan <= 4);                                                       \
+        break;                                                                        \
+    }                                                                                 \
+    return 0;                                                                         \
 }
 
 ADDC_SIMD(uchar, uchar)
@@ -1082,6 +1129,44 @@ ADDC_SIMD(float, float)
 
 #undef ADDC_SIMD
 
+#define SUBC_SIMD(SRC, DST)                                                           \
+int subc_simd(const SRC in[], const float scalar[], DST out[],                        \
+              const int length, const int chan)                                       \
+{                                                                                     \
+    switch (chan)                                                                     \
+    {                                                                                 \
+    case 1:                                                                           \
+    case 2:                                                                           \
+    case 4:                                                                           \
+        return arithmOpScalar_simd_common(sub_tag{}, in, scalar, out, length);        \
+    case 3:                                                                           \
+        return arithmOpScalar_simd_c3(sub_tag{}, in, scalar, out, length);            \
+    default:                                                                          \
+        GAPI_Assert(chan <= 4);                                                       \
+        break;                                                                        \
+    }                                                                                 \
+    return 0;                                                                         \
+}
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
-- 
GitLab
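
Editor's note (not part of the patch): the refactoring in gfluidcore_func.simd.hpp collapses the former addc_* helpers into shared arithmOpScalar_* templates that are selected by the empty tag types add_tag/sub_tag through the oper() overloads, so addc_simd() and subc_simd() differ only in the tag they pass. The sketch below is a minimal, self-contained illustration of that tag-dispatch idea; it uses plain float arithmetic instead of OpenCV's universal intrinsics (v_float32, vx_store, v_round), and arithm_op_scalar is a hypothetical stand-in for arithmOpScalar_simd_common(), not code from the patch.

```cpp
#include <cstdio>

// Empty tag types select the operation at compile time (zero runtime cost).
struct add_tag {};
struct sub_tag {};

// One oper() overload per tag, mirroring the pattern used in the patch.
static inline float oper(add_tag, float a, float sc) { return a + sc; }
static inline float oper(sub_tag, float a, float sc) { return a - sc; }

// Shared element-wise loop: the scalar repeats every `chan` elements,
// matching the C1 C2 C3 C1 C2 C3 ... layout the scratch buffer holds.
template <typename oper_tag>
void arithm_op_scalar(oper_tag t, const float in[], const float scalar[],
                      float out[], int length, int chan)
{
    for (int i = 0; i < length; ++i)
        out[i] = oper(t, in[i], scalar[i % chan]);
}

int main()
{
    const float in[6] = {10, 20, 30, 40, 50, 60};
    const float sc[3] = {1, 2, 3};  // per-channel scalar, chan == 3
    float out[6];

    arithm_op_scalar(sub_tag{}, in, sc, out, 6, 3);  // SubC-like path
    for (float v : out) std::printf("%g ", v);       // prints: 9 18 27 39 48 57
    std::printf("\n");

    arithm_op_scalar(add_tag{}, in, sc, out, 6, 3);  // AddC-like path
    for (float v : out) std::printf("%g ", v);       // prints: 11 22 33 41 52 63
    std::printf("\n");
    return 0;
}
```

Because the tags are distinct types, the compiler resolves oper() statically and inlines it, so one shared loop can compile down to the same code as the hand-written per-operation kernels it replaces.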