Merge pull request #21158 from anna-khakimova:ak/simd_subC

* GAPI Fluid: SIMD for SubC kernel. * Applied comments

Merge pull request #21158 from anna-khakimova:ak/simd_subC
* GAPI Fluid: SIMD for SubC kernel. * Applied comments
369b260e · Anna Khakimova · GitHub · d9e7c162 · 369b260e · 369b260e
9 changed files
--- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -30,7 +30,7 @@ namespace opencv_test
    class AddPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
    class AddCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
    class SubPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
-    class SubCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class SubCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
    class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
    class MulPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
    class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};

--- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -138,10 +138,13 @@ PERF_TEST_P_(SubPerfTest, TestPerformance)

 PERF_TEST_P_(SubCPerfTest, TestPerformance)
 {
-    Size sz = get<0>(GetParam());
-    MatType type = get<1>(GetParam());
-    int dtype = get<2>(GetParam());
-    cv::GCompileArgs compile_args = get<3>(GetParam());
+    compare_f cmpF;
+    cv::Size sz;
+    MatType type = -1;
+    int dtype = -1;
+    cv::GCompileArgs compile_args;
+
+    std::tie(cmpF, sz, type, dtype, compile_args) = GetParam();

    initMatsRandU(type, sz, dtype, false);

@@ -165,8 +168,9 @@ PERF_TEST_P_(SubCPerfTest, TestPerformance)
    }

    // Comparison ////////////////////////////////////////////////////////////
-    // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
-    EXPECT_EQ(out_mat_gapi.size(), sz);
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }

    SANITY_CHECK_NOTHING();
 }

--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
@@ -35,7 +35,8 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestCPU, SubPerfTest,
        Values(cv::compile_args(CORE_CPU))));

 INSTANTIATE_TEST_CASE_P(SubCPerfTestCPU, SubCPerfTest,
-    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
        Values(-1, CV_8U, CV_16U, CV_32F),
        Values(cv::compile_args(CORE_CPU))));

--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -31,11 +31,12 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
            Values(-1, CV_8U, CV_32F),
            Values(cv::compile_args(CORE_FLUID))));

-// INSTANTIATE_TEST_CASE_P(SubCPerfTestFluid, SubCPerfTest,
-//     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-//         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-//         Values(-1, CV_8U, CV_16U, CV_32F),
-//         Values(cv::compile_args(CORE_FLUID))));
+ INSTANTIATE_TEST_CASE_P(SubCPerfTestFluid, SubCPerfTest,
+     Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
+         Values(szSmall128, szVGA, sz720p, sz1080p),
+         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+         Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
+         Values(cv::compile_args(CORE_FLUID))));

 // INSTANTIATE_TEST_CASE_P(SubRCPerfTestFluid, SubRCPerfTest,
 //     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),

--- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
+++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
@@ -33,7 +33,8 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestGPU, SubPerfTest,
                                Values(cv::compile_args(CORE_GPU))));

 INSTANTIATE_TEST_CASE_P(SubCPerfTestGPU, SubCPerfTest,
-                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
                                Values( -1, CV_8U, CV_16U, CV_32F ),
                                Values(cv::compile_args(CORE_GPU))));

--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -844,16 +844,12 @@ GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false)
 //
 //--------------------------------------

-static inline v_uint16x8  v_sub_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x - y; }
 static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return y - x; }

-static inline v_float32x4  v_sub_32f(const v_float32x4 &x, const v_float32x4 &y) { return x - y; }
 static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y) { return y - x; }

-static inline int  s_sub_8u(uchar x, uchar y) { return x - y; }
 static inline int s_subr_8u(uchar x, uchar y) { return y - x; }

-static inline float  s_sub_32f(float x, float y) { return x - y; }
 static inline float s_subr_32f(float x, float y) { return y - x; }

 // manual SIMD if important case 8UC3
@@ -942,21 +938,11 @@ static void run_arithm_s1(uchar out[], const float in[], int width, const float
    }
 }

-static void run_arithm_s_sub3(uchar out[], const uchar in[], int width, const uchar scalar[])
-{
-    run_arithm_s3(out, in, width, scalar, v_sub_16u, s_sub_8u);
-}
-
 static void run_arithm_s_subr3(uchar out[], const uchar in[], int width, const uchar scalar[])
 {
    run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr
 }

-static void run_arithm_s_sub1(uchar out[], const float in[], int width, const float scalar[])
-{
-    run_arithm_s1(out, in, width, scalar, v_sub_32f, s_sub_32f);
-}
-
 static void run_arithm_s_subr1(uchar out[], const float in[], int width, const float scalar[])
 {
    run_arithm_s1(out, in, width, scalar, v_subr_32f, s_subr_32f); // reverse: subr
@@ -1273,6 +1259,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca

    int width  = dst.length();
    int chan   = dst.meta().chan;
+    const int length = width * chan;

    switch (arithm)
    {
@@ -1280,37 +1267,21 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
    {
            int w = 0;
 #if CV_SIMD
-            w = addc_simd(in, scalar, out, width, chan);
+            w = addc_simd(in, scalar, out, length, chan);
 #endif
-
-            for (; w < width * chan; ++w)
+            for (; w < length; ++w)
                out[w] = add<DST>(in[w], scalar[w % chan]);

        break;
    }
    case ARITHM_SUBTRACT:
    {
-        // What if we cast the scalar into the SRC type?
-        const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
-                                static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
-        bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
-            (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
-
-        if (usemyscal)
-        {
-            if (std::is_same<DST, uchar>::value &&
-                std::is_same<SRC, uchar>::value &&
-                chan == 3)
-                run_arithm_s_sub3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
-            else if (std::is_same<DST, uchar>::value &&
-                std::is_same<SRC, float>::value &&
-                chan == 1)
-                run_arithm_s_sub1((uchar*)out, (const float*)in, width, (const float*)myscal);
-            else
-                run_arithm_s(out, in, width, chan, myscal, sub<DST, SRC, SRC>);
-        }
-        else
-            run_arithm_s(out, in, width, chan, scalar, sub<DST, SRC, float>);
+        int w = 0;
+#if CV_SIMD
+        w = subc_simd(in, scalar, out, length, chan);
+#endif
+        for (; w < length; ++w)
+            out[w] = sub<DST>(in[w], scalar[w % chan]);
        break;
    }
    // TODO: optimize miltiplication and division
@@ -1416,6 +1387,32 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
    }
 };

+CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
+{
+#if CV_SIMD
+    // An AVX-512 SIMD vector holds 512 bits / 32 bits = 16 float32 elements.
+    constexpr int maxNlanes = 16;
+
+    // +2 is the offset for the 3-channel case.
+    // The offset is needed to correctly load coefficients from the scalar array into SIMD vectors for the 3-channel case.
+    // The scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
+    // The first scalar SIMD vector should look like:
+    // C1 C2 C3 C1
+    // The second:
+    // C2 C3 C1 C2
+    // The third:
+    // C3 C1 C2 C3
+    constexpr int offset = 2;
+    constexpr int buflen = maxNlanes + offset;
+#else
+    constexpr int buflen = 4;
+#endif
+    cv::Size bufsize(buflen, 1);
+    GMatDesc bufdesc = { CV_32F, 1, bufsize };
+    Buffer buffer(bufdesc);
+    scratch = std::move(buffer);
+}
+
 GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
 {
    static const int Window = 1;
@@ -1458,59 +1455,62 @@ GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)

    static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch)
    {
-#if CV_SIMD
-        // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector.
-        constexpr int maxNlanes = 16;
-
-        // +2 is offset for 3-channel case.
-        // Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case.
-        // Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
-        // The first scalar SIMD vector should looks like:
-        // C1 C2 C3 C1
-        // The second:
-        // C2 C3 C1 C2
-        // The third:
-        // C3 C1 C2 C3
-        constexpr int offset = 2;
-        constexpr int buflen = maxNlanes + offset;
-#else
-        constexpr int buflen = 4;
-#endif
-        cv::Size bufsize(buflen, 1);
-        GMatDesc bufdesc = { CV_32F, 1, bufsize };
-        Buffer buffer(bufdesc);
-        scratch = std::move(buffer);
+        initScratchBuffer(scratch);
    }

-    static void resetScratch(Buffer& /* scratch */)
+    static void resetScratch(Buffer& /*scratch*/)
    {
    }
 };

-GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, false)
+GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, true)
 {
    static const int Window = 1;

-    static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
+    static void run(const View& src, const cv::Scalar& _scalar, int /*dtype*/, Buffer& dst, Buffer& scratch)
    {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
+        GAPI_Assert(src.meta().chan <= 4);
+
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float* sc = scratch.OutLine<float>();
+
+            for (int i = 0; i < scratch.length(); ++i)
+                sc[i] = static_cast<float>(_scalar[i % chan]);
+        }
+
+        const float* scalar = scratch.OutLine<float>();

        //     DST     SRC     OP            __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar,  uchar,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar,  ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar,  short,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar,  float,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, short,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, uchar,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, float,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short,  short,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short,  ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short,  uchar,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short,  float,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float,  uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float,  ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float,  short,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float,  float,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
+
+    static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch)
+    {
+        initScratchBuffer(scratch);
+    }
+
+    static void resetScratch(Buffer& /*scratch*/)
+    {
+    }
 };

 GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, false)

--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -65,7 +65,6 @@ int mul_simd(const SRC in1[], const SRC in2[], DST out[],                   \
                    CV_CPU_DISPATCH_MODES_ALL);                             \
 }

-
 MUL_SIMD(uchar, uchar)
 MUL_SIMD(ushort, uchar)
 MUL_SIMD(short, uchar)
@@ -87,9 +86,9 @@ MUL_SIMD(float, float)

 #define ADDC_SIMD(SRC, DST)                                               \
 int addc_simd(const SRC in[], const float scalar[], DST out[],            \
-              const int width, const int chan)                            \
+              const int length, const int chan)                           \
 {                                                                         \
-    CV_CPU_DISPATCH(addc_simd, (in, scalar, out, width, chan),            \
+    CV_CPU_DISPATCH(addc_simd, (in, scalar, out, length, chan),           \
                    CV_CPU_DISPATCH_MODES_ALL);                           \
 }

@@ -112,6 +111,33 @@ ADDC_SIMD(float, float)

 #undef ADDC_SIMD

+#define SUBC_SIMD(SRC, DST)                                               \
+int subc_simd(const SRC in[], const float scalar[], DST out[],            \
+              const int length, const int chan)                           \
+{                                                                         \
+    CV_CPU_DISPATCH(subc_simd, (in, scalar, out, length, chan),           \
+                    CV_CPU_DISPATCH_MODES_ALL);                           \
+}
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv

--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -62,7 +62,7 @@ MUL_SIMD(float, float)

 #define ADDC_SIMD(SRC, DST)                                                              \
 int addc_simd(const SRC in[], const float scalar[], DST out[],                           \
-              const int width, const int chan);
+              const int length, const int chan);

 ADDC_SIMD(uchar, uchar)
 ADDC_SIMD(ushort, uchar)
@@ -83,6 +83,29 @@ ADDC_SIMD(float, float)

 #undef ADDC_SIMD

+#define SUBC_SIMD(SRC, DST)                                                              \
+int subc_simd(const SRC in[], const float scalar[], DST out[],                           \
+              const int length, const int chan);
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv

--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -83,7 +83,7 @@ MUL_SIMD(float, float)

 #define ADDC_SIMD(SRC, DST)                                                              \
 int addc_simd(const SRC in[], const float scalar[], DST out[],                           \
-              const int width, const int chan);
+              const int length, const int chan);

 ADDC_SIMD(uchar, uchar)
 ADDC_SIMD(ushort, uchar)
@@ -104,6 +104,29 @@ ADDC_SIMD(float, float)

 #undef ADDC_SIMD

+#define SUBC_SIMD(SRC, DST)                                                              \
+int subc_simd(const SRC in[], const float scalar[], DST out[],                           \
+              const int length, const int chan);
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 struct scale_tag {};
@@ -851,10 +874,13 @@ MUL_SIMD(float, float)
 //
 //-------------------------

-CV_ALWAYS_INLINE void addc_pack_store_c3(short* outx,       const v_int32& c1,
-                                         const v_int32& c2, const v_int32& c3,
-                                         const v_int32& c4, const v_int32& c5,
-                                         const v_int32& c6)
+struct add_tag {};
+struct sub_tag {};
+
+CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx,       const v_int32& c1,
+                                                   const v_int32& c2, const v_int32& c3,
+                                                   const v_int32& c4, const v_int32& c5,
+                                                   const v_int32& c6)
 {
    constexpr int nlanes = v_int16::nlanes;
    vx_store(outx,           v_pack(c1, c2));
@@ -862,10 +888,10 @@ CV_ALWAYS_INLINE void addc_pack_store_c3(short* outx,       const v_int32& c1,
    vx_store(&outx[2*nlanes], v_pack(c5, c6));
 }

-CV_ALWAYS_INLINE void addc_pack_store_c3(ushort* outx,      const v_int32& c1,
-                                         const v_int32& c2, const v_int32& c3,
-                                         const v_int32& c4, const v_int32& c5,
-                                         const v_int32& c6)
+CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx,      const v_int32& c1,
+                                                   const v_int32& c2, const v_int32& c3,
+                                                   const v_int32& c4, const v_int32& c5,
+                                                   const v_int32& c6)
 {
    constexpr int nlanes = v_uint16::nlanes;
    vx_store(outx,            v_pack_u(c1, c2));
@@ -873,50 +899,64 @@ CV_ALWAYS_INLINE void addc_pack_store_c3(ushort* outx,      const v_int32& c1,
    vx_store(&outx[2*nlanes], v_pack_u(c5, c6));
 }

-template<typename SRC, typename DST>
+CV_ALWAYS_INLINE v_float32 oper(add_tag, const v_float32& a, const v_float32& sc)
+{
+    return a + sc;
+}
+
+CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc)
+{
+    return a - sc;
+}
+
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<(std::is_same<DST, ushort>::value ||
                         std::is_same<DST, short>::value), void>::type
-addc_simd_common_impl(const SRC* inx, DST* outx, const v_float32& sc, const int nlanes)
+arithmOpScalar_simd_common_impl(oper_tag t, const SRC* inx, DST* outx,
+                                const v_float32& sc, const int nlanes)
 {
    v_float32 a1 = vg_load_f32(inx);
    v_float32 a2 = vg_load_f32(&inx[nlanes/2]);

-    v_store_i16(outx, v_round(a1 + sc), v_round(a2 + sc));
+    v_store_i16(outx, v_round(oper(t, a1, sc)), v_round(oper(t, a2, sc)));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, uchar* outx, const v_float32& sc, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_common_impl(oper_tag t, const SRC* inx,
+                                                      uchar* outx, const v_float32& sc,
+                                                      const int nlanes)
 {
    v_float32 a1 = vg_load_f32(inx);
    v_float32 a2 = vg_load_f32(&inx[nlanes/4]);
    v_float32 a3 = vg_load_f32(&inx[nlanes/2]);
    v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]);

-    vx_store(outx, v_pack_u(v_pack(v_round(a1 + sc),
-                                   v_round(a2 + sc)),
-                            v_pack(v_round(a3 + sc),
-                                   v_round(a4 + sc))));
+    vx_store(outx, v_pack_u(v_pack(v_round(oper(t, a1, sc)),
+                                   v_round(oper(t, a2, sc))),
+                            v_pack(v_round(oper(t, a3, sc)),
+                                   v_round(oper(t, a4, sc)))));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, float* outx, const v_float32& sc, const int)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_common_impl(oper_tag t, const SRC* inx,
+                                                      float* outx, const v_float32& sc, const int)
 {
    v_float32 a1 = vg_load_f32(inx);
-    vx_store(outx, a1 + sc);
+    vx_store(outx, oper(t, a1, sc));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC, typename DST>
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, void>::type
-addc_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
+arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
                  const v_float32& s3, const int nlanes)
 {
    v_float32 a1 = vg_load_f32(inx);
@@ -926,60 +966,62 @@ addc_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float3
    v_float32 a5 = vg_load_f32(&inx[2 * nlanes]);
    v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]);

-    addc_pack_store_c3(outx, v_round(a1 + s1),
-                             v_round(a2 + s2),
-                             v_round(a3 + s3),
-                             v_round(a4 + s1),
-                             v_round(a5 + s2),
-                             v_round(a6 + s3));
+    arithmOpScalar_pack_store_c3(outx, v_round(oper(t, a1, s1)),
+                                       v_round(oper(t, a2, s2)),
+                                       v_round(oper(t, a3, s3)),
+                                       v_round(oper(t, a4, s1)),
+                                       v_round(oper(t, a5, s2)),
+                                       v_round(oper(t, a6, s3)));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* inx, uchar* outx,
-                                       const v_float32& s1, const v_float32& s2,
-                                       const v_float32& s3, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, uchar* outx,
+                                                  const v_float32& s1, const v_float32& s2,
+                                                  const v_float32& s3, const int nlanes)
 {
    vx_store(outx,
-               v_pack_u(v_pack(v_round(vg_load_f32(inx) + s1),
-                               v_round(vg_load_f32(&inx[nlanes/4]) + s2)),
-                        v_pack(v_round(vg_load_f32(&inx[nlanes/2]) + s3),
-                               v_round(vg_load_f32(&inx[3*nlanes/4]) + s1))));
+               v_pack_u(v_pack(v_round(oper(t, vg_load_f32(inx), s1)),
+                               v_round(oper(t, vg_load_f32(&inx[nlanes/4]), s2))),
+                        v_pack(v_round(oper(t, vg_load_f32(&inx[nlanes/2]), s3)),
+                               v_round(oper(t, vg_load_f32(&inx[3*nlanes/4]), s1)))));

    vx_store(&outx[nlanes],
-                v_pack_u(v_pack(v_round(vg_load_f32(&inx[nlanes]) + s2),
-                                v_round(vg_load_f32(&inx[5*nlanes/4]) + s3)),
-                         v_pack(v_round(vg_load_f32(&inx[3*nlanes/2]) + s1),
-                                v_round(vg_load_f32(&inx[7*nlanes/4]) + s2))));
+                v_pack_u(v_pack(v_round(oper(t, vg_load_f32(&inx[nlanes]), s2)),
+                                v_round(oper(t, vg_load_f32(&inx[5*nlanes/4]), s3))),
+                         v_pack(v_round(oper(t, vg_load_f32(&inx[3*nlanes/2]), s1)),
+                                v_round(oper(t, vg_load_f32(&inx[7*nlanes/4]), s2)))));

    vx_store(&outx[2 * nlanes],
-                v_pack_u(v_pack(v_round(vg_load_f32(&inx[2*nlanes]) + s3),
-                                v_round(vg_load_f32(&inx[9*nlanes/4]) + s1)),
-                         v_pack(v_round(vg_load_f32(&inx[5*nlanes/2]) + s2),
-                                v_round(vg_load_f32(&inx[11*nlanes/4]) + s3))));
+                v_pack_u(v_pack(v_round(oper(t, vg_load_f32(&inx[2*nlanes]), s3)),
+                                v_round(oper(t, vg_load_f32(&inx[9*nlanes/4]), s1))),
+                         v_pack(v_round(oper(t, vg_load_f32(&inx[5*nlanes/2]), s2)),
+                                v_round(oper(t, vg_load_f32(&inx[11*nlanes/4]), s3)))));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* in, float* out,
-                                        const v_float32& s1, const v_float32& s2,
-                                        const v_float32& s3, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* in, float* out,
+                                                  const v_float32& s1, const v_float32& s2,
+                                                  const v_float32& s3, const int nlanes)
 {
    v_float32 a1 = vg_load_f32(in);
    v_float32 a2 = vg_load_f32(&in[nlanes]);
    v_float32 a3 = vg_load_f32(&in[2*nlanes]);

-    vx_store(out, a1 + s1);
-    vx_store(&out[nlanes], a2 + s2);
-    vx_store(&out[2*nlanes], a3 + s3);
+    vx_store(out, oper(t, a1, s1));
+    vx_store(&out[nlanes], oper(t, a2, s2));
+    vx_store(&out[2*nlanes], oper(t, a3, s3));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC, typename DST>
-CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[], const int length)
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOpScalar_simd_c3(oper_tag t, const SRC in[],
+                                            const float scalar[], DST out[],
+                                            const int length)
 {
    constexpr int chan = 3;
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
@@ -1002,7 +1044,7 @@ CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[
    {
        for (; x <= length - lanes; x += lanes)
        {
-            addc_simd_c3_impl(&in[x], &out[x], s1, s2, s3, nlanes);
+            arithmOpScalar_simd_c3_impl(t, &in[x], &out[x], s1, s2, s3, nlanes);
        }

        if (x < length)
@@ -1015,8 +1057,12 @@ CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[
    return x;
 }

-template<typename SRC, typename DST>
-CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST out[], const int length)
+//-------------------------------------------------------------------------------------------------
+
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[],
+                                                const float scalar[], DST out[],
+                                                const int length)
 {
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;

@@ -1030,7 +1076,7 @@ CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST
    {
        for (; x <= length - nlanes; x += nlanes)
        {
-            addc_simd_common_impl(&in[x], &out[x], sc, nlanes);
+            arithmOpScalar_simd_common_impl(t, &in[x], &out[x], sc, nlanes);
        }

        if (x < length)
@@ -1043,24 +1089,25 @@ CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST
    return x;
 }

-#define ADDC_SIMD(SRC, DST)                                       \
-int addc_simd(const SRC in[], const float scalar[], DST out[],    \
-              const int width, const int chan)                    \
-{                                                                 \
-    const int length = width * chan;                              \
-    switch (chan)                                                 \
-    {                                                             \
-    case 1:                                                       \
-    case 2:                                                       \
-    case 4:                                                       \
-        return addc_simd_common(in, scalar, out, length);         \
-    case 3:                                                       \
-        return addc_simd_c3(in, scalar, out, length);             \
-    default:                                                      \
-        GAPI_Assert(chan <= 4);                                   \
-        break;                                                    \
-    }                                                             \
-    return 0;                                                     \
+
+
+#define ADDC_SIMD(SRC, DST)                                                         \
+int addc_simd(const SRC in[], const float scalar[], DST out[],                      \
+              const int length, const int chan)                                     \
+{                                                                                   \
+    switch (chan)                                                                   \
+    {                                                                               \
+    case 1:                                                                         \
+    case 2:                                                                         \
+    case 4:                                                                         \
+        return arithmOpScalar_simd_common(add_tag{}, in, scalar, out, length);      \
+    case 3:                                                                         \
+        return arithmOpScalar_simd_c3(add_tag{}, in, scalar, out, length);          \
+    default:                                                                        \
+        GAPI_Assert(chan <= 4);                                                     \
+        break;                                                                      \
+    }                                                                               \
+    return 0;                                                                       \
 }

 ADDC_SIMD(uchar, uchar)
@@ -1082,6 +1129,44 @@ ADDC_SIMD(float, float)

 #undef ADDC_SIMD

+#define SUBC_SIMD(SRC, DST)                                                         \
+int subc_simd(const SRC in[], const float scalar[], DST out[],                      \
+              const int length, const int chan)                                     \
+{                                                                                   \
+    switch (chan)                                                                   \
+    {                                                                               \
+    case 1:                                                                         \
+    case 2:                                                                         \
+    case 4:                                                                         \
+        return arithmOpScalar_simd_common(sub_tag{}, in, scalar, out, length);      \
+    case 3:                                                                         \
+        return arithmOpScalar_simd_c3(sub_tag{}, in, scalar, out, length);          \
+    default:                                                                        \
+        GAPI_Assert(chan <= 4);                                                     \
+        break;                                                                      \
+    }                                                                               \
+    return 0;                                                                       \
+}
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 CV_CPU_OPTIMIZATION_NAMESPACE_END