Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenCV
opencv
提交
369b260e
O
opencv
项目概览
OpenCV
/
opencv
上一次同步 7 个月
通知
979
Star
71099
Fork
55580
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
未验证
提交
369b260e
编写于
12月 02, 2021
作者:
A
Anna Khakimova
提交者:
GitHub
12月 01, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Merge pull request #21158 from anna-khakimova:ak/simd_subC
* GAPI Fluid: SIMD for SubC kernel. * Applied comments
上级
d9e7c162
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
311 addition
and
170 deletion
+311
-170
modules/gapi/perf/common/gapi_core_perf_tests.hpp
modules/gapi/perf/common/gapi_core_perf_tests.hpp
+1
-1
modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+10
-6
modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
+2
-1
modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+6
-5
modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
+2
-1
modules/gapi/src/backends/fluid/gfluidcore.cpp
modules/gapi/src/backends/fluid/gfluidcore.cpp
+76
-76
modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+29
-3
modules/gapi/src/backends/fluid/gfluidcore_func.hpp
modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+24
-1
modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+161
-76
未找到文件。
modules/gapi/perf/common/gapi_core_perf_tests.hpp
浏览文件 @
369b260e
...
...
@@ -30,7 +30,7 @@ namespace opencv_test
class
AddPerfTest
:
public
TestPerfParams
<
tuple
<
cv
::
Size
,
MatType
,
int
,
cv
::
GCompileArgs
>>
{};
class
AddCPerfTest
:
public
TestPerfParams
<
tuple
<
compare_f
,
cv
::
Size
,
MatType
,
int
,
cv
::
GCompileArgs
>>
{};
class
SubPerfTest
:
public
TestPerfParams
<
tuple
<
cv
::
Size
,
MatType
,
int
,
cv
::
GCompileArgs
>>
{};
class
SubCPerfTest
:
public
TestPerfParams
<
tuple
<
cv
::
Size
,
MatType
,
int
,
cv
::
GCompileArgs
>>
{};
class
SubCPerfTest
:
public
TestPerfParams
<
tuple
<
c
ompare_f
,
c
v
::
Size
,
MatType
,
int
,
cv
::
GCompileArgs
>>
{};
class
SubRCPerfTest
:
public
TestPerfParams
<
tuple
<
cv
::
Size
,
MatType
,
int
,
cv
::
GCompileArgs
>>
{};
class
MulPerfTest
:
public
TestPerfParams
<
tuple
<
compare_f
,
cv
::
Size
,
MatType
,
int
,
double
,
cv
::
GCompileArgs
>>
{};
class
MulDoublePerfTest
:
public
TestPerfParams
<
tuple
<
cv
::
Size
,
MatType
,
int
,
cv
::
GCompileArgs
>>
{};
...
...
modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
浏览文件 @
369b260e
...
...
@@ -138,10 +138,13 @@ PERF_TEST_P_(SubPerfTest, TestPerformance)
PERF_TEST_P_
(
SubCPerfTest
,
TestPerformance
)
{
Size
sz
=
get
<
0
>
(
GetParam
());
MatType
type
=
get
<
1
>
(
GetParam
());
int
dtype
=
get
<
2
>
(
GetParam
());
cv
::
GCompileArgs
compile_args
=
get
<
3
>
(
GetParam
());
compare_f
cmpF
;
cv
::
Size
sz
;
MatType
type
=
-
1
;
int
dtype
=
-
1
;
cv
::
GCompileArgs
compile_args
;
std
::
tie
(
cmpF
,
sz
,
type
,
dtype
,
compile_args
)
=
GetParam
();
initMatsRandU
(
type
,
sz
,
dtype
,
false
);
...
...
@@ -165,8 +168,9 @@ PERF_TEST_P_(SubCPerfTest, TestPerformance)
}
// Comparison ////////////////////////////////////////////////////////////
// FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
EXPECT_EQ
(
out_mat_gapi
.
size
(),
sz
);
{
EXPECT_TRUE
(
cmpF
(
out_mat_gapi
,
out_mat_ocv
));
}
SANITY_CHECK_NOTHING
();
}
...
...
modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
浏览文件 @
369b260e
...
...
@@ -35,7 +35,8 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestCPU, SubPerfTest,
Values
(
cv
::
compile_args
(
CORE_CPU
))));
INSTANTIATE_TEST_CASE_P
(
SubCPerfTestCPU
,
SubCPerfTest
,
Combine
(
Values
(
szSmall128
,
szVGA
,
sz720p
,
sz1080p
),
Combine
(
Values
(
AbsExact
().
to_compare_f
()),
Values
(
szSmall128
,
szVGA
,
sz720p
,
sz1080p
),
Values
(
CV_8UC1
,
CV_8UC3
,
CV_16UC1
,
CV_16SC1
,
CV_32FC1
),
Values
(
-
1
,
CV_8U
,
CV_16U
,
CV_32F
),
Values
(
cv
::
compile_args
(
CORE_CPU
))));
...
...
modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
浏览文件 @
369b260e
...
...
@@ -31,11 +31,12 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
Values
(
-
1
,
CV_8U
,
CV_32F
),
Values
(
cv
::
compile_args
(
CORE_FLUID
))));
// INSTANTIATE_TEST_CASE_P(SubCPerfTestFluid, SubCPerfTest,
// Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
// Values(-1, CV_8U, CV_16U, CV_32F),
// Values(cv::compile_args(CORE_FLUID))));
INSTANTIATE_TEST_CASE_P
(
SubCPerfTestFluid
,
SubCPerfTest
,
Combine
(
Values
(
Tolerance_FloatRel_IntAbs
(
1e-6
,
1
).
to_compare_f
()),
Values
(
szSmall128
,
szVGA
,
sz720p
,
sz1080p
),
Values
(
CV_8UC1
,
CV_8UC3
,
CV_16UC1
,
CV_16SC1
,
CV_32FC1
),
Values
(
-
1
,
CV_8U
,
CV_16U
,
CV_16S
,
CV_32F
),
Values
(
cv
::
compile_args
(
CORE_FLUID
))));
// INSTANTIATE_TEST_CASE_P(SubRCPerfTestFluid, SubRCPerfTest,
// Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
...
...
modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
浏览文件 @
369b260e
...
...
@@ -33,7 +33,8 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestGPU, SubPerfTest,
Values
(
cv
::
compile_args
(
CORE_GPU
))));
INSTANTIATE_TEST_CASE_P
(
SubCPerfTestGPU
,
SubCPerfTest
,
Combine
(
Values
(
szSmall128
,
szVGA
,
sz720p
,
sz1080p
),
Combine
(
Values
(
AbsExact
().
to_compare_f
()),
Values
(
szSmall128
,
szVGA
,
sz720p
,
sz1080p
),
Values
(
CV_8UC1
,
CV_8UC3
,
CV_16UC1
,
CV_16SC1
,
CV_32FC1
),
Values
(
-
1
,
CV_8U
,
CV_16U
,
CV_32F
),
Values
(
cv
::
compile_args
(
CORE_GPU
))));
...
...
modules/gapi/src/backends/fluid/gfluidcore.cpp
浏览文件 @
369b260e
...
...
@@ -844,16 +844,12 @@ GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false)
//
//--------------------------------------
static
inline
v_uint16x8
v_sub_16u
(
const
v_uint16x8
&
x
,
const
v_uint16x8
&
y
)
{
return
x
-
y
;
}
static
inline
v_uint16x8
v_subr_16u
(
const
v_uint16x8
&
x
,
const
v_uint16x8
&
y
)
{
return
y
-
x
;
}
static
inline
v_float32x4
v_sub_32f
(
const
v_float32x4
&
x
,
const
v_float32x4
&
y
)
{
return
x
-
y
;
}
static
inline
v_float32x4
v_subr_32f
(
const
v_float32x4
&
x
,
const
v_float32x4
&
y
)
{
return
y
-
x
;
}
static
inline
int
s_sub_8u
(
uchar
x
,
uchar
y
)
{
return
x
-
y
;
}
static
inline
int
s_subr_8u
(
uchar
x
,
uchar
y
)
{
return
y
-
x
;
}
static
inline
float
s_sub_32f
(
float
x
,
float
y
)
{
return
x
-
y
;
}
static
inline
float
s_subr_32f
(
float
x
,
float
y
)
{
return
y
-
x
;
}
// manual SIMD if important case 8UC3
...
...
@@ -942,21 +938,11 @@ static void run_arithm_s1(uchar out[], const float in[], int width, const float
}
}
static
void
run_arithm_s_sub3
(
uchar
out
[],
const
uchar
in
[],
int
width
,
const
uchar
scalar
[])
{
run_arithm_s3
(
out
,
in
,
width
,
scalar
,
v_sub_16u
,
s_sub_8u
);
}
static
void
run_arithm_s_subr3
(
uchar
out
[],
const
uchar
in
[],
int
width
,
const
uchar
scalar
[])
{
run_arithm_s3
(
out
,
in
,
width
,
scalar
,
v_subr_16u
,
s_subr_8u
);
// reverse: subr
}
static
void
run_arithm_s_sub1
(
uchar
out
[],
const
float
in
[],
int
width
,
const
float
scalar
[])
{
run_arithm_s1
(
out
,
in
,
width
,
scalar
,
v_sub_32f
,
s_sub_32f
);
}
static
void
run_arithm_s_subr1
(
uchar
out
[],
const
float
in
[],
int
width
,
const
float
scalar
[])
{
run_arithm_s1
(
out
,
in
,
width
,
scalar
,
v_subr_32f
,
s_subr_32f
);
// reverse: subr
...
...
@@ -1273,6 +1259,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
int
width
=
dst
.
length
();
int
chan
=
dst
.
meta
().
chan
;
const
int
length
=
width
*
chan
;
switch
(
arithm
)
{
...
...
@@ -1280,37 +1267,21 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
{
int
w
=
0
;
#if CV_SIMD
w
=
addc_simd
(
in
,
scalar
,
out
,
wid
th
,
chan
);
w
=
addc_simd
(
in
,
scalar
,
out
,
leng
th
,
chan
);
#endif
for
(;
w
<
width
*
chan
;
++
w
)
for
(;
w
<
length
;
++
w
)
out
[
w
]
=
add
<
DST
>
(
in
[
w
],
scalar
[
w
%
chan
]);
break
;
}
case
ARITHM_SUBTRACT
:
{
// What if we cast the scalar into the SRC type?
const
SRC
myscal
[
4
]
=
{
static_cast
<
SRC
>
(
scalar
[
0
]),
static_cast
<
SRC
>
(
scalar
[
1
]),
static_cast
<
SRC
>
(
scalar
[
2
]),
static_cast
<
SRC
>
(
scalar
[
3
])
};
bool
usemyscal
=
(
myscal
[
0
]
==
scalar
[
0
])
&&
(
myscal
[
1
]
==
scalar
[
1
])
&&
(
myscal
[
2
]
==
scalar
[
2
])
&&
(
myscal
[
3
]
==
scalar
[
3
]);
if
(
usemyscal
)
{
if
(
std
::
is_same
<
DST
,
uchar
>::
value
&&
std
::
is_same
<
SRC
,
uchar
>::
value
&&
chan
==
3
)
run_arithm_s_sub3
((
uchar
*
)
out
,
(
const
uchar
*
)
in
,
width
,
(
const
uchar
*
)
myscal
);
else
if
(
std
::
is_same
<
DST
,
uchar
>::
value
&&
std
::
is_same
<
SRC
,
float
>::
value
&&
chan
==
1
)
run_arithm_s_sub1
((
uchar
*
)
out
,
(
const
float
*
)
in
,
width
,
(
const
float
*
)
myscal
);
else
run_arithm_s
(
out
,
in
,
width
,
chan
,
myscal
,
sub
<
DST
,
SRC
,
SRC
>
);
}
else
run_arithm_s
(
out
,
in
,
width
,
chan
,
scalar
,
sub
<
DST
,
SRC
,
float
>
);
int
w
=
0
;
#if CV_SIMD
w
=
subc_simd
(
in
,
scalar
,
out
,
length
,
chan
);
#endif
for
(;
w
<
length
;
++
w
)
out
[
w
]
=
sub
<
DST
>
(
in
[
w
],
scalar
[
w
%
chan
]);
break
;
}
// TODO: optimize miltiplication and division
...
...
@@ -1416,6 +1387,32 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
}
};
CV_ALWAYS_INLINE
void
initScratchBuffer
(
Buffer
&
scratch
)
{
#if CV_SIMD
// 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector.
constexpr
int
maxNlanes
=
16
;
// +2 is offset for 3-channel case.
// Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case.
// Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
// The first scalar SIMD vector should looks like:
// C1 C2 C3 C1
// The second:
// C2 C3 C1 C2
// The third:
// C3 C1 C2 C3
constexpr
int
offset
=
2
;
constexpr
int
buflen
=
maxNlanes
+
offset
;
#else
constexpr
int
buflen
=
4
;
#endif
cv
::
Size
bufsize
(
buflen
,
1
);
GMatDesc
bufdesc
=
{
CV_32F
,
1
,
bufsize
};
Buffer
buffer
(
bufdesc
);
scratch
=
std
::
move
(
buffer
);
}
GAPI_FLUID_KERNEL
(
GFluidAddC
,
cv
::
gapi
::
core
::
GAddC
,
true
)
{
static
const
int
Window
=
1
;
...
...
@@ -1458,59 +1455,62 @@ GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
static
void
initScratch
(
const
GMatDesc
&
,
const
GScalarDesc
&
,
int
,
Buffer
&
scratch
)
{
#if CV_SIMD
// 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector.
constexpr
int
maxNlanes
=
16
;
// +2 is offset for 3-channel case.
// Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case.
// Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
// The first scalar SIMD vector should looks like:
// C1 C2 C3 C1
// The second:
// C2 C3 C1 C2
// The third:
// C3 C1 C2 C3
constexpr
int
offset
=
2
;
constexpr
int
buflen
=
maxNlanes
+
offset
;
#else
constexpr
int
buflen
=
4
;
#endif
cv
::
Size
bufsize
(
buflen
,
1
);
GMatDesc
bufdesc
=
{
CV_32F
,
1
,
bufsize
};
Buffer
buffer
(
bufdesc
);
scratch
=
std
::
move
(
buffer
);
initScratchBuffer
(
scratch
);
}
static
void
resetScratch
(
Buffer
&
/*
scratch
*/
)
static
void
resetScratch
(
Buffer
&
/*
scratch
*/
)
{
}
};
GAPI_FLUID_KERNEL
(
GFluidSubC
,
cv
::
gapi
::
core
::
GSubC
,
fals
e
)
GAPI_FLUID_KERNEL
(
GFluidSubC
,
cv
::
gapi
::
core
::
GSubC
,
tru
e
)
{
static
const
int
Window
=
1
;
static
void
run
(
const
View
&
src
,
const
cv
::
Scalar
&
_scalar
,
int
/*dtype*/
,
Buffer
&
dst
)
static
void
run
(
const
View
&
src
,
const
cv
::
Scalar
&
_scalar
,
int
/*dtype*/
,
Buffer
&
dst
,
Buffer
&
scratch
)
{
const
float
scalar
[
4
]
=
{
static_cast
<
float
>
(
_scalar
[
0
]),
static_cast
<
float
>
(
_scalar
[
1
]),
static_cast
<
float
>
(
_scalar
[
2
]),
static_cast
<
float
>
(
_scalar
[
3
])
};
GAPI_Assert
(
src
.
meta
().
chan
<=
4
);
if
(
dst
.
y
()
==
0
)
{
const
int
chan
=
src
.
meta
().
chan
;
float
*
sc
=
scratch
.
OutLine
<
float
>
();
for
(
int
i
=
0
;
i
<
scratch
.
length
();
++
i
)
sc
[
i
]
=
static_cast
<
float
>
(
_scalar
[
i
%
chan
]);
}
const
float
*
scalar
=
scratch
.
OutLine
<
float
>
();
// DST SRC OP __VA_ARGS__
UNARY_
(
uchar
,
uchar
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
uchar
,
short
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
uchar
,
float
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
short
,
short
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
float
,
uchar
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
float
,
short
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
float
,
float
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
uchar
,
uchar
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
uchar
,
ushort
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
uchar
,
short
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
uchar
,
float
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
ushort
,
ushort
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
ushort
,
short
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
ushort
,
uchar
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
ushort
,
float
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
short
,
short
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
short
,
ushort
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
short
,
uchar
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
short
,
float
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
float
,
uchar
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
float
,
ushort
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
float
,
short
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
UNARY_
(
float
,
float
,
run_arithm_s
,
dst
,
src
,
scalar
,
ARITHM_SUBTRACT
);
CV_Error
(
cv
::
Error
::
StsBadArg
,
"unsupported combination of types"
);
}
static
void
initScratch
(
const
GMatDesc
&
,
const
GScalarDesc
&
,
int
,
Buffer
&
scratch
)
{
initScratchBuffer
(
scratch
);
}
static
void
resetScratch
(
Buffer
&
/*scratch*/
)
{
}
};
GAPI_FLUID_KERNEL
(
GFluidSubRC
,
cv
::
gapi
::
core
::
GSubRC
,
false
)
...
...
modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
浏览文件 @
369b260e
...
...
@@ -65,7 +65,6 @@ int mul_simd(const SRC in1[], const SRC in2[], DST out[], \
CV_CPU_DISPATCH_MODES_ALL); \
}
MUL_SIMD
(
uchar
,
uchar
)
MUL_SIMD
(
ushort
,
uchar
)
MUL_SIMD
(
short
,
uchar
)
...
...
@@ -87,9 +86,9 @@ MUL_SIMD(float, float)
#define ADDC_SIMD(SRC, DST) \
int addc_simd(const SRC in[], const float scalar[], DST out[], \
const int
width, const int chan)
\
const int
length, const int chan)
\
{ \
CV_CPU_DISPATCH(addc_simd, (in, scalar, out,
width, chan),
\
CV_CPU_DISPATCH(addc_simd, (in, scalar, out,
length, chan),
\
CV_CPU_DISPATCH_MODES_ALL); \
}
...
...
@@ -112,6 +111,33 @@ ADDC_SIMD(float, float)
#undef ADDC_SIMD
#define SUBC_SIMD(SRC, DST) \
int subc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan) \
{ \
CV_CPU_DISPATCH(subc_simd, (in, scalar, out, length, chan), \
CV_CPU_DISPATCH_MODES_ALL); \
}
SUBC_SIMD
(
uchar
,
uchar
)
SUBC_SIMD
(
ushort
,
uchar
)
SUBC_SIMD
(
short
,
uchar
)
SUBC_SIMD
(
float
,
uchar
)
SUBC_SIMD
(
short
,
short
)
SUBC_SIMD
(
ushort
,
short
)
SUBC_SIMD
(
uchar
,
short
)
SUBC_SIMD
(
float
,
short
)
SUBC_SIMD
(
ushort
,
ushort
)
SUBC_SIMD
(
uchar
,
ushort
)
SUBC_SIMD
(
short
,
ushort
)
SUBC_SIMD
(
float
,
ushort
)
SUBC_SIMD
(
uchar
,
float
)
SUBC_SIMD
(
ushort
,
float
)
SUBC_SIMD
(
short
,
float
)
SUBC_SIMD
(
float
,
float
)
#undef SUBC_SIMD
}
// namespace fluid
}
// namespace gapi
}
// namespace cv
...
...
modules/gapi/src/backends/fluid/gfluidcore_func.hpp
浏览文件 @
369b260e
...
...
@@ -62,7 +62,7 @@ MUL_SIMD(float, float)
#define ADDC_SIMD(SRC, DST) \
int addc_simd(const SRC in[], const float scalar[], DST out[], \
const int
wid
th, const int chan);
const int
leng
th, const int chan);
ADDC_SIMD
(
uchar
,
uchar
)
ADDC_SIMD
(
ushort
,
uchar
)
...
...
@@ -83,6 +83,29 @@ ADDC_SIMD(float, float)
#undef ADDC_SIMD
#define SUBC_SIMD(SRC, DST) \
int subc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan);
SUBC_SIMD
(
uchar
,
uchar
)
SUBC_SIMD
(
ushort
,
uchar
)
SUBC_SIMD
(
short
,
uchar
)
SUBC_SIMD
(
float
,
uchar
)
SUBC_SIMD
(
short
,
short
)
SUBC_SIMD
(
ushort
,
short
)
SUBC_SIMD
(
uchar
,
short
)
SUBC_SIMD
(
float
,
short
)
SUBC_SIMD
(
ushort
,
ushort
)
SUBC_SIMD
(
uchar
,
ushort
)
SUBC_SIMD
(
short
,
ushort
)
SUBC_SIMD
(
float
,
ushort
)
SUBC_SIMD
(
uchar
,
float
)
SUBC_SIMD
(
ushort
,
float
)
SUBC_SIMD
(
short
,
float
)
SUBC_SIMD
(
float
,
float
)
#undef SUBC_SIMD
}
// namespace fluid
}
// namespace gapi
}
// namespace cv
...
...
modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
浏览文件 @
369b260e
...
...
@@ -83,7 +83,7 @@ MUL_SIMD(float, float)
#define ADDC_SIMD(SRC, DST) \
int addc_simd(const SRC in[], const float scalar[], DST out[], \
const int
wid
th, const int chan);
const int
leng
th, const int chan);
ADDC_SIMD
(
uchar
,
uchar
)
ADDC_SIMD
(
ushort
,
uchar
)
...
...
@@ -104,6 +104,29 @@ ADDC_SIMD(float, float)
#undef ADDC_SIMD
#define SUBC_SIMD(SRC, DST) \
int subc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan);
SUBC_SIMD
(
uchar
,
uchar
)
SUBC_SIMD
(
ushort
,
uchar
)
SUBC_SIMD
(
short
,
uchar
)
SUBC_SIMD
(
float
,
uchar
)
SUBC_SIMD
(
short
,
short
)
SUBC_SIMD
(
ushort
,
short
)
SUBC_SIMD
(
uchar
,
short
)
SUBC_SIMD
(
float
,
short
)
SUBC_SIMD
(
ushort
,
ushort
)
SUBC_SIMD
(
uchar
,
ushort
)
SUBC_SIMD
(
short
,
ushort
)
SUBC_SIMD
(
float
,
ushort
)
SUBC_SIMD
(
uchar
,
float
)
SUBC_SIMD
(
ushort
,
float
)
SUBC_SIMD
(
short
,
float
)
SUBC_SIMD
(
float
,
float
)
#undef SUBC_SIMD
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
struct
scale_tag
{};
...
...
@@ -851,10 +874,13 @@ MUL_SIMD(float, float)
//
//-------------------------
CV_ALWAYS_INLINE
void
addc_pack_store_c3
(
short
*
outx
,
const
v_int32
&
c1
,
const
v_int32
&
c2
,
const
v_int32
&
c3
,
const
v_int32
&
c4
,
const
v_int32
&
c5
,
const
v_int32
&
c6
)
struct
add_tag
{};
struct
sub_tag
{};
CV_ALWAYS_INLINE
void
arithmOpScalar_pack_store_c3
(
short
*
outx
,
const
v_int32
&
c1
,
const
v_int32
&
c2
,
const
v_int32
&
c3
,
const
v_int32
&
c4
,
const
v_int32
&
c5
,
const
v_int32
&
c6
)
{
constexpr
int
nlanes
=
v_int16
::
nlanes
;
vx_store
(
outx
,
v_pack
(
c1
,
c2
));
...
...
@@ -862,10 +888,10 @@ CV_ALWAYS_INLINE void addc_pack_store_c3(short* outx, const v_int32& c1,
vx_store
(
&
outx
[
2
*
nlanes
],
v_pack
(
c5
,
c6
));
}
CV_ALWAYS_INLINE
void
a
ddc
_pack_store_c3
(
ushort
*
outx
,
const
v_int32
&
c1
,
const
v_int32
&
c2
,
const
v_int32
&
c3
,
const
v_int32
&
c4
,
const
v_int32
&
c5
,
const
v_int32
&
c6
)
CV_ALWAYS_INLINE
void
a
rithmOpScalar
_pack_store_c3
(
ushort
*
outx
,
const
v_int32
&
c1
,
const
v_int32
&
c2
,
const
v_int32
&
c3
,
const
v_int32
&
c4
,
const
v_int32
&
c5
,
const
v_int32
&
c6
)
{
constexpr
int
nlanes
=
v_uint16
::
nlanes
;
vx_store
(
outx
,
v_pack_u
(
c1
,
c2
));
...
...
@@ -873,50 +899,64 @@ CV_ALWAYS_INLINE void addc_pack_store_c3(ushort* outx, const v_int32& c1,
vx_store
(
&
outx
[
2
*
nlanes
],
v_pack_u
(
c5
,
c6
));
}
template
<
typename
SRC
,
typename
DST
>
CV_ALWAYS_INLINE
v_float32
oper
(
add_tag
,
const
v_float32
&
a
,
const
v_float32
&
sc
)
{
return
a
+
sc
;
}
CV_ALWAYS_INLINE
v_float32
oper
(
sub_tag
,
const
v_float32
&
a
,
const
v_float32
&
sc
)
{
return
a
-
sc
;
}
template
<
typename
oper_tag
,
typename
SRC
,
typename
DST
>
CV_ALWAYS_INLINE
typename
std
::
enable_if
<
(
std
::
is_same
<
DST
,
ushort
>::
value
||
std
::
is_same
<
DST
,
short
>::
value
),
void
>::
type
addc_simd_common_impl
(
const
SRC
*
inx
,
DST
*
outx
,
const
v_float32
&
sc
,
const
int
nlanes
)
arithmOpScalar_simd_common_impl
(
oper_tag
t
,
const
SRC
*
inx
,
DST
*
outx
,
const
v_float32
&
sc
,
const
int
nlanes
)
{
v_float32
a1
=
vg_load_f32
(
inx
);
v_float32
a2
=
vg_load_f32
(
&
inx
[
nlanes
/
2
]);
v_store_i16
(
outx
,
v_round
(
a1
+
sc
),
v_round
(
a2
+
sc
));
v_store_i16
(
outx
,
v_round
(
oper
(
t
,
a1
,
sc
)),
v_round
(
oper
(
t
,
a2
,
sc
)
));
}
//-------------------------------------------------------------------------------------------------
template
<
typename
SRC
>
CV_ALWAYS_INLINE
void
addc_simd_common_impl
(
const
SRC
*
inx
,
uchar
*
outx
,
const
v_float32
&
sc
,
const
int
nlanes
)
template
<
typename
oper_tag
,
typename
SRC
>
CV_ALWAYS_INLINE
void
arithmOpScalar_simd_common_impl
(
oper_tag
t
,
const
SRC
*
inx
,
uchar
*
outx
,
const
v_float32
&
sc
,
const
int
nlanes
)
{
v_float32
a1
=
vg_load_f32
(
inx
);
v_float32
a2
=
vg_load_f32
(
&
inx
[
nlanes
/
4
]);
v_float32
a3
=
vg_load_f32
(
&
inx
[
nlanes
/
2
]);
v_float32
a4
=
vg_load_f32
(
&
inx
[
3
*
nlanes
/
4
]);
vx_store
(
outx
,
v_pack_u
(
v_pack
(
v_round
(
a1
+
sc
),
v_round
(
a2
+
sc
)),
v_pack
(
v_round
(
a3
+
sc
),
v_round
(
a4
+
sc
))));
vx_store
(
outx
,
v_pack_u
(
v_pack
(
v_round
(
oper
(
t
,
a1
,
sc
)
),
v_round
(
oper
(
t
,
a2
,
sc
)
)),
v_pack
(
v_round
(
oper
(
t
,
a3
,
sc
)
),
v_round
(
oper
(
t
,
a4
,
sc
)
))));
}
//-------------------------------------------------------------------------------------------------
template
<
typename
SRC
>
CV_ALWAYS_INLINE
void
addc_simd_common_impl
(
const
SRC
*
inx
,
float
*
outx
,
const
v_float32
&
sc
,
const
int
)
template
<
typename
oper_tag
,
typename
SRC
>
CV_ALWAYS_INLINE
void
arithmOpScalar_simd_common_impl
(
oper_tag
t
,
const
SRC
*
inx
,
float
*
outx
,
const
v_float32
&
sc
,
const
int
)
{
v_float32
a1
=
vg_load_f32
(
inx
);
vx_store
(
outx
,
a1
+
sc
);
vx_store
(
outx
,
oper
(
t
,
a1
,
sc
)
);
}
//-------------------------------------------------------------------------------------------------
template
<
typename
SRC
,
typename
DST
>
template
<
typename
oper_tag
,
typename
SRC
,
typename
DST
>
CV_ALWAYS_INLINE
typename
std
::
enable_if
<
std
::
is_same
<
DST
,
short
>::
value
||
std
::
is_same
<
DST
,
ushort
>::
value
,
void
>::
type
a
ddc_simd_c3_impl
(
const
SRC
*
inx
,
DST
*
outx
,
const
v_float32
&
s1
,
const
v_float32
&
s2
,
a
rithmOpScalar_simd_c3_impl
(
oper_tag
t
,
const
SRC
*
inx
,
DST
*
outx
,
const
v_float32
&
s1
,
const
v_float32
&
s2
,
const
v_float32
&
s3
,
const
int
nlanes
)
{
v_float32
a1
=
vg_load_f32
(
inx
);
...
...
@@ -926,60 +966,62 @@ addc_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float3
v_float32
a5
=
vg_load_f32
(
&
inx
[
2
*
nlanes
]);
v_float32
a6
=
vg_load_f32
(
&
inx
[
5
*
nlanes
/
2
]);
a
ddc_pack_store_c3
(
outx
,
v_round
(
a1
+
s1
),
v_round
(
a2
+
s2
),
v_round
(
a3
+
s3
),
v_round
(
a4
+
s1
),
v_round
(
a5
+
s2
),
v_round
(
a6
+
s3
));
a
rithmOpScalar_pack_store_c3
(
outx
,
v_round
(
oper
(
t
,
a1
,
s1
)
),
v_round
(
oper
(
t
,
a2
,
s2
)
),
v_round
(
oper
(
t
,
a3
,
s3
)
),
v_round
(
oper
(
t
,
a4
,
s1
)
),
v_round
(
oper
(
t
,
a5
,
s2
)
),
v_round
(
oper
(
t
,
a6
,
s3
)
));
}
//-------------------------------------------------------------------------------------------------
template
<
typename
SRC
>
CV_ALWAYS_INLINE
void
a
ddc_simd_c3_impl
(
const
SRC
*
inx
,
uchar
*
outx
,
const
v_float32
&
s1
,
const
v_float32
&
s2
,
const
v_float32
&
s3
,
const
int
nlanes
)
template
<
typename
oper_tag
,
typename
SRC
>
CV_ALWAYS_INLINE
void
a
rithmOpScalar_simd_c3_impl
(
oper_tag
t
,
const
SRC
*
inx
,
uchar
*
outx
,
const
v_float32
&
s1
,
const
v_float32
&
s2
,
const
v_float32
&
s3
,
const
int
nlanes
)
{
vx_store
(
outx
,
v_pack_u
(
v_pack
(
v_round
(
vg_load_f32
(
inx
)
+
s1
),
v_round
(
vg_load_f32
(
&
inx
[
nlanes
/
4
])
+
s2
)),
v_pack
(
v_round
(
vg_load_f32
(
&
inx
[
nlanes
/
2
])
+
s3
),
v_round
(
vg_load_f32
(
&
inx
[
3
*
nlanes
/
4
])
+
s1
))));
v_pack_u
(
v_pack
(
v_round
(
oper
(
t
,
vg_load_f32
(
inx
),
s1
)
),
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
nlanes
/
4
]),
s2
)
)),
v_pack
(
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
nlanes
/
2
]),
s3
)
),
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
3
*
nlanes
/
4
]),
s1
)
))));
vx_store
(
&
outx
[
nlanes
],
v_pack_u
(
v_pack
(
v_round
(
vg_load_f32
(
&
inx
[
nlanes
])
+
s2
),
v_round
(
vg_load_f32
(
&
inx
[
5
*
nlanes
/
4
])
+
s3
)),
v_pack
(
v_round
(
vg_load_f32
(
&
inx
[
3
*
nlanes
/
2
])
+
s1
),
v_round
(
vg_load_f32
(
&
inx
[
7
*
nlanes
/
4
])
+
s2
))));
v_pack_u
(
v_pack
(
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
nlanes
]),
s2
)
),
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
5
*
nlanes
/
4
]),
s3
)
)),
v_pack
(
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
3
*
nlanes
/
2
]),
s1
)
),
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
7
*
nlanes
/
4
]),
s2
)
))));
vx_store
(
&
outx
[
2
*
nlanes
],
v_pack_u
(
v_pack
(
v_round
(
vg_load_f32
(
&
inx
[
2
*
nlanes
])
+
s3
),
v_round
(
vg_load_f32
(
&
inx
[
9
*
nlanes
/
4
])
+
s1
)),
v_pack
(
v_round
(
vg_load_f32
(
&
inx
[
5
*
nlanes
/
2
])
+
s2
),
v_round
(
vg_load_f32
(
&
inx
[
11
*
nlanes
/
4
])
+
s3
))));
v_pack_u
(
v_pack
(
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
2
*
nlanes
]),
s3
)
),
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
9
*
nlanes
/
4
]),
s1
)
)),
v_pack
(
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
5
*
nlanes
/
2
]),
s2
)
),
v_round
(
oper
(
t
,
vg_load_f32
(
&
inx
[
11
*
nlanes
/
4
]),
s3
)
))));
}
//-------------------------------------------------------------------------------------------------
template
<
typename
SRC
>
CV_ALWAYS_INLINE
void
a
ddc_simd_c3_impl
(
const
SRC
*
in
,
float
*
out
,
const
v_float32
&
s1
,
const
v_float32
&
s2
,
const
v_float32
&
s3
,
const
int
nlanes
)
template
<
typename
oper_tag
,
typename
SRC
>
CV_ALWAYS_INLINE
void
a
rithmOpScalar_simd_c3_impl
(
oper_tag
t
,
const
SRC
*
in
,
float
*
out
,
const
v_float32
&
s1
,
const
v_float32
&
s2
,
const
v_float32
&
s3
,
const
int
nlanes
)
{
v_float32
a1
=
vg_load_f32
(
in
);
v_float32
a2
=
vg_load_f32
(
&
in
[
nlanes
]);
v_float32
a3
=
vg_load_f32
(
&
in
[
2
*
nlanes
]);
vx_store
(
out
,
a1
+
s1
);
vx_store
(
&
out
[
nlanes
],
a2
+
s2
);
vx_store
(
&
out
[
2
*
nlanes
],
a3
+
s3
);
vx_store
(
out
,
oper
(
t
,
a1
,
s1
)
);
vx_store
(
&
out
[
nlanes
],
oper
(
t
,
a2
,
s2
)
);
vx_store
(
&
out
[
2
*
nlanes
],
oper
(
t
,
a3
,
s3
)
);
}
//-------------------------------------------------------------------------------------------------
template
<
typename
SRC
,
typename
DST
>
CV_ALWAYS_INLINE
int
addc_simd_c3
(
const
SRC
in
[],
const
float
scalar
[],
DST
out
[],
const
int
length
)
template
<
typename
oper_tag
,
typename
SRC
,
typename
DST
>
CV_ALWAYS_INLINE
int
arithmOpScalar_simd_c3
(
oper_tag
t
,
const
SRC
in
[],
const
float
scalar
[],
DST
out
[],
const
int
length
)
{
constexpr
int
chan
=
3
;
constexpr
int
nlanes
=
vector_type_of_t
<
DST
>::
nlanes
;
...
...
@@ -1002,7 +1044,7 @@ CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[
{
for
(;
x
<=
length
-
lanes
;
x
+=
lanes
)
{
a
ddc_simd_c3_impl
(
&
in
[
x
],
&
out
[
x
],
s1
,
s2
,
s3
,
nlanes
);
a
rithmOpScalar_simd_c3_impl
(
t
,
&
in
[
x
],
&
out
[
x
],
s1
,
s2
,
s3
,
nlanes
);
}
if
(
x
<
length
)
...
...
@@ -1015,8 +1057,12 @@ CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[
return
x
;
}
template
<
typename
SRC
,
typename
DST
>
CV_ALWAYS_INLINE
int
addc_simd_common
(
const
SRC
in
[],
const
float
scalar
[],
DST
out
[],
const
int
length
)
//-------------------------------------------------------------------------------------------------
template
<
typename
oper_tag
,
typename
SRC
,
typename
DST
>
CV_ALWAYS_INLINE
int
arithmOpScalar_simd_common
(
oper_tag
t
,
const
SRC
in
[],
const
float
scalar
[],
DST
out
[],
const
int
length
)
{
constexpr
int
nlanes
=
vector_type_of_t
<
DST
>::
nlanes
;
...
...
@@ -1030,7 +1076,7 @@ CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST
{
for
(;
x
<=
length
-
nlanes
;
x
+=
nlanes
)
{
a
ddc_simd_common_impl
(
&
in
[
x
],
&
out
[
x
],
sc
,
nlanes
);
a
rithmOpScalar_simd_common_impl
(
t
,
&
in
[
x
],
&
out
[
x
],
sc
,
nlanes
);
}
if
(
x
<
length
)
...
...
@@ -1043,24 +1089,25 @@ CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST
return
x
;
}
#define ADDC_SIMD(SRC, DST) \
int addc_simd(const SRC in[], const float scalar[], DST out[], \
const int width, const int chan) \
{ \
const int length = width * chan; \
switch (chan) \
{ \
case 1: \
case 2: \
case 4: \
return addc_simd_common(in, scalar, out, length); \
case 3: \
return addc_simd_c3(in, scalar, out, length); \
default: \
GAPI_Assert(chan <= 4); \
break; \
} \
return 0; \
#define ADDC_SIMD(SRC, DST) \
int addc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan) \
{ \
switch (chan) \
{ \
case 1: \
case 2: \
case 4: \
return arithmOpScalar_simd_common(add_tag{}, in, scalar, out, length); \
case 3: \
return arithmOpScalar_simd_c3(add_tag{}, in, scalar, out, length); \
default: \
GAPI_Assert(chan <= 4); \
break; \
} \
return 0; \
}
ADDC_SIMD
(
uchar
,
uchar
)
...
...
@@ -1082,6 +1129,44 @@ ADDC_SIMD(float, float)
#undef ADDC_SIMD
#define SUBC_SIMD(SRC, DST) \
int subc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan) \
{ \
switch (chan) \
{ \
case 1: \
case 2: \
case 4: \
return arithmOpScalar_simd_common(sub_tag{}, in, scalar, out, length); \
case 3: \
return arithmOpScalar_simd_c3(sub_tag{}, in, scalar, out, length); \
default: \
GAPI_Assert(chan <= 4); \
break; \
} \
return 0; \
}
SUBC_SIMD
(
uchar
,
uchar
)
SUBC_SIMD
(
ushort
,
uchar
)
SUBC_SIMD
(
short
,
uchar
)
SUBC_SIMD
(
float
,
uchar
)
SUBC_SIMD
(
short
,
short
)
SUBC_SIMD
(
ushort
,
short
)
SUBC_SIMD
(
uchar
,
short
)
SUBC_SIMD
(
float
,
short
)
SUBC_SIMD
(
ushort
,
ushort
)
SUBC_SIMD
(
uchar
,
ushort
)
SUBC_SIMD
(
short
,
ushort
)
SUBC_SIMD
(
float
,
ushort
)
SUBC_SIMD
(
uchar
,
float
)
SUBC_SIMD
(
ushort
,
float
)
SUBC_SIMD
(
short
,
float
)
SUBC_SIMD
(
float
,
float
)
#undef SUBC_SIMD
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录