From 5d33bf7219aec3406ee6baf6e6592873cdd13955 Mon Sep 17 00:00:00 2001 From: MaoXianxin Date: Thu, 10 Jun 2021 10:21:45 +0800 Subject: [PATCH] =?UTF-8?q?OpenCV=E5=BC=80=E5=8F=91=E7=B3=BB=E5=88=97=201?= =?UTF-8?q?=EF=BC=9A=E5=A6=82=E4=BD=95=E4=B8=BADNN=E5=A2=9E=E5=8A=A0Tengin?= =?UTF-8?q?e=E5=90=8E=E7=AB=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..8cd9dce --- /dev/null +++ b/README.md @@ -0,0 +1,235 @@ +# OpenCV开发系列 1:如何为DNN增加Tengine后端 + +OpenCV 4.3.0集成Tengine为DNN模块的一个后端,实现了DNN在ARM上的推理速度最快达到翻倍。OpenCV 4.5.0 Tengine升级为Tengine-Lite,又将DNN的速度最高缩短207%。OpenCV为什么将Tengine作为DNN ARM后端?为DNN添加Tengine后端或者其它新的后端要怎样开发?OPEN AI LAB(开放智能)Tengine for OpenCV项目负责人李琦工程师对此进行了详细介绍。 + +![](https://maoxianxin1996.oss-accelerate.aliyuncs.com/codechina/20210610101605.png) + +很荣幸能加入OPEN AI LAB,遇到一些很棒的人和事,这样层层的荣幸叠加,让我有幸能遇到OpenCV中国团队,并且能借此将Tengine和OpenCV结合起来。我这篇文章将围绕OpenCV里面集成Tengine的这项功能的开发流程来讲。 + +Tengine是OPEN AI LAB(开放智能)的开源边缘AI推理框架,本身是聚焦在端侧的推理,针对ARM不同的核都有不同的汇编优化实现,在现在国内推理框架层出不穷的时代,Tengine还能稳稳的守住性能王者的位置,也是得益于这一块的优化能力。大家肯定也知道,OpenCV是宇宙最强的计算机视觉库,在神经网络大火的年代也是很早就做了很全的推理的实现,而且接口简单,对老用户来说极其方便,但是在ARM上的性能确实也是还有很大的优化空间。在这样的一个前提下,强强联合,便产生了这样的一个需求。 + +实现的总体方案是先解决性能的大头,神经网络推理性能耗时八成是在卷积的计算,Tengine在卷积的实现上采用了高效的手工汇编优化,所以就按照**将卷积移植到OpenCV**的逻辑来做,如下图示: + +![](https://maoxianxin1996.oss-accelerate.aliyuncs.com/codechina/20210610101630.png) + +这里主要有以下两个问题: + +- 如何在OpenCV的卷积运算的时候调用Tengine?这里面包括了OpenCV的图调用逻辑、卷积的调用逻辑、卷积的参数传递、数据排布等等兼容性问题; +- 如何将Tengine顺利嫁接到OpenCV上?仅仅移植卷积实现,还是移植整个Tengine的架构?编译如何无缝链接? 
+ +在方案的早期基本就确定了将Tengine作为整体嵌入,编译直接对接,卷积计算以整个图的方式被调用,并以单层构图的方式运行,逻辑如下图示: + +![](https://maoxianxin1996.oss-accelerate.aliyuncs.com/codechina/20210610101652.png) + +这种方式将Tengine作为一个外挂的库动态编译到OpenCV中,并且被调用执行,需要完成以下工作来实现: + +- **OpenCV的集成编译**。此步骤需要在OpenCV编译的时候将Tengine编译进去,涉及到了解OpenCV的编译以及Tengine的编译和调用。 +- **卷积计算图的调用**。要了解OpenCV的单层计算的参数传递和流程,保证能顺利调用Tengine进行计算。 +- **完整的测试**。包括OpenCV的CI测试和性能测试。 + +![](https://maoxianxin1996.oss-accelerate.aliyuncs.com/codechina/20210610101722.png) + +**集成编译** + +下图是集成编译的调用关系。 + +![](https://maoxianxin1996.oss-accelerate.aliyuncs.com/codechina/20210610101741.png) + +实际代码的修改和解释包括: + +**a. 主CMakeLists.txt** + +![](https://maoxianxin1996.oss-accelerate.aliyuncs.com/codechina/20210610101757.png) + +**b.** **opencv/cmake/OpenCVFindTengine.cmake** + +``` +set(OPENCV_LIBTENGINE_ROOT_DIR "" CACHE PATH "Where to look for additional OpenCV modules (can be ;-separated list of paths)") ## 设置用户可配置Tengine的目录。 + +IF(OPENCV_LIBTENGINE_ROOT_DIR) ## 如果配置了Tengine的目录,使能对应的开关 + MESSAGE(STATUS "TENGINE:-- Set tengine lib dir by user ") + + SET(Tengine_FOUND ON) + set(BUILD_TENGINE OFF) + + SET(Tengine_INCLUDE_DIR ${OPENCV_LIBTENGINE_ROOT_DIR}/include) + SET(Tengine_LIB ${OPENCV_LIBTENGINE_ROOT_DIR}/lib/libtengine.a) + +ELSE() ## 如果没有配置目录,就会调用到tengine.cmake的脚本去下载tengine源码,并编译 + + MESSAGE(STATUS "TENGINE:-- Auto download Tengine source code. ") + include("${OpenCV_SOURCE_DIR}/3rdparty/libtengine/tengine.cmake") + +ENDIF() + +IF(NOT Tengine_LIB) ## 对库文件的检测,如果没有,会报异常,并关掉Tengine + SET(Tengine_FOUND OFF) + MESSAGE(STATUS "#### Could not find Tengine lib. 
Turning Tengine_FOUND off") +ENDIF() + +IF (Tengine_FOUND) ## 不管是配置了库,还是自动下载源码了,此处都会配置相关的头文件和库文件路径 + MESSAGE(STATUS "Found Tengine include: ${Tengine_INCLUDE_DIR}") + MESSAGE(STATUS "Found Tengine libraries: ${Tengine_LIB}") + set(HAVE_TENGINE 1) + set(TENGINE_LIBRARIES ${Tengine_LIB}) + set(TENGINE_INCLUDE_DIRS ${Tengine_INCLUDE_DIR}) +ENDIF (Tengine_FOUND) + +MESSAGE(STATUS "Tengine include is:" ${Tengine_INCLUDE_DIR}) +MESSAGE(STATUS "Tengine library is:" ${Tengine_LIB}) + +MARK_AS_ADVANCED( + Tengine_INCLUDE_DIR + Tengine_LIB + Tengine +) +``` + +**c.** **opencv/3rdparty/libtengine/tengine.cmake** + +``` +SET(TENGINE_VERSION "tengine-opencv") +SET(OCV_TENGINE_DSTDIRECTORY ${OpenCV_BINARY_DIR}/3rdparty/libtengine) +SET(DEFAULT_OPENCV_TENGINE_SOURCE_PATH ${OCV_TENGINE_DSTDIRECTORY}/Tengine-${TENGINE_VERSION}) + +IF(EXISTS ${DEFAULT_OPENCV_TENGINE_SOURCE_PATH}) +## 如果存在Tengine已经下载好的源码,那么不会重复下载,自动编译即可 + MESSAGE(STATUS "Tengine is exist already .") + + SET(Tengine_FOUND ON) + set(BUILD_TENGINE ON) +ELSE() + SET(OCV_TENGINE_FILENAME "${TENGINE_VERSION}.zip") #name2 + SET(OCV_TENGINE_URL "https://github.com/OAID/Tengine/archive/") #url2 + SET(tengine_md5sum 9c80d91dc8413911522ec80cde013ae2) #md5sum2 + + MESSAGE(STATUS "**** TENGINE DOWNLOAD BEGIN ****") + ocv_download(FILENAME ${OCV_TENGINE_FILENAME} ## 下载Tengine源码 + HASH ${tengine_md5sum} + URL + "${OPENCV_TENGINE_URL}" + "$ENV{OPENCV_TENGINE_URL}" + "${OCV_TENGINE_URL}" + DESTINATION_DIR ${OCV_TENGINE_DSTDIRECTORY} + ID TENGINE + STATUS res + UNPACK RELATIVE_URL) + + if (NOT res) ## 下载不成功,关掉TENGINE + MESSAGE(STATUS "TENGINE DOWNLOAD FAILED .Turning Tengine_FOUND off.") + SET(Tengine_FOUND OFF) + else () + MESSAGE(STATUS "TENGINE DOWNLOAD success . 
") + SET(Tengine_FOUND ON) + set(BUILD_TENGINE ON) + endif() +ENDIF() + +if (BUILD_TENGINE) + set(HAVE_TENGINE 1) + + # android system + if(ANDROID) ## 配置android系统下需要传递给tengine的参数,是arm32还是arm64 + if(${ANDROID_ABI} STREQUAL "armeabi-v7a") + set(CONFIG_ARCH_ARM32 ON) + elseif(${ANDROID_ABI} STREQUAL "arm64-v8a") + set(CONFIG_ARCH_ARM64 ON) + endif() + endif() + + # linux system ## 配置linux系统下需要传递给tengine的参数,是arm32还是arm64 + if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm) + set(CONFIG_ARCH_ARM32 ON) + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) ## AARCH64 + set(CONFIG_ARCH_ARM64 ON) + endif() + + SET(DEFAULT_OPENCV_TENGINE_SOURCE_PATH ${OCV_TENGINE_DSTDIRECTORY}/Tengine-${TENGINE_VERSION}) + set(BUILT_IN_OPENCV ON) ## set for tengine compile discern. + set(Tengine_INCLUDE_DIR ${DEFAULT_OPENCV_TENGINE_SOURCE_PATH}/core/include) + set(Tengine_LIB ${CMAKE_BINARY_DIR}/lib/${ANDROID_ABI}/libtengine.a) + if ( IS_DIRECTORY ${DEFAULT_OPENCV_TENGINE_SOURCE_PATH}) ## 添加编译Tengine + add_subdirectory("${DEFAULT_OPENCV_TENGINE_SOURCE_PATH}" ${OCV_TENGINE_DSTDIRECTORY}/build) + endif() +endif() +``` + +**d. 
modules/dnn/CMakeLists.txt** + +![](https://maoxianxin1996.oss-accelerate.aliyuncs.com/codechina/20210610101815.png) + +完成如上修改基本上就达到了可以直接从OpenCV中调用Tengine,自动下载Tengine并且编译好给后面卷积计算的调用和链接。 + +**卷积推理的调用** + +关于卷积的计算调用流程如下: + +![](https://maoxianxin1996.oss-accelerate.aliyuncs.com/codechina/20210610101829.png) + +看上图就会明白,如果需要修改卷积最底层的实现,最终需要修改和了解的是接口:**cv::dnn::ConvolutionLayerImpl::forward**。该接口的实现是在文件convolution_layer.cpp 中。 + +实际上,在该接口中调用Tengine还需要了解卷积计算需要的一些参数,以下是实际调用的参数传递过程: + +``` +bool tengine_ret = tengine_forward(input_, inch, ngroups, in_h, in_w, ## 输入的数据和尺寸 + output_, out_b, outch, out_h, out_w, ## 输出的数据和尺寸 + kernel_, kernel_size.size(), kernel.height, kernel.width, ##输入的参数和尺寸 + teg_bias, stride.height, stride.width, + pad.height, pad.width, dilation.height, dilation.width, + weightsMat.step1(), padMode); +``` + +详细实现如下: + +``` +// 添加头文件 +#ifdef HAVE_TENGINE +#include "../tengine4dnn/include/tengine_graph_convolution.hpp" +#endif + +#ifdef HAVE_TENGINE + int inch = inputs[0].size[1]; // inch + int in_h = inputs[0].size[2]; // in_h + int in_w = inputs[0].size[3]; // in_w + + int out_b = outputs[0].size[0]; // out batch size + int outch = outputs[0].size[1]; // outch + int out_h = outputs[0].size[2]; // out_h + int out_w = outputs[0].size[3]; // out_w + + float *input_ = inputs[0].ptr(); + float *output_ = outputs[0].ptr(); + float *kernel_ = weightsMat.ptr(); + float *teg_bias = &biasvec[0]; +## 调用tengine的forward,所有的参数都在该函数传递进去 + bool tengine_ret = tengine_forward(input_, inch, ngroups, in_h, in_w, + output_, out_b, outch, out_h, out_w, + kernel_, kernel_size.size(), kernel.height, kernel.width, + teg_bias, stride.height, stride.width, + pad.height, pad.width, dilation.height, dilation.width, + weightsMat.step1(), padMode); + /* activation */ + if((true == tengine_ret) && activ ) +## 如果Tengine推理成功且带有activation的实现,则会调用OpenCV去进行activation的计算 + { + int out_cstep = out_h * out_w; // out_cstep + + ParallelConv::run(inputs[0], outputs[0], weightsMat, biasvec, 
reluslope, + kernel_size, strides, pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes); + ActivationLayer* activ_ = activ.get(); + activ_->forwardSlice(output_, output_, out_cstep, out_cstep, 0, outch); + } + if(false == tengine_ret) ## 如果使用tengine推理失败,会自动调用OpenCV原始的实现 +#endif + { + int nstripes = std::max(getNumThreads(), 1); + + ParallelConv::run(inputs[0], outputs[0], weightsMat, biasvec, reluslope, + kernel_size, strides, pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes); + } + } +``` + +上面就是将Tengine集成进OpenCV的最主要两大块工作的介绍,实际上还有更多的技术细节此处没有涉及到。比如Tengine里面怎么实现单层的卷积计算,怎么能完全复用OpenCV传递过来的数据地址,而不做重复的数据拷贝,性能的提升主要原因,在编译成功Tengine的库之后怎么能在DNN模块里面调用到Tengine的接口,OpenCV里面自动下载第三方的库是怎么实现的,有没有其他路径,每个convolution都创建一遍图对性能不会有很大的损耗吗?CI测试等等。由于篇幅有限,此处不做介绍,这些将会在后续的技术博文中进行一一介绍。OpenCV是一个宝藏,大家可以多去看看相关代码探秘。 + +![](https://maoxianxin1996.oss-accelerate.aliyuncs.com/codechina/20210608112105.png) \ No newline at end of file -- GitLab