/**
 * By downloading, copying, installing or using the software you agree to this
 * license. If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                          License Agreement
 *               For Open Source Computer Vision Library
 *                       (3-clause BSD License)
 *
 * Copyright (C) 2000-2020, Intel Corporation, all rights reserved.
 * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
 * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
 * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
 * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved.
 * Copyright (C) 2015-2016, Itseez Inc., all rights reserved.
 * Copyright (C) 2019-2020, Xperience AI, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the
 *     contributors may be used to endorse or promote products derived from
 *     this software without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is"
 * and any express or implied warranties, including, but not limited to, the
 * implied warranties of merchantability and fitness for a particular purpose
 * are disclaimed. In no event shall copyright holders or contributors be
 * liable for any direct, indirect, incidental, special, exemplary, or
 * consequential damages (including, but not limited to, procurement of
 * substitute goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether in
 * contract, strict liability, or tort (including negligence or otherwise)
 * arising in any way out of the use of this software, even if advised of the
 * possibility of such damage.
 *
 * ---------------------------------------------------------------------------
 * \file dnn/src/arm_common/warp_affine/warp_affine_cv.cpp
 *
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 *
 * This file has been modified by Megvii ("Megvii Modifications").
 * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights
 * reserved.
 *
 * ---------------------------------------------------------------------------
 */
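// WarpAffine, OpenCV-flavoured (megcv) ARM implementation: destination
// coordinates are mapped through the 2x3 affine matrix in fixed-point
// arithmetic, the output is processed in BLOCK_SZ-bounded tiles that are
// dispatched as independent multi-threaded tasks, and the per-pixel sampling
// is delegated to the shared remap<> helper.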
#include "src/arm_common/warp_affine/warp_affine_cv.h"
#include "src/arm_common/handle.h"
#include "src/common/cv/common.h"
#include "src/common/cv/helper.h"
#include "src/common/cv/interp_helper.h"
#include "src/common/utils.h"
#include "src/common/warp_common.h"

#include <algorithm>
#include <vector>

#include "src/arm_common/simd_macro/marm_neon.h"

using namespace megdnn;
using namespace arm_common;
using namespace megcv;
using namespace warp;

#include "midout.h"
MIDOUT_DECL(megdnn_arm_common_warp_affine_cv)

namespace {

constexpr size_t BLOCK_SZ = 64_z;

template <typename T, InterpolationMode imode, BorderMode bmode, size_t CH>
void warp_affine_cv(const Mat<T>& src, Mat<T>& dst, const float* trans,
                    const float border_value, size_t task_id) {
    // no extra padding
    double M[6];
    rep(i, 6) M[i] = trans[i];
    T bvalue[3] = {(T)border_value, (T)border_value, (T)border_value};

    std::vector<int> _adelta(dst.cols() * 2);
    int *adelta = _adelta.data(), *bdelta = adelta + dst.cols();
    // clang 3.6 cannot deduce that `std::max(10, (int)INTER_BITS)' is a
    // constant, which would cause a compilation error in the subsequent
    // vshrq_n_s32.
    constexpr int AB_BITS = 10 > INTER_BITS ? 10 : INTER_BITS;
    constexpr int AB_SCALE = 1 << AB_BITS;
    size_t dstcols = dst.cols();
    for (size_t x = 0; x < dstcols; ++x) {
        adelta[x] = saturate_cast<int>(M[0] * x * AB_SCALE);
        bdelta[x] = saturate_cast<int>(M[3] * x * AB_SCALE);
    }
    size_t x1, y1, dstrows = dst.rows();
    size_t BLOCK_SZ_H = std::min(BLOCK_SZ / 2, dstrows);
    size_t BLOCK_SZ_W = std::min(BLOCK_SZ * BLOCK_SZ / BLOCK_SZ_H, dstcols);
    BLOCK_SZ_H = std::min(BLOCK_SZ * BLOCK_SZ / BLOCK_SZ_W, dstrows);

    size_t width_block_size = div_ceil(dstcols, BLOCK_SZ_W);
    size_t y = (task_id / width_block_size) * BLOCK_SZ_H;
    size_t x = (task_id % width_block_size) * BLOCK_SZ_W;
    short XY[BLOCK_SZ * BLOCK_SZ * 2], A[BLOCK_SZ * BLOCK_SZ];
    int round_delta = (imode == IMode::INTER_NEAREST
                               ? AB_SCALE / 2
                               : AB_SCALE / INTER_TAB_SIZE / 2);
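    // Worked example of the fixed-point scheme above (illustrative values;
    // AB_BITS = 10 and AB_SCALE = 1024 per this file, INTER_BITS = 5 and
    // INTER_TAB_SIZE = 32 being the usual megcv/OpenCV constants from
    // interp_helper): for a pure translation M = {1, 0, 10, 0, 1, 20} and
    // dst column x = 3,
    //     adelta[3] = round(1 * 3 * 1024) = 3072
    //     X0        = round(10 * 1024) + round_delta
    // With INTER_NEAREST, round_delta = 512, so
    //     X = (10240 + 512 + 3072) >> 10 = 13   // nearest source column, x + 10
    // With bilinear modes, round_delta = 1024 / 32 / 2 = 16; the high bits of
    // (X0 + adelta[x]) >> (AB_BITS - INTER_BITS) give the integer source
    // coordinate and the low INTER_BITS bits index the coefficient table.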
    size_t bw = std::min(BLOCK_SZ_W, dstcols - x);
    size_t bh = std::min(BLOCK_SZ_H, dstrows - y);
    Mat<short> _XY(bh, bw, 2, XY);
    Mat<T> dpart(dst, y, bh, x, bw);
    for (y1 = 0; y1 < bh; ++y1) {
        short* xy = XY + y1 * bw * 2;
        int X0 = saturate_cast<int>((M[1] * (y + y1) + M[2]) * AB_SCALE) +
                 round_delta;
        int Y0 = saturate_cast<int>((M[4] * (y + y1) + M[5]) * AB_SCALE) +
                 round_delta;
        if (imode == IMode::INTER_NEAREST) {
            x1 = 0;
            int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
            // compute 8 (X, Y) pairs per iteration and store them interleaved
            for (; x1 + 8 <= bw; x1 += 8) {
                int16x8x2_t v_dst;
                v_dst.val[0] = vcombine_s16(
                        vqmovn_s32(vshrq_n_s32(
                                vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)),
                                AB_BITS)),
                        vqmovn_s32(vshrq_n_s32(
                                vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)),
                                AB_BITS)));
                v_dst.val[1] = vcombine_s16(
                        vqmovn_s32(vshrq_n_s32(
                                vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)),
                                AB_BITS)),
                        vqmovn_s32(vshrq_n_s32(
                                vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)),
                                AB_BITS)));
                vst2q_s16(xy + (x1 << 1), v_dst);
            }
            for (; x1 < bw; x1++) {
                int X = (X0 + adelta[x + x1]) >> AB_BITS;
                int Y = (Y0 + bdelta[x + x1]) >> AB_BITS;
                xy[x1 * 2] = saturate_cast<short>(X);
                xy[x1 * 2 + 1] = saturate_cast<short>(Y);
            }
        } else {
            // if imode is not INTER_NEAREST
            short* alpha = A + y1 * bw;
            x1 = 0;
            int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0),
                      v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
            for (; x1 + 8 <= bw; x1 += 8) {
                int32x4_t v_X0 = vshrq_n_s32(
                        vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)),
                        AB_BITS - INTER_BITS);
                int32x4_t v_Y0 = vshrq_n_s32(
                        vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)),
                        AB_BITS - INTER_BITS);
                int32x4_t v_X1 = vshrq_n_s32(
                        vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)),
                        AB_BITS - INTER_BITS);
                int32x4_t v_Y1 = vshrq_n_s32(
                        vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)),
                        AB_BITS - INTER_BITS);

                int16x8x2_t v_xy;
                v_xy.val[0] = vcombine_s16(
                        vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)),
                        vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS)));
                v_xy.val[1] = vcombine_s16(
                        vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)),
                        vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS)));
                vst2q_s16(xy + (x1 << 1), v_xy);

                int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(
                        vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS),
                        vandq_s32(v_X0, v_mask)));
                int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(
                        vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS),
                        vandq_s32(v_X1, v_mask)));
                vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1));
            }
            for (; x1 < bw; x1++) {
                int X = (X0 + adelta[x + x1]) >> (AB_BITS - INTER_BITS);
                int Y = (Y0 + bdelta[x + x1]) >> (AB_BITS - INTER_BITS);
                xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
                xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
                alpha[x1] =
                        (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE +
                                (X & (INTER_TAB_SIZE - 1)));
            }
        }
    }
    Mat<ushort> _matA(bh, bw, 1, (ushort*)(A));
    remap<T, imode, bmode, CH, RemapVec<T, CH>>(src, dpart, _XY, _matA,
                                                bvalue);
}

}  // anonymous namespace

void megdnn::arm_common::warp_affine_cv_exec(
        _megdnn_tensor_in src, _megdnn_tensor_in trans, _megdnn_tensor_in dst,
        float border_value, BorderMode bmode, InterpolationMode imode,
        Handle* handle) {
    size_t ch = dst.layout[3];
    size_t width = dst.layout[2];
    size_t height = dst.layout[1];
    const size_t batch = dst.layout.shape[0];

    size_t BLOCK_SZ_H = std::min(BLOCK_SZ / 2, height);
    size_t BLOCK_SZ_W = std::min(BLOCK_SZ * BLOCK_SZ / BLOCK_SZ_H, width);
    BLOCK_SZ_H = std::min(BLOCK_SZ * BLOCK_SZ / BLOCK_SZ_W, height);

    size_t parallelism_batch =
            div_ceil(height, BLOCK_SZ_H) * div_ceil(width, BLOCK_SZ_W);
    megdnn_assert(ch == 1 || ch == 3 || ch == 2,
                  "unsupported src channel: %zu, available channel size: 1/2/3",
                  ch);
    const float* trans_ptr = trans.ptr<dt_float32>();
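    // Task indexing (worked sizing example with illustrative numbers): each
    // task handles one BLOCK_SZ_W x BLOCK_SZ_H tile of one image, so the
    // kernels below are enqueued with batch * parallelism_batch tasks, and
    // each task recovers its image and tile via
    //     batch_id = index / parallelism_batch;
    //     task_id  = index % parallelism_batch;
    // For a 256x256 output with BLOCK_SZ = 64, the tiles come out 128 wide by
    // 32 high, i.e. parallelism_batch = div_ceil(256, 32) * div_ceil(256, 128)
    // = 16 tasks per image.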
    if (dst.layout.dtype.enumv() == DTypeEnum::Float32) {
#define cb(_imode, _bmode, _ch)                                              \
    MIDOUT_BEGIN(megdnn_arm_common_warp_affine_cv, midout_iv(_imode),        \
                 midout_iv(_bmode), midout_iv(_ch), float) {                 \
        auto task = [src, trans_ptr, dst, border_value, parallelism_batch](  \
                            size_t index, size_t) {                          \
            size_t batch_id = index / parallelism_batch;                     \
            size_t task_id = index % parallelism_batch;                      \
            Mat<float> src_mat = TensorND2Mat<float>(src, batch_id);         \
            Mat<float> dst_mat = TensorND2Mat<float>(dst, batch_id);         \
            const float* task_trans_ptr = trans_ptr + batch_id * 2 * 3;      \
            warp_affine_cv<float MEGDNN_COMMA _imode MEGDNN_COMMA _bmode     \
                                   MEGDNN_COMMA _ch>(                        \
                    src_mat MEGDNN_COMMA const_cast<Mat<float>&>(dst_mat)    \
                            MEGDNN_COMMA task_trans_ptr MEGDNN_COMMA         \
                                    border_value,                            \
                    task_id);                                                \
        };                                                                   \
        MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(                               \
                static_cast<naive::HandleImpl*>(handle),                     \
                batch* parallelism_batch, task);                             \
    }                                                                        \
    MIDOUT_END();
        DISPATCH_IMODE(imode, bmode, ch, cb)
    } else if (dst.layout.dtype.enumv() == DTypeEnum::Uint8) {
#undef cb
#define cb(_imode, _bmode, _ch)                                              \
    MIDOUT_BEGIN(megdnn_arm_common_warp_affine_cv, midout_iv(_imode),        \
                 midout_iv(_bmode), midout_iv(_ch), uchar) {                 \
        auto task = [src, trans_ptr, dst, border_value, parallelism_batch](  \
                            size_t index, size_t) {                          \
            size_t batch_id = index / parallelism_batch;                     \
            size_t task_id = index % parallelism_batch;                      \
            Mat<uchar> src_mat = TensorND2Mat<uchar>(src, batch_id);         \
            Mat<uchar> dst_mat = TensorND2Mat<uchar>(dst, batch_id);         \
            const float* task_trans_ptr = trans_ptr + batch_id * 2 * 3;      \
            warp_affine_cv<uchar MEGDNN_COMMA _imode MEGDNN_COMMA _bmode     \
                                   MEGDNN_COMMA _ch>(                        \
                    src_mat MEGDNN_COMMA const_cast<Mat<uchar>&>(dst_mat)    \
                            MEGDNN_COMMA task_trans_ptr MEGDNN_COMMA         \
                                    border_value,                            \
                    task_id);                                                \
        };                                                                   \
        MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(                               \
                static_cast<naive::HandleImpl*>(handle),                     \
                batch* parallelism_batch, task);                             \
    }                                                                        \
    MIDOUT_END();
        DISPATCH_IMODE(imode, bmode, ch, cb)
#undef cb
    } else {
        megdnn_throw(megdnn_mangle("Unsupported datatype of WarpAffine optr."));
    }
}

// vim: syntax=cpp.doxygen