diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmeticself.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmeticself.cl new file mode 100644 index 0000000000000000000000000000000000000000..37bb67be29e70696c3198939671a389e680611e3 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmeticself.cl @@ -0,0 +1,450 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; + +__kernel void ArithmeticSelf_ElementAbs_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = result.x >= 0 ? result.x : -result.x; + result.y = result.y >= 0 ? result.y : -result.y; + result.z = result.z >= 0 ? result.z : -result.z; + result.w = result.w >= 0 ? result.w : -result.w; + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementAbs_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = result.x >= 0 ? result.x : -result.x; + result.y = result.y >= 0 ? result.y : -result.y; + result.z = result.z >= 0 ? result.z : -result.z; + result.w = result.w >= 0 ? result.w : -result.w; + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = cos(result.x); + result.y = cos(result.y); + result.z = cos(result.z); + result.w = cos(result.w); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementCos_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = cos(result.x); + result.y = cos(result.y); + result.z = cos(result.z); + result.w = cos(result.w); + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = sin(result.x); + result.y = sin(result.y); + result.z = sin(result.z); + result.w = sin(result.w); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementSin_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = sin(result.x); + result.y = sin(result.y); + result.z = sin(result.z); + result.w = sin(result.w); + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementTanh_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = tanh(result.x); + result.y = tanh(result.y); + result.z = tanh(result.z); + result.w = tanh(result.w); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementTanh_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = tanh(result.x); + result.y = tanh(result.y); + result.z = tanh(result.z); + result.w = tanh(result.w); + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = -result.x; + result.y = -result.y; + result.z = -result.z; + result.w = -result.w; + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementNeg_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = -result.x; + result.y = -result.y; + result.z = -result.z; + result.w = -result.w; + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = exp(result.x); + result.y = exp(result.y); + result.z = exp(result.z); + result.w = exp(result.w); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementExp_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = exp(result.x); + result.y = exp(result.y); + result.z = exp(result.z); + result.w = exp(result.w); + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = result.x > 0 ? log(result.x) : HUGE_VALF; + result.y = result.y > 0 ? log(result.y) : HUGE_VALF; + result.z = result.z > 0 ? log(result.z) : HUGE_VALF; + result.w = result.w > 0 ? log(result.w) : HUGE_VALF; + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementLog_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = result.x > 0 ? log(result.x) : HUGE_VALF; + result.y = result.y > 0 ? log(result.y) : HUGE_VALF; + result.z = result.z > 0 ? log(result.z) : HUGE_VALF; + result.w = result.w > 0 ? log(result.w) : HUGE_VALF; + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = result.x * result.x; + result.y = result.y * result.y; + result.z = result.z * result.z; + result.w = result.w * result.w; + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementSquare_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = result.x * result.x; + result.y = result.y * result.y; + result.z = result.z * result.z; + result.w = result.w * result.w; + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = result.x > 0 ? sqrt(result.x) : HUGE_VALF; + result.y = result.y > 0 ? sqrt(result.y) : HUGE_VALF; + result.z = result.z > 0 ? sqrt(result.z) : HUGE_VALF; + result.w = result.w > 0 ? sqrt(result.w) : HUGE_VALF; + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementSqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = result.x > 0 ? sqrt(result.x) : HUGE_VALF; + result.y = result.y > 0 ? sqrt(result.y) : HUGE_VALF; + result.z = result.z > 0 ? sqrt(result.z) : HUGE_VALF; + result.w = result.w > 0 ? sqrt(result.w) : HUGE_VALF; + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = result.x > 0 ? 1.0f / sqrt(result.x) : HUGE_VALF; + result.y = result.y > 0 ? 1.0f / sqrt(result.y) : HUGE_VALF; + result.z = result.z > 0 ? 1.0f / sqrt(result.z) : HUGE_VALF; + result.w = result.w > 0 ? 1.0f / sqrt(result.w) : HUGE_VALF; + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementRsqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = result.x > 0 ? 1.0f / sqrt(result.x) : HUGE_VALF; + result.y = result.y > 0 ? 1.0f / sqrt(result.y) : HUGE_VALF; + result.z = result.z > 0 ? 1.0f / sqrt(result.z) : HUGE_VALF; + result.w = result.w > 0 ? 1.0f / sqrt(result.w) : HUGE_VALF; + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = result.x > 0 || result.x < 0 ? false : true; + result.y = result.y > 0 || result.y < 0 ? false : true; + result.z = result.z > 0 || result.z < 0 ? false : true; + result.w = result.w > 0 || result.w < 0 ? false : true; + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementLogicalNot_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = result.x > 0 || result.x < 0 ? false : true; + result.y = result.y > 0 || result.y < 0 ? false : true; + result.z = result.z > 0 || result.z < 0 ? false : true; + result.w = result.w > 0 || result.w < 0 ? false : true; + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = floor(result.x); + result.y = floor(result.y); + result.z = floor(result.z); + result.w = floor(result.w); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementFloor_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = floor(result.x); + result.y = floor(result.y); + result.z = floor(result.z); + result.w = floor(result.w); + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = ceil(result.x); + result.y = ceil(result.y); + result.z = ceil(result.z); + result.w = ceil(result.w); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementCeil_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = ceil(result.x); + result.y = ceil(result.y); + result.z = ceil(result.z); + result.w = ceil(result.w); + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X))); + result.x = round(result.x); + result.y = round(result.y); + result.z = round(result.z); + result.w = round(result.w); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void ArithmeticSelf_ElementRound_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, + int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); + result.x = round(result.x); + result.y = round(result.y); + result.z = round(result.z); + result.w = round(result.w); + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc new file mode 100644 index 0000000000000000000000000000000000000000..abd70a3f5a331b5f2d23c86cbc629e4443095972 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc @@ -0,0 +1,240 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include "src/kernel_registry.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/kernel/arithmetic_self.h" +#include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc" + +using mindspore::kernel::KERNEL_ARCH::kGPU; +using mindspore::lite::KernelRegistrar; +using mindspore::schema::PrimitiveType_Abs; +using mindspore::schema::PrimitiveType_Ceil; +using mindspore::schema::PrimitiveType_Cos; +using mindspore::schema::PrimitiveType_Exp; +using mindspore::schema::PrimitiveType_Floor; +using mindspore::schema::PrimitiveType_Log; +using mindspore::schema::PrimitiveType_LogicalNot; +using mindspore::schema::PrimitiveType_Neg; +using mindspore::schema::PrimitiveType_Round; +using mindspore::schema::PrimitiveType_Rsqrt; +using mindspore::schema::PrimitiveType_Sin; +using mindspore::schema::PrimitiveType_Sqrt; +using mindspore::schema::PrimitiveType_Square; + +namespace mindspore::kernel { + +int ArithmeticSelfOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + size_t im_dst_x, im_dst_y; + if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + im_dst_x = out_tensors_[0]->Width() * CO4; + im_dst_y = out_tensors_[0]->Height() * out_tensors_[0]->Batch(); + } else { + im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * CO4; + im_dst_x = out_tensors_[0]->Width(); + } + size_t img_dtype = CL_FLOAT; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto enable_fp16_ = ocl_runtime->GetFp16Enable(); + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } + img_size->clear(); + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} + +void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param) { + switch (param->op_parameter_.type_) { + case PrimitiveType_Abs: + kernel_name[0] += "_ElementAbs"; + break; + case PrimitiveType_Cos: + kernel_name[0] += "_ElementCos"; + break; + case PrimitiveType_Exp: + kernel_name[0] += "_ElementExp"; + break; + case PrimitiveType_Log: + kernel_name[0] += "_ElementLog"; + break; + case PrimitiveType_Square: + kernel_name[0] += "_ElementSquare"; + break; + case PrimitiveType_Sqrt: + kernel_name[0] += "_ElementSqrt"; + break; + case PrimitiveType_Rsqrt: + kernel_name[0] += "_ElementRsqrt"; + break; + case PrimitiveType_Sin: + kernel_name[0] += "_ElementSin"; + break; + case PrimitiveType_LogicalNot: + kernel_name[0] += "_ElementLogicalNot"; + break; + case PrimitiveType_Floor: + kernel_name[0] += "_ElementFloor"; + break; + case PrimitiveType_Ceil: + kernel_name[0] += "_ElementCeil"; + break; + case PrimitiveType_Round: + kernel_name[0] += "_ElementRound"; + case PrimitiveType_Neg: + kernel_name[0] += "_ElementNeg"; + break; + default: + break; + } +} + +int ArithmeticSelfOpenCLKernel::Init() { + if (in_tensors_[0]->shape().size() != 4) { + MS_LOG(ERROR) << " only support dim = 4 "; + return RET_ERROR; + } + auto param = reinterpret_cast(this->op_parameter_); + + auto in_format = op_format_; + if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) { + MS_LOG(ERROR) << "input format(" << in_format << ") " + << "format not support!"; + return RET_ERROR; + } + in_ori_format_ = in_tensors_[0]->GetFormat(); + in_tensors_[0]->SetFormat(op_format_); + out_ori_format_ = out_tensors_[0]->GetFormat(); + out_tensors_[0]->SetFormat(op_format_); + + std::string kernel_name = "ArithmeticSelf"; + GetKernelName(&kernel_name, param); + if (in_format == schema::Format_NC4HW4) { + kernel_name += "_NC4HW4"; + } else if (in_format == schema::Format_NHWC4) { + kernel_name += "_NHWC4"; + } + MS_LOG(DEBUG) << "execute kernel name : " << kernel_name; + std::set build_options; + std::string source = arithmeticself_source; + std::string program_name = "ArithmeticSelf"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->LoadSource(program_name, source); + ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); + + return RET_OK; +} + +int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; } + +int ArithmeticSelfGetBiggestDividerWithPriority(int number, int max_divider) { + if (number % 8 == 0 && max_divider >= 8) { + return number / 8; + } + if (number % 4 == 0 && 4 <= max_divider) { + return number / 4; + } + if (number % 2 == 0 && 2 <= max_divider) { + return number / 2; + } + + for (int i = max_divider; i != 0; i--) { + if (number % i == 0) { + return i; + } + } + return RET_OK; +} + +void ArithmeticSelfGetWorkGroup(const std::vector &global, std::vector *local, int max_size) { + const int max_divider = 8; + const int max_x = 4, max_y = 8; + int x = std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[0], max_divider), max_x); + int yz = max_size / x; + int y = std::min(std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); + int z = std::min(yz / y, static_cast(UP_DIV(global[2], 2))); + + local->clear(); + local->push_back(x); + local->push_back(y); + local->push_back(z); +} + +int ArithmeticSelfOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running! "; + + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto output_shape = out_tensors_[0]->shape(); + cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)}; + + uint32_t OH = output_shape[0] * output_shape[1]; // N*H + uint32_t OW = output_shape[2]; + uint32_t OC = UP_DIV(output_shape[3], C4NUM); + + const std::vector &max_global = ocl_runtime->GetWorkItemSize(); + std::vector local = {1, 1, 1}; // init local + std::vector global = {OH, OW, OC}; + ArithmeticSelfGetWorkGroup(global, &local, max_global[0]); + + int arg_cn = 0; + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); + + ocl_runtime->RunKernel(kernel_, global, local, nullptr); + + return RET_OK; +} + +kernel::LiteKernel *OpenCLArithmeticSelfKernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + auto *kernel = new (std::nothrow) ArithmeticSelfOpenCLKernel(opParameter, inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << " new ArithmeticSelfOpenCLKernel failed "; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << " Init kernel failed, name: ArithmeticSelf "; + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLArithmeticSelfKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLArithmeticSelfKernelCreator) + +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h new file mode 100644 index 0000000000000000000000000000000000000000..7c49e765872f78d41adf580793ce0f9a9603bf25 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h @@ -0,0 +1,52 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_SELF_PARAMETER_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_SELF_PARAMETER_H_ + +#include +#include +#include "ir/anf.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "nnacl/arithmetic_self_parameter.h" + +namespace mindspore::kernel { + +class ArithmeticSelfOpenCLKernel : public OpenCLKernel { + public: + explicit ArithmeticSelfOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs) {} + + ~ArithmeticSelfOpenCLKernel() override{}; + + int Init() override; + + int ReSize() override; + + int Run() override; + + int GetImageSize(size_t idx, std::vector *img_size) override; + + void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param); + + private: + cl::Kernel kernel_; +}; + +} // namespace mindspore::kernel +#endif diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index 53cd41f07fd0d13b0431682b85cd62d1e5689ed3..c15a0723d43c9852f87e89c45449e7cfdfcc51a9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -114,7 +114,11 @@ int TransposeOpenCLKernel::Run() { cl_int2 C = {c, c4}; int arg_idx = 0; ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data()); + if (out_mem_type_ == OpenCLMemType::BUF) { + ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data(), lite::opencl::MemType::BUF); + } else { + ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data()); + } ocl_runtime->SetKernelArg(kernel_, arg_idx++, HW); ocl_runtime->SetKernelArg(kernel_, arg_idx++, C); ocl_runtime->SetKernelArg(kernel_, arg_idx++, w); diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt index f83e8398231ced277e190ff826c78ffedc4528ca..f0011c79b8c48ebe54591959d466469e79052b4e 100644 --- a/mindspore/lite/test/CMakeLists.txt +++ b/mindspore/lite/test/CMakeLists.txt @@ -150,6 +150,7 @@ if (SUPPORT_GPU) ${LITE_DIR}/src/runtime/kernel/opencl/kernel/matmul.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/softmax.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/concat.cc + ${LITE_DIR}/src/runtime/kernel/opencl/kernel/arithmetic_self.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/batchnorm.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/slice.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/activation.cc @@ -334,6 +335,7 @@ if (SUPPORT_GPU) ${TEST_DIR}/ut/src/runtime/kernel/opencl/concat_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/batchnorm_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/slice_tests.cc + ${TEST_DIR}/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/softmax_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/arithmetic_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc new file mode 100644 index 0000000000000000000000000000000000000000..1289b7f85e1ab2c83588af46045ec02561882354 --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc @@ -0,0 +1,133 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include "utils/log_adapter.h" +#include "common/common_test.h" +#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" +#include "mindspore/lite/src/common/file_utils.h" +#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" +#include "mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h" + +namespace mindspore { +class TestArithmeticSelfOpenCLfp16 : public mindspore::CommonTest { + public: + TestArithmeticSelfOpenCLfp16() {} +}; + +template +void CompareOutputData1(T *input_data1, T *output_data, T *correct_data, int size, float err_bound) { + for (size_t i = 0; i < 100; i++) { + T abs = fabs(output_data[i] - correct_data[i]); + ASSERT_LE(abs, err_bound); + } +} + +TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfOpenCLFp16) { + MS_LOG(INFO) << " begin test "; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->SetFp16Enable(true); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + + // get the input from .bin + size_t input1_size, output_size; + std::string input1Ppath = "./test_data/in_arithmetic_selffp16.bin"; + std::string correctOutputPath = "./test_data/out_arithmetic_selffp16.bin"; + auto input_data1 = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto correctOutput = + reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); + + MS_LOG(INFO) << " init tensors "; + + std::vector shape = {1, 19, 19, 96}; + auto data_type = kNumberTypeFloat16; + auto tensor_type = schema::NodeType_ValueNode; + auto *input_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC, tensor_type); + auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC, tensor_type); + if (input_tensor == nullptr || output_tensor == nullptr) { + MS_LOG(INFO) << " new input_tensor or output_tensor failed "; + return; + } + std::vector inputs{input_tensor}; + std::vector outputs{output_tensor}; + + MS_LOG(INFO) << " initialize param "; + auto param = new (std::nothrow) ArithmeticSelfParameter(); + if (param == nullptr) { + MS_LOG(INFO) << " new ConcatParameter failed "; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + return; + } + param->op_parameter_.type_ = schema::PrimitiveType_Neg; + auto *arithmeticself_kernel = + new (std::nothrow) kernel::ArithmeticSelfOpenCLKernel(reinterpret_cast(param), inputs, outputs); + if (arithmeticself_kernel == nullptr) { + MS_LOG(INFO) << " new kernel::ArithmeticSelfOpenCLKernel failed "; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + return; + } + arithmeticself_kernel->SetFormatType(schema::Format_NC4HW4); + arithmeticself_kernel->Init(); + // to do allocate memory for inputs and outputs + for (auto &input_tensor : inputs) { + input_tensor->MallocData(allocator); + } + MS_LOG(INFO) << " initialize sub_graph "; + std::vector kernels{arithmeticself_kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete arithmeticself_kernel; + return; + } + sub_graph->Init(); + MS_LOG(INFO) << " initialize input data "; + memcpy(inputs[0]->Data(), input_data1, input1_size); + + std::cout << "==================output data================" << std::endl; + sub_graph->Run(); + auto *output_data_gpu = reinterpret_cast(output_tensor->Data()); + CompareOutputData1(input_data1, output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete arithmeticself_kernel; + delete sub_graph; +} +} // namespace mindspore