diff --git a/lite/backends/apu/neuron_adapter.cc b/lite/backends/apu/neuron_adapter.cc index 953c92d1828848bd030a65cb2a8af0eac0674ca1..ff08507504b8bd7e5342c5705afb17550f37469e 100644 --- a/lite/backends/apu/neuron_adapter.cc +++ b/lite/backends/apu/neuron_adapter.cc @@ -82,16 +82,20 @@ void NeuronAdapter::InitFunctions() { PADDLE_DLSYM(NeuronModel_setOperandValue); PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams); PADDLE_DLSYM(NeuronModel_addOperation); + PADDLE_DLSYM(NeuronModel_addOperationExtension); PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs); PADDLE_DLSYM(NeuronCompilation_create); PADDLE_DLSYM(NeuronCompilation_free); PADDLE_DLSYM(NeuronCompilation_finish); + PADDLE_DLSYM(NeuronCompilation_createForDevices); PADDLE_DLSYM(NeuronExecution_create); PADDLE_DLSYM(NeuronExecution_free); PADDLE_DLSYM(NeuronExecution_setInput); PADDLE_DLSYM(NeuronExecution_setOutput); PADDLE_DLSYM(NeuronExecution_compute); - + PADDLE_DLSYM(Neuron_getDeviceCount); + PADDLE_DLSYM(Neuron_getDevice); + PADDLE_DLSYM(NeuronDevice_getName); #undef PADDLE_DLSYM } @@ -146,6 +150,25 @@ int NeuronModel_addOperation(NeuronModel* model, model, type, inputCount, inputs, outputCount, outputs); } +int NeuronModel_addOperationExtension(NeuronModel* model, + const char* name, + const char* vendor, + const NeuronDevice* device, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs) { + return paddle::lite::NeuronAdapter::Global() + ->NeuronModel_addOperationExtension()(model, + name, + vendor, + device, + inputCount, + inputs, + outputCount, + outputs); +} + int NeuronModel_identifyInputsAndOutputs(NeuronModel* model, uint32_t inputCount, const uint32_t* inputs, @@ -172,6 +195,15 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) { compilation); } +int NeuronCompilation_createForDevices(NeuronModel* model, + const NeuronDevice* const* devices, + uint32_t numDevices, + NeuronCompilation** compilation) { + return paddle::lite::NeuronAdapter::Global() + ->NeuronCompilation_createForDevices()( + model, devices, numDevices, compilation); +} + int NeuronExecution_create(NeuronCompilation* compilation, NeuronExecution** execution) { return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()( @@ -205,3 +237,18 @@ int NeuronExecution_compute(NeuronExecution* execution) { return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()( execution); } + +int Neuron_getDeviceCount(uint32_t* numDevices) { + return paddle::lite::NeuronAdapter::Global()->Neuron_getDeviceCount()( + numDevices); +} + +int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device) { + return paddle::lite::NeuronAdapter::Global()->Neuron_getDevice()(devIndex, + device); +} + +int NeuronDevice_getName(const NeuronDevice* device, const char** name) { + return paddle::lite::NeuronAdapter::Global()->NeuronDevice_getName()(device, + name); +} diff --git a/lite/backends/apu/neuron_adapter.h b/lite/backends/apu/neuron_adapter.h index c08db73279ea3969300c8f298016a976e30a7ac4..c1b9669a98626699b126913dcc840906de4de8e0 100644 --- a/lite/backends/apu/neuron_adapter.h +++ b/lite/backends/apu/neuron_adapter.h @@ -42,12 +42,25 @@ class NeuronAdapter final { const uint32_t *, uint32_t, const uint32_t *); + using NeuronModel_addOperationExtension_Type = int (*)(NeuronModel *, + const char *, + const char *, + const NeuronDevice *, + uint32_t, + const uint32_t *, + uint32_t, + const uint32_t *); using NeuronModel_identifyInputsAndOutputs_Type = int (*)( NeuronModel *, uint32_t, const 
uint32_t *, uint32_t, const uint32_t *); using NeuronCompilation_create_Type = int (*)(NeuronModel *, NeuronCompilation **); using NeuronCompilation_free_Type = void (*)(NeuronCompilation *); using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *); + using NeuronCompilation_createForDevices_Type = + int (*)(NeuronModel *, + const NeuronDevice *const *, + uint32_t, + NeuronCompilation **); using NeuronExecution_create_Type = int (*)(NeuronCompilation *, NeuronExecution **); using NeuronExecution_free_Type = void (*)(NeuronExecution *); @@ -59,6 +72,10 @@ class NeuronAdapter final { using NeuronExecution_setOutput_Type = int (*)( NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t); using NeuronExecution_compute_Type = int (*)(NeuronExecution *); + using Neuron_getDeviceCount_Type = int (*)(uint32_t *); + using Neuron_getDevice_Type = int (*)(uint32_t, NeuronDevice **); + using NeuronDevice_getName_Type = int (*)(const NeuronDevice *, + const char **); Neuron_getVersion_Type Neuron_getVersion() { CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!"; @@ -105,6 +122,12 @@ class NeuronAdapter final { return NeuronModel_addOperation_; } + NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() { + CHECK(NeuronModel_addOperationExtension_ != nullptr) + << "Cannot load NeuronModel_addOperationExtension!"; + return NeuronModel_addOperationExtension_; + } + NeuronModel_identifyInputsAndOutputs_Type NeuronModel_identifyInputsAndOutputs() { CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr) @@ -130,6 +153,12 @@ class NeuronAdapter final { return NeuronCompilation_finish_; } + NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() { + CHECK(NeuronCompilation_createForDevices_ != nullptr) + << "Cannot load NeuronCompilation_createForDevices!"; + return NeuronCompilation_createForDevices_; + } + NeuronExecution_create_Type NeuronExecution_create() { CHECK(NeuronExecution_create_ != nullptr) << "Cannot load NeuronExecution_create!"; @@ -160,6 +189,23 @@ class NeuronAdapter final { return NeuronExecution_compute_; } + Neuron_getDeviceCount_Type Neuron_getDeviceCount() { + CHECK(Neuron_getDeviceCount_ != nullptr) + << "Cannot load Neuron_getDeviceCount!"; + return Neuron_getDeviceCount_; + } + + Neuron_getDevice_Type Neuron_getDevice() { + CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!"; + return Neuron_getDevice_; + } + + NeuronDevice_getName_Type NeuronDevice_getName() { + CHECK(NeuronDevice_getName_ != nullptr) + << "Cannot load NeuronDevice_getName!"; + return NeuronDevice_getName_; + } + private: NeuronAdapter(); NeuronAdapter(const NeuronAdapter &) = delete; @@ -176,16 +222,23 @@ class NeuronAdapter final { NeuronModel_setOperandSymmPerChannelQuantParams_Type NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr}; NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr}; + NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension_{ + nullptr}; NeuronModel_identifyInputsAndOutputs_Type NeuronModel_identifyInputsAndOutputs_{nullptr}; NeuronCompilation_create_Type NeuronCompilation_create_{nullptr}; NeuronCompilation_free_Type NeuronCompilation_free_{nullptr}; NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr}; + NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{ + nullptr}; NeuronExecution_create_Type NeuronExecution_create_{nullptr}; NeuronExecution_free_Type NeuronExecution_free_{nullptr}; NeuronExecution_setInput_Type 
NeuronExecution_setInput_{nullptr}; NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr}; NeuronExecution_compute_Type NeuronExecution_compute_{nullptr}; + Neuron_getDeviceCount_Type Neuron_getDeviceCount_{nullptr}; + Neuron_getDevice_Type Neuron_getDevice_{nullptr}; + NeuronDevice_getName_Type NeuronDevice_getName_{nullptr}; }; } // namespace lite } // namespace paddle
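Together with NeuronCompilation_createForDevices, the three new device-query entry points let a caller pin compilation to a specific accelerator instead of relying on the runtime's default placement. A minimal sketch of the intended call pattern, using only the signatures loaded above; the device name "mtk-mdla" is a placeholder assumption, since real names depend on the platform's Neuron runtime (strcmp needs <cstring>):

  NeuronDevice* target = nullptr;
  uint32_t num_devices = 0;
  Neuron_getDeviceCount(&num_devices);
  for (uint32_t i = 0; i < num_devices; ++i) {
    NeuronDevice* device = nullptr;
    const char* name = nullptr;
    Neuron_getDevice(i, &device);
    NeuronDevice_getName(device, &name);
    if (name != nullptr && strcmp(name, "mtk-mdla") == 0) {  // placeholder name
      target = device;
      break;
    }
  }
  NeuronCompilation* compilation = nullptr;
  if (target != nullptr) {
    NeuronCompilation_createForDevices(model, &target, 1, &compilation);
  } else {
    NeuronCompilation_create(model, &compilation);  // fall back to default device selection
  }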
diff --git a/lite/kernels/apu/bridges/CMakeLists.txt b/lite/kernels/apu/bridges/CMakeLists.txt old mode 100644 new mode 100755 index 0b42af5a6fe79bbb8417c2a6a37a86c30f4a0f8b..609bf1b4b345f8eb7d14b9bb3291e6bc5bad2293 --- a/lite/kernels/apu/bridges/CMakeLists.txt +++ b/lite/kernels/apu/bridges/CMakeLists.txt @@ -14,6 +14,8 @@ lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_br lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_concat_op_apu SRCS concat_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_conv_transpose_op_apu SRCS conv_transpose_op.cc DEPS ${apu_subgraph_bridge_deps}) set(apu_subgraph_bridges @@ -25,6 +27,8 @@ set(apu_subgraph_bridges subgraph_bridge_softmax_op_apu subgraph_bridge_fc_op_apu subgraph_bridge_pool_op_apu + subgraph_bridge_conv_transpose_op_apu + subgraph_bridge_concat_op_apu CACHE INTERNAL "apu_subgraph_bridges") message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")

diff --git a/lite/kernels/apu/bridges/concat_op.cc b/lite/kernels/apu/bridges/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..26f62101ab435059cde043c807f92cb3ba43dd01 --- /dev/null +++ b/lite/kernels/apu/bridges/concat_op.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> +#include "lite/core/subgraph_bridge_registry.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast<Graph*>(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + int neuron_errCode; + VLOG(3) << "[APU] Converting [" << op_type << "]"; + + // Get input and output vars and op attributes + auto x_names = op_info->Input("X"); + auto out_name = op_info->Output("Out").front(); + auto axis = op_info->GetAttr<int>("axis"); + auto num = x_names.size(); + + // Remap the concat axis for the NCHW -> NHWC layout change + if (axis == 1) + axis = 3; + else if (axis == 2) + axis = 1; + else if (axis == 3) + axis = 2; + + // Limitation: + // All input tensors of NEURON_TENSOR_QUANT8_ASYMM must + // have the same scale and zeroPoint as the output tensor + CHECK(op_info->HasOutputScale(out_name)); + auto output_scale = op_info->GetOutputScale(out_name)[0]; + + // Traverse all input nodes + std::vector<std::shared_ptr<Node>> input_nodes; + NeuronOperandType xType; + for (auto& x_name : x_names) { + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + std::shared_ptr<Node> x_node = nullptr; + + CHECK(op_info->HasInputScale(x_name)); + auto input_scale = op_info->GetInputScale(x_name)[0]; + + // Add x tensor type + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = input_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3], + (uint32_t)x_dims[1]}; + xType.dimensions = &dims_x[0]; + if (graph->Has(x_name)) { + VLOG(3) << "Graph has " << x_name; + if (graph->IsInput(x_name)) { + VLOG(3) << x_name << " is an input and already exists"; + x_name = "transpose_" + x_name; + } + + if (graph->IsOutput(x_name)) { + VLOG(3) << x_name << " is both an input and an output node"; + x_name = "transpose_" + x_name; + } + x_node = graph->Get(x_name); + } else { + // Add input operand + if (graph->IsInput(x_name)) { + // Insert transpose for NCHW -> NHWC + insert_transpose_node(ctx, + x_name, + "transpose_" + x_name, + {(uint32_t)x_dims[0], + (uint32_t)x_dims[1], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3]}, + dims_x, + {0, 2, 3, 1}, + xType.scale, + xType.zeroPoint); + + // Change x_name because we added a transpose op + x_name = "transpose_" + x_name; + x_node = graph->Get(x_name); + } else { + NeuronModel_addOperand(model, &xType); + x_node = graph->Add(x_name, dims_x); + } + } // End of else + if (x_node == nullptr) return subgraph::FAILED; + input_nodes.push_back(x_node); + + VLOG(3) << "input node x: " << x_node->index() + << ": input_scale: " << input_scale << " x_dims:" << x_dims[0] + << ":" << x_dims[1] << ":" << x_dims[2] << ":" << x_dims[3] + << ", xType: " << xType.dimensions[0] << ":" << xType.dimensions[1] + << ":" << xType.dimensions[2] << ":" << xType.dimensions[3]; + } // End of for + + if (input_nodes.size() != num) { + LOG(WARNING) << "Create input operand failed!"; + return subgraph::FAILED; + } + + // Add axis operand type + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector<uint32_t> dims_int32 = {1}; + + // Add axis operand + std::shared_ptr<Node> axis_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // axis + axis_node = graph->Add(out_name + "_axis", dims_int32); + VLOG(3) << "axis: " << axis;
"axis:" << axis; + + // Add out operand type + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = output_scale; + outType.zeroPoint = 128; + outType.dimensionCount = out_dims.size(); + std::vector dims_out = {(uint32_t)out_dims[0], + (uint32_t)out_dims[2], + (uint32_t)out_dims[3], + (uint32_t)out_dims[1]}; + outType.dimensions = &dims_out[0]; + + // Add out operand + std::shared_ptr out_node = nullptr; + if (graph->Has(out_name)) { + out_node = graph->Get(out_name); + } else { + if (graph->IsOutput(out_name)) { + NeuronModel_addOperand(model, &outType); + out_node = graph->Add("transpose_" + out_name, dims_out); + } else { + NeuronModel_addOperand(model, &outType); + out_node = graph->Add(out_name, dims_out); + } + } + VLOG(3) << "out node idx: " << out_node->index() + << ": output_scle: " << outType.scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Set axis value + int32_t axis_val[1] = {(int32_t)axis}; + NeuronModel_setOperandValue( + model, axis_node->index(), axis_val, sizeof(int32_t) * 1); + + std::vector addInIndex; + for (auto& node : input_nodes) { + addInIndex.push_back(node->index()); + } + + addInIndex.push_back(axis_node->index()); + std::vector addOutIndex = {out_node->index()}; + neuron_errCode = NeuronModel_addOperation(model, + NEURON_CONCATENATION, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op fail:" << op_type; + return subgraph::FAILED; + } + + if (graph->IsOutput(out_name)) { + // Insert transpose for NHWC -> NCHW + insert_transpose_node(ctx, + "transpose_" + out_name, + out_name, + dims_out, + {(uint32_t)out_dims[0], + (uint32_t)out_dims[1], + (uint32_t)out_dims[2], + (uint32_t)out_dims[3]}, + {0, 3, 1, 2}, + outType.scale, + outType.zeroPoint); + out_node = graph->Get(out_name); + if (out_node == nullptr) return subgraph::FAILED; + } + + return SUCCESS; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(concat, + kAPU, + paddle::lite::subgraph::apu::ConcatConverter); diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc index 1c3020065ee4b16a56f95077b5906effd75a0249..bb60331e44d94afaffac2dd42020c4b4c7b4309d 100644 --- a/lite/kernels/apu/bridges/conv_op.cc +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -73,7 +73,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(strides.size(), 2L); CHECK_EQ(dilations.size(), 2L); bool is_depthwise_mode = ic == groups && oc == groups; - VLOG(3) << "is_depthwise_mode" << is_depthwise_mode; + VLOG(3) << "is_depthwise_mode: " << is_depthwise_mode; if (paddings.size() == 2L) { for (size_t i = 0; i < strides.size(); ++i) { @@ -103,6 +103,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto filter_scale = op_info->GetInputScale(filter_name); CHECK(op_info->HasOutputScale(output_name)); auto output_scale = op_info->GetOutputScale(output_name)[0]; + auto orig_output_scale = op_info->GetOutputScale(output_name)[0]; VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups << " ,dilations: " << dilations[0] << ":" << dilations[1]; @@ -128,23 +129,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::shared_ptr input_node = nullptr; if 
diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc index 1c3020065ee4b16a56f95077b5906effd75a0249..bb60331e44d94afaffac2dd42020c4b4c7b4309d 100644 --- a/lite/kernels/apu/bridges/conv_op.cc +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -73,7 +73,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(strides.size(), 2L); CHECK_EQ(dilations.size(), 2L); bool is_depthwise_mode = ic == groups && oc == groups; - VLOG(3) << "is_depthwise_mode" << is_depthwise_mode; + VLOG(3) << "is_depthwise_mode: " << is_depthwise_mode; if (paddings.size() == 2L) { for (size_t i = 0; i < strides.size(); ++i) { @@ -103,6 +103,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto filter_scale = op_info->GetInputScale(filter_name); CHECK(op_info->HasOutputScale(output_name)); auto output_scale = op_info->GetOutputScale(output_name)[0]; + auto orig_output_scale = op_info->GetOutputScale(output_name)[0]; VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups << " ,dilations: " << dilations[0] << ":" << dilations[1]; @@ -128,23 +129,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::shared_ptr<Node> input_node = nullptr; if (graph->Has(input_name)) { VLOG(3) << "Graph has " << input_name; - // input operand already exist + + if (graph->IsInput(input_name)) { + VLOG(3) << input_name << " is an input and already exists"; + input_name = "transpose_" + input_name; + } + + if (graph->IsOutput(input_name)) { + VLOG(3) << input_name << " is both an input and an output node"; + input_name = "transpose_" + input_name; + } input_node = graph->Get(input_name); } else { - // add input operand if (graph->IsInput(input_name)) { // Insert transpose for NCHW -> NHWC - insert_transpose_node( - ctx, - input_name, - "transpose_" + input_name, - {input_dims[0], input_dims[1], input_dims[2], input_dims[3]}, - dims_in, - {0, 2, 3, 1}, - inType.scale, - inType.zeroPoint); - - // change input_name + insert_transpose_node(ctx, + input_name, + "transpose_" + input_name, + {(uint32_t)input_dims[0], + (uint32_t)input_dims[1], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3]}, + dims_in, + {0, 2, 3, 1}, + inType.scale, + inType.zeroPoint); + input_name = "transpose_" + input_name; input_node = graph->Get(input_name); if (input_node == nullptr) return subgraph::FAILED; @@ -153,7 +163,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { input_node = graph->Add(input_name, dims_in); } } - VLOG(3) << "input node idx" << input_node->index() + VLOG(3) << "input node idx: " << input_node->index() << ": input_scale: " << input_scale << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1] << ":" << inType.dimensions[2] << ":" << inType.dimensions[3]; @@ -161,8 +171,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add bias type NeuronOperandType biasType; - // Add filter type - // filter NCHW -> NHWC + // Add filter type, filter data re-layout NCHW -> NHWC Tensor transpose_filter; std::vector<uint32_t> dims_filter; @@ -233,10 +242,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { biasType.scale = 0; } + auto precision = filter->precision(); std::shared_ptr<Node> filter_node = nullptr; if (1 == filter_scale.size()) { - NeuronModel_addOperand(model, &filterType); // 1: filter - filter_node = graph->Add(filter_name, dims_filter); + NeuronModel_addOperand(model, &filterType); + filter_node = graph->Add(filter_name, dims_filter); // Operand 1: filter VLOG(3) << "filter node idx: " << filter_node->index() << " filter_scale[0]: " << filter_scale[0] << ": filterType: " << filterType.dimensions[0] << ":" << filterType.dimensions[1] << ":" @@ -251,7 +261,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { return subgraph::FAILED; } } else { - NeuronModel_addOperand(model, &channelFilterType); // 1: filter + NeuronModel_addOperand(model, &channelFilterType); // Operand 1: filter filter_node = graph->Add(filter_name, dims_filter); VLOG(3) << "channel filter node idx: " << filter_node->index() << " ,scale_count:" << filter_scale.size() @@ -280,7 +290,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add biasType node value // A 1-D tensor, of shape [depth_out], specifying the bias. // For filter tensor of NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL, the bias - // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 + // should be of NEURON_TENSOR_INT32, with zeroPoint of 0 // and bias_scale of 0. The actual scale of each value 'i' is equal // to bias_scale[i] = input_scale * filter_scale[i].
biasType.type = NEURON_TENSOR_INT32; @@ -296,16 +306,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { for (int i = 0; i < bias_dims.size(); i++) dims_bias.push_back(bias_dims[i]); biasType.dimensions = &dims_bias[0]; - NeuronModel_addOperand(model, &biasType); // 2: bias + NeuronModel_addOperand(model, &biasType); // Operand 2: bias bias_node = graph->Add(bias_name, dims_bias); - VLOG(3) << "node idx" << bias_node->index() << ": Bias name: " << bias_name + VLOG(3) << "node idx: " << bias_node->index() + << ": Bias name: " << bias_name << " ,bias scale: " << biasType.scale << " ,dimensions: " << bias_dims; } else { biasType.dimensionCount = 1; dims_bias = {(uint32_t)output_dims[1]}; biasType.dimensions = &dims_bias[0]; - NeuronModel_addOperand(model, &biasType); // 2: bias + NeuronModel_addOperand(model, &biasType); // Operand 2: bias bias_node = graph->Add(filter_name + "_default_bias", dims_bias); VLOG(3) << "node idx" << bias_node->index() << ": Bias name: default_bias " << " ,bias scale: " << biasType.scale @@ -318,39 +329,51 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::vector<uint32_t> dims_int32 = {1}; std::shared_ptr<Node> paddingL_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 3: padding left + NeuronModel_addOperand(model, &int32Type); // Operand 3: padding left paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32); std::shared_ptr<Node> paddingR_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 4: padding right + NeuronModel_addOperand(model, &int32Type); // Operand 4: padding right paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32); std::shared_ptr<Node> paddingT_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 5: padding top + NeuronModel_addOperand(model, &int32Type); // Operand 5: padding top paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32); std::shared_ptr<Node> paddingB_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 6: padding bottom + NeuronModel_addOperand(model, &int32Type); // Operand 6: padding bottom paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32); std::shared_ptr<Node> strideW_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 7: stride width + NeuronModel_addOperand(model, &int32Type); // Operand 7: stride width strideW_node = graph->Add(filter_name + "_stride_width", dims_int32); std::shared_ptr<Node> strideH_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 8: stride height + NeuronModel_addOperand(model, &int32Type); // Operand 8: stride height strideH_node = graph->Add(filter_name + "_stride_height", dims_int32); std::shared_ptr<Node> dm_node = nullptr; if (is_depthwise_mode) { - NeuronModel_addOperand(model, &int32Type); // 9: depthwise multiplier + NeuronModel_addOperand(model, + &int32Type); // Operand 9: depthwise multiplier dm_node = graph->Add(filter_name + "_dm", dims_int32); } std::shared_ptr<Node> fuse_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 9/10: fuse + NeuronModel_addOperand(model, &int32Type); // Operand 9/10: fuse fuse_node = graph->Add(filter_name + "_fuse", dims_int32); + /* Check output scale: for depthwise conv, output_scale must exceed + input_scale * filter_scale; otherwise requantization is inserted later */ + if (is_depthwise_mode) { + for (auto s : filter_scale) { + if (output_scale < s * input_scale) + output_scale = s * input_scale + 0.000001; + } +#ifdef LITE_MEDIATEK_APU_ENABLE_REQUANT + output_scale = orig_output_scale; +#endif + } + // Add output tensor type NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; @@ -366,12 +389,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (graph->Has(output_name)) { output_node = graph->Get(output_name); } else { - // add output operand - if (graph->IsOutput(output_name)) { - NeuronModel_addOperand(model, &outType); // output + // Add output operand + NeuronModel_addOperand(model, &outType); + + if (orig_output_scale != output_scale) { + // Need to insert requant op, the result is requant_ -> transpose_ -> + // output + output_node = graph->Add("requant_" + output_name, dims_out); + } else if (graph->IsOutput(output_name)) { + // Need to insert transpose op, transpose_ -> output output_node = graph->Add("transpose_" + output_name, dims_out); } else { - NeuronModel_addOperand(model, &outType); // output output_node = graph->Add(output_name, dims_out); } } @@ -433,10 +461,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add Stride int32_t stride_val[1]; - stride_val[0] = strides[1]; // width + stride_val[0] = strides[1]; // Entry 1: width stride NeuronModel_setOperandValue( model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); - stride_val[0] = strides[0]; // height + stride_val[0] = strides[0]; // Entry 0: height stride NeuronModel_setOperandValue( model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); @@ -460,7 +488,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { model, dm_node->index(), &dm, sizeof(int32_t) * 1); VLOG(3) << "depthwise multiplier:" << dm; - // Depthwise conv + // Depthwise conv case NeuronModel_setOperandValue( model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); std::vector<uint32_t> addInIndex = { @@ -512,19 +540,46 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { return FAILED; } + // Check if a Requant OP is needed + std::shared_ptr<Node> requant_node = nullptr; + if (orig_output_scale != output_scale) { + std::string requant_out_name = output_name; + VLOG(3) << "Insert requant output scale, orig:" << orig_output_scale + << " ,output_scale:" << output_scale; + if (graph->IsOutput(output_name)) { + requant_out_name = "transpose_" + output_name; + } + + insert_requant_node(ctx, + "requant_" + output_name, + requant_out_name, + dims_out, + dims_out, + output_scale, + orig_output_scale, + outType.zeroPoint); + + requant_node = graph->Get(requant_out_name); + if (requant_node == nullptr) return subgraph::FAILED; + } + + std::shared_ptr<Node> transpose_node = nullptr; if (graph->IsOutput(output_name)) { + VLOG(3) << "Add output transpose:" << output_name; // Insert transpose for NHWC -> NCHW - insert_transpose_node( - ctx, - "transpose_" + output_name, - output_name, - dims_out, - {output_dims[0], output_dims[1], output_dims[2], output_dims[3]}, - {0, 3, 1, 2}, - outType.scale, - outType.zeroPoint); - output_node = graph->Get(output_name); - if (output_node == nullptr) return subgraph::FAILED; + insert_transpose_node(ctx, + "transpose_" + output_name, + output_name, + dims_out, + {(uint32_t)output_dims[0], + (uint32_t)output_dims[1], + (uint32_t)output_dims[2], + (uint32_t)output_dims[3]}, + {0, 3, 1, 2}, + outType.scale, + outType.zeroPoint); + transpose_node = graph->Get(output_name); + if (transpose_node == nullptr) return subgraph::FAILED; } return REBUILD_WHEN_SHAPE_CHANGED;
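The depthwise scale check above guards the Neuron runtime constraint that a quantized depthwise convolution needs output_scale >= input_scale * filter_scale[c] for every channel c; when the model's own output scale violates that, the op runs at a safe scale and a requant node restores the original scale afterwards. A standalone restatement of the adjustment, under the assumption that the epsilon only needs to break the equality:

  // Example with assumed values: input_scale = 0.02,
  // filter_scale = {0.5, 1.2}, output_scale = 0.015.
  // Channel 0: 0.02 * 0.5 = 0.010 -> fine.
  // Channel 1: 0.02 * 1.2 = 0.024 -> violated, so the op runs at
  // 0.024 + 1e-6 and the inserted requant node restores 0.015.
  float SafeDepthwiseOutputScale(float output_scale, float input_scale,
                                 const std::vector<float>& filter_scale) {
    for (float s : filter_scale) {
      if (output_scale < s * input_scale) output_scale = s * input_scale + 1e-6f;
    }
    return output_scale;
  }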
diff --git a/lite/kernels/apu/bridges/conv_transpose_op.cc b/lite/kernels/apu/bridges/conv_transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..386c89c128e476611ebde4b337823775b5ae01a9 --- /dev/null +++ b/lite/kernels/apu/bridges/conv_transpose_op.cc @@ -0,0 +1,488 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> +#include "lite/core/subgraph_bridge_registry.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" + +#include "lite/operators/conv_op.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ConvTransposeConverter(void *ctx, OpLite *op, KernelBase *kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast<Graph *>(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + int neuron_errCode; + VLOG(3) << "[APU] Converting [" << op_type << "]"; + + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr<bool>("enable_int8")); + + // Get input, output and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + CHECK_EQ(input_dims.size(), 4); + + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + CHECK_EQ(filter_dims.size(), 4); + + auto output_name = op_info->Output("Output").front(); + + auto strides = op_info->GetAttr<std::vector<int>>("strides"); + CHECK_EQ(strides.size(), 2L); + auto paddings = op_info->GetAttr<std::vector<int>>("paddings"); + auto groups = op_info->GetAttr<int>("groups"); + if (groups > 1) { + LOG(WARNING) << "[APU] Only groups == 1 is supported"; + return FAILED; + } + + bool with_act = + op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act"); + std::string act_type = + with_act ? op_info->GetAttr<std::string>("act_type") : ""; + float leaky_relu_alpha = act_type == "leaky_relu" + ? op_info->GetAttr<float>("leaky_relu_alpha") + : 0.f; + auto fuse_relu = + op_info->HasAttr("fuse_relu") && op_info->GetAttr<bool>("fuse_relu"); + + auto dilations = op_info->GetAttr<std::vector<int>>("dilations"); + CHECK_EQ(dilations.size(), 2L); + std::string padding_algorithm = + op_info->HasAttr("padding_algorithm") + ? op_info->GetAttr<std::string>("padding_algorithm") + : ""; + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + + CHECK_EQ(paddings.size(), 4L) + << "[APU] Paddings size should be the same or twice as the input size."
+ << paddings.size(); + + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + std::vector<int64_t> output_dims; + // Set output_dims: batches + output_dims.push_back(input_dims[0]); + + std::vector<int> output_size; + if (op_info->HasAttr("output_size")) { + output_size = op_info->GetAttr<std::vector<int>>("output_size"); + } + + if (output_size.size() == 2) { + // Take height and width directly from the output_size attribute + output_dims.push_back(output_size[0]); + output_dims.push_back(output_size[1]); + } else { + // Compute output size: out = (in - 1) * stride + kernel - pad_begin - pad_end + for (int i = 0; i < strides.size(); i++) { + int kernel_ext = filter_dims[i + 2]; + int out_size = (input_dims[i + 2] - 1) * strides[i] + kernel_ext - + paddings[i * 2] - paddings[i * 2 + 1]; + output_dims.push_back(out_size); + } + } + output_dims.push_back(filter_dims[1]); + + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(filter_name)); + auto filter_scale = op_info->GetInputScale(filter_name); + CHECK(op_info->HasOutputScale(output_name)); + auto output_scale = op_info->GetOutputScale(output_name)[0]; + + VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups + << " ,dilations: " << dilations[0] << ":" << dilations[1]; + VLOG(3) << "with_act: " << with_act << " ,act_type: " << act_type; + VLOG(3) << "input_dims: " << input_dims + << " ,filter_scale size: " << filter_scale.size(); + VLOG(3) << "filter_dims(Cin, Cout, H, W): " << filter_dims + << " ,memory_size: " << filter->memory_size() + << " ,data_size: " << filter->data_size(); + + // Add input tensor type + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = input_scale; + inType.zeroPoint = 128; + inType.dimensionCount = input_dims.size(); + std::vector<uint32_t> dims_in = {(uint32_t)input_dims[0], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3], + (uint32_t)input_dims[1]}; + inType.dimensions = &dims_in[0]; + + std::shared_ptr<Node> input_node = nullptr; + if (graph->Has(input_name)) { + VLOG(3) << "Graph has " << input_name; + // Input operand already created by a previous op + input_node = graph->Get(input_name); + } else { + // Add input operand + if (graph->IsInput(input_name)) { + // Insert transpose for NCHW -> NHWC + insert_transpose_node(ctx, + input_name, + "transpose_" + input_name, + {(uint32_t)input_dims[0], + (uint32_t)input_dims[1], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3]}, + dims_in, + {0, 2, 3, 1}, + inType.scale, + inType.zeroPoint); + + // Change input_name because we added a transpose op + input_name = "transpose_" + input_name; + input_node = graph->Get(input_name); + if (input_node == nullptr) return subgraph::FAILED; + } else { + NeuronModel_addOperand(model, &inType); + input_node = graph->Add(input_name, dims_in); + } + } + + VLOG(3) << "input node idx: " << input_node->index() + << ": input_scale: " << input_scale + << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1] + << ":" << inType.dimensions[2] << ":" << inType.dimensions[3]; + + // Add bias type + NeuronOperandType biasType; + + // Add filter type + // Re-lay out the filter (Cin, Cout, H, W) -> (depth_out, h, w, depth_in) + Tensor transpose_filter; + std::vector<uint32_t> dims_filter; + transpose_filter.Resize({(uint32_t)filter_dims[1], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}); + + transposeAsym(filter->data<int8_t>(), + transpose_filter.mutable_data<uint8_t>(), + {(uint32_t)filter_dims[0], + (uint32_t)filter_dims[1], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3]}, + {1, 2, 3, 0});
+ dims_filter = {(uint32_t)filter_dims[1], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}; + + NeuronOperandType filterType; + filterType.type = NEURON_TENSOR_QUANT8_ASYMM; + filterType.scale = filter_scale[0]; + filterType.zeroPoint = 128; + filterType.dimensionCount = filter_dims.size(); + filterType.dimensions = &dims_filter[0]; + biasType.scale = inType.scale * filterType.scale; + + std::shared_ptr<Node> filter_node = nullptr; + NeuronModel_addOperand(model, &filterType); + filter_node = graph->Add(filter_name, dims_filter); + auto precision = filter->precision(); + VLOG(3) << " filter node idx: " << filter_node->index() + << " filter_scale[0]=" << filter_scale[0] + << " filter memory_size=" << filter->memory_size() + << " filter precision=" << PrecisionToStr(precision) + << " :filterType: " << filterType.dimensions[0] << ":" + << filterType.dimensions[1] << ":" << filterType.dimensions[2] << ":" + << filterType.dimensions[3]; + + memcpy(filter->mutable_data<int8_t>(), + transpose_filter.mutable_data<uint8_t>(), + filter->memory_size()); + + // Set filter value + neuron_errCode = NeuronModel_setOperandValue( + model, filter_node->index(), filter->raw_data(), filter->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set filter operand value failed:" << neuron_errCode; + return subgraph::FAILED; + } + + // Add biasType node value + // A 1-D tensor, of shape [depth_out], specifying the bias. + // For filter tensor of NEURON_TENSOR_QUANT8_ASYMM, the bias should be of + // NEURON_TENSOR_INT32 with zeroPoint of 0 and bias_scale == + // input_scale * filter_scale + biasType.type = NEURON_TENSOR_INT32; + biasType.zeroPoint = 0; + std::vector<uint32_t> dims_bias; + std::shared_ptr<Node> bias_node = nullptr; + + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto channel_size = bias->dims().production(); + CHECK_EQ(channel_size, filter_dims[1] * groups); + CHECK_EQ(bias_dims.size(), 1); + + biasType.dimensionCount = bias_dims.size(); + for (int i = 0; i < bias_dims.size(); i++) + dims_bias.push_back(bias_dims[i]); + biasType.dimensions = &dims_bias[0]; + NeuronModel_addOperand(model, &biasType); // Operand 2: bias + bias_node = graph->Add(bias_name, dims_bias); + VLOG(3) << "node idx: " << bias_node->index() + << ": Bias name: " << bias_name + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << bias_dims + << " ,channel_size:" << channel_size; + + } else { + // Create a default bias filled with zeros; depth_out is output_dims[3] + // since output_dims is laid out as NHWC here + biasType.dimensionCount = 1; + dims_bias = {(uint32_t)output_dims[3]}; + biasType.dimensions = &dims_bias[0]; + NeuronModel_addOperand(model, &biasType); // Operand 2: bias + bias_node = graph->Add(filter_name + "_default_bias", dims_bias); + VLOG(3) << "node idx: " << bias_node->index() + << ": Bias name: default_bias " + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << dims_bias.size(); + } + + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector<uint32_t> dims_int32 = {1}; + + std::shared_ptr<Node> paddingL_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 3: padding left + paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32); + + std::shared_ptr<Node> paddingR_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 4: padding right + paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32);
+ + std::shared_ptr<Node> paddingT_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 5: padding top + paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32); + + std::shared_ptr<Node> paddingB_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 6: padding bottom + paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32); + + std::shared_ptr<Node> strideW_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 7: stride width + strideW_node = graph->Add(filter_name + "_stride_width", dims_int32); + + std::shared_ptr<Node> strideH_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 8: stride height + strideH_node = graph->Add(filter_name + "_stride_height", dims_int32); + + std::shared_ptr<Node> fuse_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 9: fuse + fuse_node = graph->Add(filter_name + "_fuse", dims_int32); + + NeuronOperandType boolType; + boolType.type = NEURON_BOOL; + boolType.dimensionCount = 0; // Must be 0 for scalars. + std::shared_ptr<Node> layout_node = nullptr; + NeuronModel_addOperand(model, &boolType); // Operand 10: layout + layout_node = graph->Add(filter_name + "_layout", dims_int32); + + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = output_scale; + outType.zeroPoint = 128; + outType.dimensionCount = output_dims.size(); + std::vector<uint32_t> dims_out = {(uint32_t)output_dims[0], + (uint32_t)output_dims[1], + (uint32_t)output_dims[2], + (uint32_t)output_dims[3]}; + outType.dimensions = &dims_out[0]; + std::shared_ptr<Node> output_node = nullptr; + if (graph->Has(output_name)) { + output_node = graph->Get(output_name); + } else { + if (graph->IsOutput(output_name)) { + NeuronModel_addOperand(model, &outType); + output_node = graph->Add("transpose_" + output_name, dims_out); + } else { + NeuronModel_addOperand(model, &outType); + output_node = graph->Add(output_name, dims_out); + } + } + VLOG(3) << "output node idx: " << output_node->index() + << ": output_scale: " << outType.scale + << " ,outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Add bias value + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + + int32_t *int32_bias_data = + reinterpret_cast<int32_t *>(bias->mutable_data<float>()); + float2int32( + bias->data<float>(), input_scale, filter_scale, int32_bias_data); + + VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << ":" + << int32_bias_data[1] << ":" << int32_bias_data[2] << ":" + << int32_bias_data[3]; + + neuron_errCode = NeuronModel_setOperandValue( + model, bias_node->index(), bias->raw_data(), bias->memory_size()); + } else { + auto int32_bias = std::make_shared<Tensor>(); + int32_bias->Resize({1, output_dims[3]}); + int32_bias->mutable_data<int32_t>(); + VLOG(3) << "bias_default: " << int32_bias->memory_size(); + memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size()); + neuron_errCode = NeuronModel_setOperandValue(model, + bias_node->index(), + int32_bias->raw_data(), + int32_bias->memory_size()); + bias_node->set_data(int32_bias); + } + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set bias operand value failed:" << neuron_errCode; + return subgraph::FAILED; + } + + VLOG(3) << "paddings: " << paddings[0] << ":" << paddings[1] << ":" + << paddings[2] << ":" << paddings[3]; + // Add padding value
+ int32_t padding_val[1]; + padding_val[0] = paddings[2]; + NeuronModel_setOperandValue( model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[3]; + NeuronModel_setOperandValue( model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[0]; + NeuronModel_setOperandValue( model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[1]; + NeuronModel_setOperandValue( model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1); + + VLOG(3) << " stride width:" << strides[1] << " height:" << strides[0]; + + // Add Stride + int32_t stride_val[1]; + stride_val[0] = strides[1]; // Entry 1: width stride + NeuronModel_setOperandValue( model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); + stride_val[0] = strides[0]; // Entry 0: height stride + NeuronModel_setOperandValue( model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); + + int32_t fuse_val[1] = {NEURON_FUSED_NONE}; + if (act_type == "relu") { + fuse_val[0] = NEURON_FUSED_RELU; + } else if (act_type == "relu1") { + fuse_val[0] = NEURON_FUSED_RELU1; + } else if (act_type == "relu6") { + fuse_val[0] = NEURON_FUSED_RELU6; + } else if (!act_type.empty()) { + fuse_val[0] = NEURON_FUSED_NONE; + LOG(WARNING) << "Unsupported act_type: " << act_type; + return FAILED; + } + + NeuronModel_setOperandValue( model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + + bool layout_val[] = {false}; + NeuronModel_setOperandValue( model, layout_node->index(), layout_val, sizeof(bool) * 1); + + std::vector<uint32_t> addInIndex = { + input_node->index(), // 0: input + filter_node->index(), // 1: filter + bias_node->index(), // 2: bias + paddingL_node->index(), // 3: padding left + paddingR_node->index(), // 4: padding right + paddingT_node->index(), // 5: padding top + paddingB_node->index(), // 6: padding bottom + strideW_node->index(), // 7: stride width + strideH_node->index(), // 8: stride height + fuse_node->index(), // 9: fuse + layout_node->index()}; // 10: layout + + std::vector<uint32_t> addOutIndex = {output_node->index()}; + neuron_errCode = NeuronModel_addOperation(model, + NEURON_TRANSPOSE_CONV_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op failed:" << op_type; + return FAILED; + } + + if (graph->IsOutput(output_name)) { + // Insert transpose for NHWC -> NCHW + insert_transpose_node(ctx, + "transpose_" + output_name, + output_name, + dims_out, + {(uint32_t)output_dims[0], + (uint32_t)output_dims[1], + (uint32_t)output_dims[2], + (uint32_t)output_dims[3]}, + {0, 3, 1, 2}, + outType.scale, + outType.zeroPoint); + output_node = graph->Get(output_name); + if (output_node == nullptr) return subgraph::FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(conv2d_transpose, + kAPU, + paddle::lite::subgraph::apu::ConvTransposeConverter);
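When no output_size attribute is given, this converter derives the transposed-convolution output extent from the input size, stride, kernel extent, and the two paddings. A worked restatement of that formula with assumed values:

  // out = (in - 1) * stride + kernel - pad_begin - pad_end
  // e.g. in = 16, stride = 2, kernel = 4, pads = 1/1:
  //      out = (16 - 1) * 2 + 4 - 1 - 1 = 32
  int DeconvOutputSize(int in, int stride, int kernel, int pad_begin, int pad_end) {
    return (in - 1) * stride + kernel - pad_begin - pad_end;
  }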
diff --git a/lite/kernels/apu/bridges/elementwise_ops.cc b/lite/kernels/apu/bridges/elementwise_ops.cc index 964e81eb6aba26c44dd1b3cd0658984792c6259f..af8f76c68e20e1206bf16450cd04f5ecf5cf7bb9 100644 --- a/lite/kernels/apu/bridges/elementwise_ops.cc +++ b/lite/kernels/apu/bridges/elementwise_ops.cc @@ -29,28 +29,252 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto op_info = op->op_info(); auto op_type = op_info->Type(); auto scope = op->scope(); - VLOG(3) << "[APU] Converting " + op_type + "..."; + int neuron_errCode; + VLOG(3) << "[APU] Converting [" + op_type + "]"; // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); - auto x = scope->FindMutableTensor(x_name); + auto x = scope->FindTensor(x_name); auto x_dims = x->dims(); auto y_name = op_info->Input("Y").front(); - auto y = scope->FindMutableTensor(y_name); + auto y = scope->FindTensor(y_name); auto y_dims = y->dims(); auto out_name = op_info->Output("Out").front(); - auto out = scope->FindMutableTensor(out_name); + auto out = scope->FindTensor(out_name); auto out_dims = out->dims(); + auto axis = op_info->GetAttr<int>("axis"); + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + + auto x_shape = x_dims.Vectorize(); + auto y_shape = y_dims.Vectorize(); + + // Two dimensions are compatible when: + // 1. they are equal, or + // 2. the dimension from Y is 1 + for (int i = axis; i < x_shape.size(); i++) { + if (x_dims[i] != y_dims[i - axis]) { + // Input 1's dimensions must be broadcast-compatible with input 0's + if (y_dims[i - axis] != 1) { + LOG(WARNING) << "Incompatible dims: " << i << ":" << axis << ":" + << y_dims[i - axis]; + return FAILED; + } + } + } // End of for + int32_t fuse_val[1] = {NEURON_FUSED_NONE}; // Act node if (op_type == "fusion_elementwise_add_activation" || op_type == "fusion_elementwise_sub_activation" || op_type == "fusion_elementwise_mul_activation" || op_type == "fusion_elementwise_div_activation") { auto act_type = op_info->GetAttr<std::string>("act_type"); + + if (act_type == "relu") { + fuse_val[0] = NEURON_FUSED_RELU; + } else if (act_type == "relu1") { + fuse_val[0] = NEURON_FUSED_RELU1; + } else if (act_type == "relu6") { + fuse_val[0] = NEURON_FUSED_RELU6; + } else if (!act_type.empty()) { + fuse_val[0] = NEURON_FUSED_NONE; + LOG(WARNING) << "Unsupported act_type: " << act_type; + return FAILED; + } + } // End of if + VLOG(3) << "x_name: " << x_name; + + CHECK(op_info->HasInputScale(x_name)); + auto x_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasInputScale(y_name)); + auto y_scale = op_info->GetInputScale(y_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; + + // Add x tensor type + NeuronOperandType xType; + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = x_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3], + (uint32_t)x_dims[1]}; + xType.dimensions = &dims_x[0]; + + std::shared_ptr<Node> x_node = nullptr; + if (graph->Has(x_name)) { + VLOG(3) << "Graph has " << x_name; + if (graph->IsInput(x_name)) { + VLOG(3) << x_name << " is an input and already exists"; + x_name = "transpose_" + x_name; + } + + if (graph->IsOutput(x_name)) { + VLOG(3) << x_name << " is both an input and an output node"; + x_name = "transpose_" + x_name; + } + x_node = graph->Get(x_name); + } else { + if (graph->IsInput(x_name)) { + insert_transpose_node(ctx, + x_name, + "transpose_" + x_name, + {(uint32_t)x_dims[0], + (uint32_t)x_dims[1], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3]}, + dims_x, + {0, 2, 3, 1}, + xType.scale, + xType.zeroPoint); + + // Change x name after inserting a transpose op for the x data relayout + x_name = "transpose_" + x_name; + x_node = graph->Get(x_name); + } else { + NeuronModel_addOperand(model, &xType); + x_node = graph->Add(x_name, dims_x); + } + } // End of else + VLOG(3) << "x node idx: " << x_node->index() << ", x_dims: " << x_dims + << ", x_scale: " << x_scale << ", xType: " << xType.dimensions[0] + << ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":" + << xType.dimensions[3];
+ << ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":" + << xType.dimensions[3]; + + // Add y tensor type + NeuronOperandType yType; + yType.type = NEURON_TENSOR_QUANT8_ASYMM; + yType.scale = y_scale; + yType.zeroPoint = 128; + yType.dimensionCount = y_dims.size(); + std::vector dims_y = {(uint32_t)y_dims[0], + (uint32_t)y_dims[2], + (uint32_t)y_dims[3], + (uint32_t)y_dims[1]}; + yType.dimensions = &dims_y[0]; + + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + VLOG(3) << "Graph has " << y_name; + y_node = graph->Get(y_name); + } else { + if (graph->IsInput(y_name)) { + insert_transpose_node(ctx, + y_name, + "transpose_" + y_name, + {(uint32_t)y_dims[0], + (uint32_t)y_dims[1], + (uint32_t)y_dims[2], + (uint32_t)y_dims[3]}, + dims_y, + {0, 2, 3, 1}, + yType.scale, + yType.zeroPoint); + + y_name = "transpose_" + y_name; + y_node = graph->Get(y_name); + } else { + NeuronModel_addOperand(model, &yType); + y_node = graph->Add(y_name, dims_y); + } + } + VLOG(3) << "y node idx: " << y_node->index() << "y_dims: " << y_dims + << ": y_scale: " << y_scale << ", yType: " << yType.dimensions[0] + << ":" << yType.dimensions[1] << ":" << yType.dimensions[2] << ":" + << yType.dimensions[3]; + + // Add fuse operand type + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector dims_int32 = {1}; + + // Add fuse operand + std::shared_ptr fuse_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 2: fuse + fuse_node = graph->Add(out_name + "_fuse", dims_int32); + + // Add out tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale; + outType.zeroPoint = 128; + outType.dimensionCount = out_dims.size(); + std::vector dims_out = {(uint32_t)out_dims[0], + (uint32_t)out_dims[2], + (uint32_t)out_dims[3], + (uint32_t)out_dims[1]}; + outType.dimensions = &dims_out[0]; + + std::shared_ptr out_node = nullptr; + if (graph->Has(out_name)) { + VLOG(3) << "Graph has " << out_name; + out_node = graph->Get(out_name); + } else { + if (graph->IsOutput(out_name)) { + NeuronModel_addOperand(model, &outType); + out_node = graph->Add("transpose_" + out_name, dims_out); + } else { + NeuronModel_addOperand(model, &outType); + out_node = graph->Add(out_name, dims_out); + } + } + VLOG(3) << "out node idx: " << out_node->index() << "out_dims: " << out_dims + << ": out_scale: " << out_scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Set fuse value + NeuronModel_setOperandValue( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + + std::vector addInIndex = { + x_node->index(), // 0: A tensor + y_node->index(), // 1: A tensor of the same OperandCode, + // and compatible dimensions as input 0 + fuse_node->index()}; // 2: fuse + + std::vector addOutIndex = {out_node->index()}; + if (op_type == "elementwise_add" || + op_type == "fusion_elementwise_add_activation") { + neuron_errCode = NeuronModel_addOperation(model, + NEURON_ADD, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } else { + LOG(WARNING) << "[APU] Unsupported op type: " << op_type; + return FAILED; + } + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "ADD op fail:" << op_type; + return FAILED; + } + + if (graph->IsOutput(out_name)) { + // Insert transpose for NHWC -> NCHW + insert_transpose_node(ctx, + "transpose_" + out_name, + out_name, + dims_out, + 
diff --git a/lite/kernels/apu/bridges/fc_op.cc b/lite/kernels/apu/bridges/fc_op.cc index 5bee94424402c52b61bdd478488a55210f9b4000..ac0d27bc7bb950f764626d509238db18857a7e64 100644 --- a/lite/kernels/apu/bridges/fc_op.cc +++ b/lite/kernels/apu/bridges/fc_op.cc @@ -77,12 +77,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { inType.dimensions = &dims_in[0]; std::shared_ptr<Node> in_node = nullptr; if (graph->Has(input_name)) { - // input operand already exist in_node = graph->Get(input_name); VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index(); } else { - // add input operand - NeuronModel_addOperand(model, &inType); // 0: input + NeuronModel_addOperand(model, &inType); // Operand 0: input in_node = graph->Add(input_name, dims_in); } VLOG(3) << "input_scale: " << input_scale @@ -97,7 +95,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { wType.dimensionCount = w_dims.size(); std::vector<uint32_t> dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]}; wType.dimensions = &dims_w[0]; - NeuronModel_addOperand(model, &wType); // 1: weight + NeuronModel_addOperand(model, &wType); // Operand 1: weight std::shared_ptr<Node> w_node = nullptr; w_node = graph->Add(w_name, dims_w); VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0] @@ -119,7 +117,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { biasType.dimensionCount = bias_dims.size(); std::vector<uint32_t> dims_bias = {(uint32_t)bias_dims[0]}; biasType.dimensions = &dims_bias[0]; - NeuronModel_addOperand(model, &biasType); // 2: bias + NeuronModel_addOperand(model, &biasType); // Operand 2: bias bias_node = graph->Add(bias_name, dims_bias); VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims << ", bias scale: " << biasType.scale @@ -128,7 +126,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { biasType.dimensionCount = 1; std::vector<uint32_t> dims_bias = {(uint32_t)n}; biasType.dimensions = &dims_bias[0]; - NeuronModel_addOperand(model, &biasType); // 2: bias + NeuronModel_addOperand(model, &biasType); // Operand 2: bias bias_node = graph->Add(w_name + "_default_bias", dims_bias); } @@ -137,7 +135,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { fuseType.type = NEURON_INT32; fuseType.dimensionCount = 0; std::vector<uint32_t> dims_int32 = {0}; - NeuronModel_addOperand(model, &fuseType); // 3: fuse + NeuronModel_addOperand(model, &fuseType); // Operand 3: fuse std::shared_ptr<Node> fuse_node = nullptr; fuse_node = graph->Add(w_name + "_fuse", dims_int32); @@ -147,12 +145,13 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { outType.scale = out_scale; outType.zeroPoint = 128; outType.dimensionCount = 2; - std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0], out_dims[1]}; + std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0], + (uint32_t)out_dims[1]}; outType.dimensions = &dims_out[0]; VLOG(3) << "out_scale: " << out_scale << ", outType: " << outType.dimensions[0] << " : " << outType.dimensions[1]; - NeuronModel_addOperand(model, &outType); // output + NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> out_node = nullptr; out_node = graph->Add(out_name, dims_out); @@ -190,29 +189,31 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronModel_setOperandValue(model, bias_node->index(), bias->raw_data(), - bias->memory_size()); // 2: bias + bias->memory_size()); // Operand 2: bias } else { auto int32_bias = std::make_shared<Tensor>(); int32_bias->Resize({1, out_dims[1]}); int32_bias->mutable_data<int32_t>(); memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size()); VLOG(3) << "default: " << int32_bias->memory_size(); - neuron_errCode = - NeuronModel_setOperandValue(model, - bias_node->index(), - int32_bias->raw_data(), - int32_bias->memory_size()); // 2: bias + neuron_errCode = NeuronModel_setOperandValue( + model, + bias_node->index(), + int32_bias->raw_data(), + int32_bias->memory_size()); // Operand 2: bias bias_node->set_data(int32_bias); } // Add fuse value int32_t fuse_val[1] = {0}; - NeuronModel_setOperandValue( - model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); // 3: fuse - - std::vector<uint32_t> addInIndex = {in_node->index(), - w_node->index(), - bias_node->index(), - fuse_node->index()}; + NeuronModel_setOperandValue(model, + fuse_node->index(), + fuse_val, + sizeof(int32_t) * 1); // Operand 3: fuse + + std::vector<uint32_t> addInIndex = {in_node->index(), // 0: input + w_node->index(), // 1: weight + bias_node->index(), // 2: bias + fuse_node->index()}; // 3: fuse std::vector<uint32_t> addOutIndex = {out_node->index()}; neuron_errCode = NeuronModel_addOperation(model, NEURON_FULLY_CONNECTED,

diff --git a/lite/kernels/apu/bridges/graph.cc b/lite/kernels/apu/bridges/graph.cc old mode 100644 new mode 100755 index 515853aa26a1d84339c61047b5d3be20478b5ca3..ee7c92d2c2b9399b44fffd2fe8ad80618f3de526 --- a/lite/kernels/apu/bridges/graph.cc +++ b/lite/kernels/apu/bridges/graph.cc @@ -28,7 +28,7 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) { LOG(FATAL) << "[APU] Node " << name << " is redefined."; return -1; } else { - VLOG(3) << " Add: " << name << " : " << node->index(); + VLOG(5) << " Add: " << name << " : " << node->index(); auto ret = nodes_.insert( std::make_pair(name, std::vector<std::shared_ptr<Node>>())); CHECK(ret.second);

diff --git a/lite/kernels/apu/bridges/paddle_use_bridges.h b/lite/kernels/apu/bridges/paddle_use_bridges.h old mode 100644 new mode 100755 index e3e68afc6c7c18d2b8d68361ac09de2abf2b684c..264ca8160ae4343eda7b8c7424cf26c0257512d8 --- a/lite/kernels/apu/bridges/paddle_use_bridges.h +++ b/lite/kernels/apu/bridges/paddle_use_bridges.h @@ -22,3 +22,6 @@ USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU); USE_SUBGRAPH_BRIDGE(fc, kAPU); USE_SUBGRAPH_BRIDGE(pool2d, kAPU); USE_SUBGRAPH_BRIDGE(softmax, kAPU); +USE_SUBGRAPH_BRIDGE(concat, kAPU); +USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kAPU); +USE_SUBGRAPH_BRIDGE(conv2d_transpose, kAPU);
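Both the fc and conv_transpose bridges above quantize the float bias into the int32 operand with bias_scale = input_scale * weight_scale, as the comments in conv_op.cc spell out. The float2int32 helper they call lives in bridges/utility.h and its exact signature is not shown in this diff, so the following is only a hedged sketch of the computation it plausibly performs:

  // Assumed shape of the helper: one int32 value per output channel,
  // rounding bias[i] at scale input_scale * weight_scale[i] (a single
  // per-tensor scale degenerates to weight_scale.size() == 1).
  // Needs <cmath> for std::round and <vector> for std::vector.
  void Float2Int32Sketch(const float* bias, float input_scale,
                         const std::vector<float>& weight_scale,
                         size_t channel_count, int32_t* out) {
    for (size_t i = 0; i < channel_count; ++i) {
      float bias_scale = input_scale * weight_scale[i % weight_scale.size()];
      out[i] = static_cast<int32_t>(std::round(bias[i] / bias_scale));
    }
  }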
+USE_SUBGRAPH_BRIDGE(concat, kAPU);
+USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kAPU);
+USE_SUBGRAPH_BRIDGE(conv2d_transpose, kAPU);
diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc
index e2555180446920b670d98ebc3d82aa492ed244f4..20691ee737ec47528b800367dca8d615f0b878a6 100644
--- a/lite/kernels/apu/bridges/pool_op.cc
+++ b/lite/kernels/apu/bridges/pool_op.cc
@@ -47,14 +47,14 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
   std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
 
-  // pool mode
+  // Check pool mode
   if ((pooling_type == "max") || (pooling_type == "avg")) {
   } else {
     LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type;
     return FAILED;
   }
 
-  // pad mode
+  // Check padding mode
   int pad_mode = 0;
   std::string padding_algorithm("");
   if (op_info->HasAttr("padding_algorithm")) {
@@ -66,7 +66,7 @@
     pad_mode = 5;
   }
 
-  // paddings and strides
+  // Check paddings and strides
   if (paddings.size() == 2L) {
     for (size_t i = 0; i < 2L; ++i) {
       int copy_pad = *(paddings.begin() + 2 * i);
@@ -107,60 +107,59 @@
   xType.dimensions = &dims_x[0];
   std::shared_ptr<Node> x_node = nullptr;
   if (graph->Has(x_name)) {
-    LOG(INFO) << "Graph has " << x_name;
-    // input operand already exist
+    VLOG(3) << "Graph has " << x_name;
     x_node = graph->Get(x_name);
   } else {
-    // add input operand
-    NeuronModel_addOperand(model, &xType);  // 0: x
+    NeuronModel_addOperand(model, &xType);  // Operand 0: x
     x_node = graph->Add(x_name, dims_x);
   }
   VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0]
           << ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":"
           << xType.dimensions[3];
 
+  VLOG(3) << "ksize:" << ksize[0] << ":" << ksize[1];
+
   NeuronOperandType int32Type;
   int32Type.type = NEURON_INT32;
   int32Type.dimensionCount = 0;
   std::vector<uint32_t> dims_int32 = {0};
 
   std::shared_ptr<Node> paddingL_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 1: padding left
+  NeuronModel_addOperand(model, &int32Type);  // Operand 1: padding left
   paddingL_node = graph->Add(x_name + "_padding_left", dims_int32);
 
   std::shared_ptr<Node> paddingR_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 2: padding right
+  NeuronModel_addOperand(model, &int32Type);  // Operand 2: padding right
   paddingR_node = graph->Add(x_name + "_padding_right", dims_int32);
 
   std::shared_ptr<Node> paddingT_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 3: padding top
+  NeuronModel_addOperand(model, &int32Type);  // Operand 3: padding top
   paddingT_node = graph->Add(x_name + "_padding_top", dims_int32);
 
   std::shared_ptr<Node> paddingB_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 4: padding bottom
+  NeuronModel_addOperand(model, &int32Type);  // Operand 4: padding bottom
   paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32);
 
   std::shared_ptr<Node> strideW_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 5: stride width
+  NeuronModel_addOperand(model, &int32Type);  // Operand 5: stride width
   strideW_node = graph->Add(x_name + "_stride_width", dims_int32);
 
   std::shared_ptr<Node> strideH_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 6: stride height
+  NeuronModel_addOperand(model, &int32Type);  // Operand 6: stride height
   strideH_node = graph->Add(x_name + "_stride_height", dims_int32);
 
   std::shared_ptr<Node> filterW_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 7: filter width
+  NeuronModel_addOperand(model, &int32Type);  // Operand 7: filter width
   filterW_node = graph->Add(x_name + "_filter_width", dims_int32);
 
   std::shared_ptr<Node> filterH_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 8: filter height
+  NeuronModel_addOperand(model, &int32Type);  // Operand 8: filter height
   filterH_node = graph->Add(x_name + "_filter_height", dims_int32);
 
   std::shared_ptr<Node> fuse_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 9: fuse
-  fuse_node = graph->Add(x_name + "_fuse", dims_int32);
+  NeuronModel_addOperand(model, &int32Type);  // Operand 9: fuse
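+  // Key the node as x_name + "_pool_fuse" so it cannot collide with an
+  // existing x_name + "_fuse" node (Graph::Add treats redefinition as fatal).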
+  fuse_node = graph->Add(x_name + "_pool_fuse", dims_int32);
 
-  // Add out type
   // Add output tensor type
   NeuronOperandType outType;
   outType.type = NEURON_TENSOR_QUANT8_ASYMM;
@@ -176,10 +175,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (graph->Has(out_name)) {
     out_node = graph->Get(out_name);
   } else {
-    NeuronModel_addOperand(model, &outType);  // out
+    NeuronModel_addOperand(model, &outType);
     out_node = graph->Add(out_name, dims_out);
   }
-  VLOG(3) << "output_scale: " << x_scale
+  VLOG(3) << "output_scale: " << out_scale
           << ", outType: " << outType.dimensions[0] << ":"
           << outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
           << outType.dimensions[3];
@@ -201,19 +200,21 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
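+  // Note: Paddle orders strides and ksize as {height, width}.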
   // Add Stride
   int32_t stride_val[1];
-  stride_val[0] = strides[1];  // width
+  stride_val[0] = strides[1];  // Entry 1: width stride
   NeuronModel_setOperandValue(
       model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
-  stride_val[0] = strides[0];  // height
+  stride_val[0] = strides[0];  // Entry 0: height stride
   NeuronModel_setOperandValue(
       model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
 
   // Add filter
   int32_t filter_val[1];
-  filter_val[0] = global_pooling ? x_dims[3] : ksize[1];  // width
+  filter_val[0] =
+      global_pooling ? x_dims[3] : ksize[1];  // Entry 1: filter width
   NeuronModel_setOperandValue(
       model, filterW_node->index(), filter_val, sizeof(int32_t) * 1);
-  filter_val[0] = global_pooling ? x_dims[2] : ksize[0];  // height
+  filter_val[0] =
+      global_pooling ? x_dims[2] : ksize[0];  // Entry 0: filter height
   NeuronModel_setOperandValue(
       model, filterH_node->index(), filter_val, sizeof(int32_t) * 1);
diff --git a/lite/kernels/apu/bridges/softmax_op.cc b/lite/kernels/apu/bridges/softmax_op.cc
index 4b2a465cd6e48d9d387f0b2195b04728890601ca..177f778ea7dbfc77f389a76ed236a975a9cfe314 100644
--- a/lite/kernels/apu/bridges/softmax_op.cc
+++ b/lite/kernels/apu/bridges/softmax_op.cc
@@ -64,12 +64,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   xType.dimensions = &dims_x[0];
   std::shared_ptr<Node> x_node = nullptr;
   if (graph->Has(x_name)) {
-    // input operand already exist
     x_node = graph->Get(x_name);
     VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index();
   } else {
-    // add input operand
-    NeuronModel_addOperand(model, &xType);  // 0: input
+    NeuronModel_addOperand(model, &xType);  // Operand 0: input
     x_node = graph->Add(x_name, dims_x);
   }
   VLOG(3) << "input_scale size: " << input_scale
@@ -80,7 +78,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   NeuronOperandType betaType;
   betaType.type = NEURON_FLOAT32;
   betaType.dimensionCount = 0;
-  NeuronModel_addOperand(model, &betaType);  // 1: beta
+  NeuronModel_addOperand(model, &betaType);  // Operand 1: beta
   std::shared_ptr<Node> beta_node = nullptr;
   beta_node = graph->Add(x_name + "_beta", dims_int32);
 
@@ -88,7 +86,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   NeuronOperandType axisType;
   axisType.type = NEURON_INT32;
   axisType.dimensionCount = 0;
-  NeuronModel_addOperand(model, &axisType);  // 2: axis
+  NeuronModel_addOperand(model, &axisType);  // Operand 2: axis
   std::shared_ptr<Node> axis_node = nullptr;
   axis_node = graph->Add(x_name + "_axis", dims_int32);
 
@@ -99,7 +97,7 @@
   outType.zeroPoint = 128;
   outType.dimensionCount = x_dims.size();
   outType.dimensions = &dims_x[0];
-  NeuronModel_addOperand(model, &outType);  // 3: output
+  NeuronModel_addOperand(model, &outType);  // Operand 3: output
   std::shared_ptr<Node> out_node = nullptr;
   out_node = graph->Add(out_name, dims_x);
   VLOG(3) << "out_scale: " << out_scale;
@@ -112,8 +110,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   axis_val[0] = axis;
   NeuronModel_setOperandValue(
       model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
-  std::vector<uint32_t> addInIndex = {
-      x_node->index(), beta_node->index(), axis_node->index()};
+  std::vector<uint32_t> addInIndex = {x_node->index(),     // 0: input
+                                      beta_node->index(),  // 1: beta
+                                      axis_node->index()};  // 2: axis
   std::vector<uint32_t> addOutIndex = {out_node->index()};
   int neuron_errCode = NeuronModel_addOperation(model,
                                                 NEURON_SOFTMAX,
diff --git a/lite/kernels/apu/bridges/utility.cc b/lite/kernels/apu/bridges/utility.cc
index c91e81476e519a28ebf851f42f2916c9d7c38dd8..f9cd04b71805bc29a7da4450d1f9235c5cf5d64a 100644
--- a/lite/kernels/apu/bridges/utility.cc
+++ b/lite/kernels/apu/bridges/utility.cc
@@ -39,22 +39,43 @@ bool HasInputArg(const OpInfo* op_info,
     }
   }
 
-void insert_transpose_node(void* ctx,
-                           const std::string& input_name,
-                           const std::string& output_name,
-                           std::vector<uint32_t> input_shape,
-                           std::vector<uint32_t> output_shape,
-                           std::vector<int32_t> axis,
-                           float scale,
-                           int32_t zeroPoint) {
+int insert_requant_node(void* ctx,
+                        const std::string& input_name,
+                        const std::string& output_name,
+                        std::vector<uint32_t> input_shape,
+                        std::vector<uint32_t> output_shape,
+                        float scale_in,
+                        float scale_out,
+                        int32_t zeroPoint) {
   int neuron_errCode;
   auto graph = static_cast<Graph*>(ctx);
   auto model = graph->model();
 
+  uint32_t numDevices = 0;
+  CHECK_EQ(Neuron_getDeviceCount(&numDevices), NEURON_NO_ERROR);
+  CHECK_GT(numDevices, (uint32_t)0);
+
+  NeuronDevice* targetDevice = nullptr;
+
+  for (uint32_t i = 0; i < numDevices; ++i) {
+    NeuronDevice* device = nullptr;
+    Neuron_getDevice(i, &device);
+    const char* name;
+    NeuronDevice_getName(device, &name);
+    if (0 == strcmp(name, "mtk-dsp")) {
+      targetDevice = device;
+      break;
+    }
+  }
+  if (targetDevice == nullptr) {
+    LOG(FATAL) << "Failed to insert mtk_requant op!";
+    return -1;
+  }
+
   // Add input
   NeuronOperandType inType;
   inType.type = NEURON_TENSOR_QUANT8_ASYMM;
-  inType.scale = scale;
+  inType.scale = scale_in;
   inType.zeroPoint = zeroPoint;
   inType.dimensionCount = input_shape.size();
   inType.dimensions = &input_shape[0];
@@ -64,15 +85,81 @@ void insert_transpose_node(void* ctx,
     VLOG(3) << "Has " << input_name;
     input_node = graph->Get(input_name);
   } else {
-    neuron_errCode = NeuronModel_addOperand(model, &inType);  // input
+    neuron_errCode = NeuronModel_addOperand(model, &inType);
    if (NEURON_NO_ERROR != neuron_errCode) {
-      LOG(WARNING) << "Insert transpose op fail!";
-      return;
+      LOG(FATAL) << "Failed to insert mtk_requant op!";
+      return -1;
     }
     VLOG(3) << "Add " << input_name;
     input_node = graph->Add(input_name, input_shape);
   }
 
+  // Add output
+  NeuronOperandType outType;
+  outType.type = NEURON_TENSOR_QUANT8_ASYMM;
+  outType.scale = scale_out;
+  outType.zeroPoint = zeroPoint;
+  outType.dimensionCount = output_shape.size();
+  outType.dimensions = &output_shape[0];
+
+  NeuronModel_addOperand(model, &outType);
+  std::shared_ptr<Node> output_node = nullptr;
+  output_node = graph->Add(output_name, output_shape);
+
+  std::vector<uint32_t> addInIndex = {input_node->index()};
+
+  std::vector<uint32_t> addOutIndex = {output_node->index()};
+
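+  // MTK_REQUANTIZE is a MediaTek vendor extension op, so it is added with
+  // NeuronModel_addOperationExtension and pinned to the mtk-dsp device
+  // located above.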
+  neuron_errCode = NeuronModel_addOperationExtension(model,
+                                                     "MTK_REQUANTIZE",
+                                                     "mediatek",
+                                                     targetDevice,
+                                                     addInIndex.size(),
+                                                     &addInIndex[0],
+                                                     addOutIndex.size(),
+                                                     &addOutIndex[0]);
+  if (NEURON_NO_ERROR != neuron_errCode) {
+    LOG(FATAL) << "Failed to insert mtk_requant op!";
+    return -1;
+  }
+
+  return 0;
+}
+
+int insert_transpose_node(void* ctx,
+                          const std::string& input_name,
+                          const std::string& output_name,
+                          std::vector<uint32_t> input_shape,
+                          std::vector<uint32_t> output_shape,
+                          std::vector<int32_t> axis,
+                          float scale,
+                          int32_t zeroPoint) {
+  int neuron_errCode;
+  auto graph = static_cast<Graph*>(ctx);
+  auto model = graph->model();
+
+  // Add input
+  NeuronOperandType inType;
+  inType.type = NEURON_TENSOR_QUANT8_ASYMM;
+  inType.scale = scale;
+  inType.zeroPoint = zeroPoint;
+  inType.dimensionCount = input_shape.size();
+  inType.dimensions = &input_shape[0];
+
+  std::shared_ptr<Node> input_node = nullptr;
+  if (graph->Has(input_name)) {
+    VLOG(5) << "Has " << input_name;
+    input_node = graph->Get(input_name);
+  } else {
+    neuron_errCode = NeuronModel_addOperand(model, &inType);
+    if (NEURON_NO_ERROR != neuron_errCode) {
+      LOG(FATAL) << "Failed to insert transpose op!";
+      return -1;
+    }
+    VLOG(5) << "Add " << input_name;
+    input_node = graph->Add(input_name, input_shape);
+  }
+
   // Add perm
   NeuronOperandType permsType;
   permsType.type = NEURON_TENSOR_INT32;
@@ -80,22 +167,22 @@ void insert_transpose_node(void* ctx,
   uint32_t dims_perms[1] = {4};
   permsType.dimensions = dims_perms;
 
-  neuron_errCode = NeuronModel_addOperand(model, &permsType);  // perm
+  neuron_errCode = NeuronModel_addOperand(model, &permsType);
   if (NEURON_NO_ERROR != neuron_errCode) {
-    LOG(WARNING) << "Insert transpose op fail!";
-    return;
+    LOG(FATAL) << "Failed to insert transpose op!";
+    return -1;
   }
   std::shared_ptr<Node> perms_node = nullptr;
   perms_node = graph->Add(input_name + "_perms", {4});
 
-  VLOG(3) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":"
+  VLOG(5) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":"
           << axis[3];
-  //                              &axis[0], sizeof(int32_t) * axis.size());
+
   neuron_errCode = NeuronModel_setOperandValue(
       model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size());
   if (NEURON_NO_ERROR != neuron_errCode) {
-    LOG(WARNING) << "Insert transpose op fail!";
-    return;
+    LOG(FATAL) << "Failed to insert transpose op!";
+    return -1;
   }
 
   // Add output
@@ -106,7 +193,7 @@ void insert_transpose_node(void* ctx,
   outType.dimensionCount = output_shape.size();
   outType.dimensions = &output_shape[0];
 
-  NeuronModel_addOperand(model, &outType);  // output
+  NeuronModel_addOperand(model, &outType);
   std::shared_ptr<Node> output_node = nullptr;
   output_node = graph->Add(output_name, output_shape);
 
@@ -123,8 +210,10 @@ void insert_transpose_node(void* ctx,
                                      &addOutIndex[0]);
 
   if (NEURON_NO_ERROR != neuron_errCode) {
-    LOG(WARNING) << "Insert transpose op fail!";
+    LOG(FATAL) << "Failed to insert transpose op!";
   }
+
+  return 0;
 }
 
 void transpose(const int8_t* input_data,
@@ -135,9 +224,9 @@
   int new_index = -1;
   int dim[4] = {0};
   std::vector<uint32_t> shape = input_shape;
-  VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
+  VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
           << ":" << input_shape[3];
-  VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
+  VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
   for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
     for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
       for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
@@ -164,9 +253,9 @@ void transposeAsym(const int8_t* input_data,
   int new_index = -1;
   int dim[4] = {0};
   std::vector<uint32_t> shape = input_shape;
-  VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
+  VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
           << ":" << input_shape[3];
-  VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
+  VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
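+  // Scatter each int8 element to its permuted position as asymmetric uint8.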
axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { @@ -177,8 +266,8 @@ void transposeAsym(const int8_t* input_data, dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; - - output_data[new_index] = input_data[old_index] + 128; // per layer + // Per layer op is asym op and need to add 128 + output_data[new_index] = input_data[old_index] + 128; } } } diff --git a/lite/kernels/apu/bridges/utility.h b/lite/kernels/apu/bridges/utility.h old mode 100644 new mode 100755 index 01752d181964bfb0e19f4319b52727b1ab541ee7..ff9c75711c22cebc15f8b0f3b14d11dc8e6c62f1 --- a/lite/kernels/apu/bridges/utility.h +++ b/lite/kernels/apu/bridges/utility.h @@ -33,14 +33,23 @@ bool HasInputArg(const OpInfo* op_info, const Scope* scope, const std::string& argname); -void insert_transpose_node(void* ctx, - const std::string& input_name, - const std::string& output_name, - std::vector input_shape, - std::vector output_shape, - std::vector axis, - float scale, - int32_t zeroPoint); +int insert_requant_node(void* ctx, + const std::string& input_name, + const std::string& output_name, + std::vector input_shape, + std::vector output_shape, + float scale_in, + float scale_out, + int32_t zeroPoint); + +int insert_transpose_node(void* ctx, + const std::string& input_name, + const std::string& output_name, + std::vector input_shape, + std::vector output_shape, + std::vector axis, + float scale, + int32_t zeroPoint); void transpose(const int8_t* input_data, uint8_t* output_data, diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc old mode 100644 new mode 100755 index 698536743d3225aaf2ebd4e3a6a75ee3f3c1ef1f..5e86514478f421ece6642afdd0bfaab4025420bb --- a/lite/kernels/apu/subgraph_compute.cc +++ b/lite/kernels/apu/subgraph_compute.cc @@ -33,6 +33,14 @@ bool SubgraphEngine::BuildDeviceProgram() { BuildOriginProgram(); } + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + + auto start_time = GetCurrentUS(); + unsigned int version; Neuron_getVersion(&version); VLOG(3) << "Neuron Adapter version: " << version; @@ -108,18 +116,16 @@ bool SubgraphEngine::BuildDeviceProgram() { } VLOG(3) << "[APU] APU NIR model created!"; - auto GetCurrentUS = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; - }; - auto start_time = GetCurrentUS(); + VLOG(1) << "[APU] APU NIR model created, Create cost " + << GetCurrentUS() - start_time << " us"; + + start_time = GetCurrentUS(); compilation_ = lite::apu::Device::Global().Build(model_); if (compilation_ == nullptr) { LOG(WARNING) << "[APU] Build APU DLA model failed!"; return false; } - VLOG(3) << "[APU] APU DLA model created, Build cost " + VLOG(1) << "[APU] APU DLA model created, Build cost " << GetCurrentUS() - start_time << " us"; return true; } @@ -176,7 +182,7 @@ bool SubgraphEngine::LaunchDeviceProgram() { } } NeuronExecution_free(run); - VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; + VLOG(1) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; return true; }