Unverified commit aaba6447, authored by hong19860320, committed by GitHub

[cherry-pick][APU] Mtk apu add more OPs (#4287) (#4451)

Parent 5a5794b3
@@ -82,16 +82,20 @@ void NeuronAdapter::InitFunctions() {
  PADDLE_DLSYM(NeuronModel_setOperandValue);
  PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams);
  PADDLE_DLSYM(NeuronModel_addOperation);
  PADDLE_DLSYM(NeuronModel_addOperationExtension);
  PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
  PADDLE_DLSYM(NeuronCompilation_create);
  PADDLE_DLSYM(NeuronCompilation_free);
  PADDLE_DLSYM(NeuronCompilation_finish);
  PADDLE_DLSYM(NeuronCompilation_createForDevices);
  PADDLE_DLSYM(NeuronExecution_create);
  PADDLE_DLSYM(NeuronExecution_free);
  PADDLE_DLSYM(NeuronExecution_setInput);
  PADDLE_DLSYM(NeuronExecution_setOutput);
  PADDLE_DLSYM(NeuronExecution_compute);
  PADDLE_DLSYM(Neuron_getDeviceCount);
  PADDLE_DLSYM(Neuron_getDevice);
  PADDLE_DLSYM(NeuronDevice_getName);
#undef PADDLE_DLSYM
}
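For context, PADDLE_DLSYM is defined just above this hunk and is not part of the diff; judging from how it is used here, it presumably expands to a dlsym lookup plus a null check, roughly like the sketch below (the handle_ member name is an assumption):

#define PADDLE_DLSYM(function)                                        \
  do {                                                                \
    /* Resolve the symbol from the already-dlopen'ed Neuron library */ \
    function##_ =                                                     \
        reinterpret_cast<function##_Type>(dlsym(handle_, #function)); \
    if (function##_ == nullptr) {                                     \
      LOG(WARNING) << "Cannot find " << #function;                    \
    }                                                                 \
  } while (false)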
@@ -146,6 +150,25 @@ int NeuronModel_addOperation(NeuronModel* model,
      model, type, inputCount, inputs, outputCount, outputs);
}
int NeuronModel_addOperationExtension(NeuronModel* model,
const char* name,
const char* vendor,
const NeuronDevice* device,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_addOperationExtension()(model,
name,
vendor,
device,
inputCount,
inputs,
outputCount,
outputs);
}
int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
                                         uint32_t inputCount,
                                         const uint32_t* inputs,
@@ -172,6 +195,15 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) {
      compilation);
}
int NeuronCompilation_createForDevices(NeuronModel* model,
const NeuronDevice* const* devices,
uint32_t numDevices,
NeuronCompilation** compilation) {
return paddle::lite::NeuronAdapter::Global()
->NeuronCompilation_createForDevices()(
model, devices, numDevices, compilation);
}
int NeuronExecution_create(NeuronCompilation* compilation,
                           NeuronExecution** execution) {
  return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()(
@@ -205,3 +237,18 @@ int NeuronExecution_compute(NeuronExecution* execution) {
  return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()(
      execution);
}
int Neuron_getDeviceCount(uint32_t* numDevices) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDeviceCount()(
numDevices);
}
int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDevice()(devIndex,
device);
}
int NeuronDevice_getName(const NeuronDevice* device, const char** name) {
return paddle::lite::NeuronAdapter::Global()->NeuronDevice_getName()(device,
name);
}
@@ -42,12 +42,25 @@ class NeuronAdapter final {
                                            const uint32_t *,
                                            uint32_t,
                                            const uint32_t *);
using NeuronModel_addOperationExtension_Type = int (*)(NeuronModel *,
const char *,
const char *,
const NeuronDevice *,
uint32_t,
const uint32_t *,
uint32_t,
const uint32_t *);
  using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
      NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
  using NeuronCompilation_create_Type = int (*)(NeuronModel *,
                                                NeuronCompilation **);
  using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
  using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
using NeuronCompilation_createForDevices_Type =
int (*)(NeuronModel *,
const NeuronDevice *const *,
uint32_t,
NeuronCompilation **);
  using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
                                              NeuronExecution **);
  using NeuronExecution_free_Type = void (*)(NeuronExecution *);
@@ -59,6 +72,10 @@ class NeuronAdapter final {
  using NeuronExecution_setOutput_Type = int (*)(
      NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t);
  using NeuronExecution_compute_Type = int (*)(NeuronExecution *);
using Neuron_getDeviceCount_Type = int (*)(uint32_t *);
using Neuron_getDevice_Type = int (*)(uint32_t, NeuronDevice **);
using NeuronDevice_getName_Type = int (*)(const NeuronDevice *,
const char **);
  Neuron_getVersion_Type Neuron_getVersion() {
    CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
@@ -105,6 +122,12 @@ class NeuronAdapter final {
    return NeuronModel_addOperation_;
  }
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() {
CHECK(NeuronModel_addOperationExtension_ != nullptr)
<< "Cannot load NeuronModel_addOperationExtension!";
return NeuronModel_addOperationExtension_;
}
  NeuronModel_identifyInputsAndOutputs_Type
  NeuronModel_identifyInputsAndOutputs() {
    CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
@@ -130,6 +153,12 @@ class NeuronAdapter final {
    return NeuronCompilation_finish_;
  }
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() {
CHECK(NeuronCompilation_createForDevices_ != nullptr)
<< "Cannot load NeuronCompilation_createForDevices!";
return NeuronCompilation_createForDevices_;
}
  NeuronExecution_create_Type NeuronExecution_create() {
    CHECK(NeuronExecution_create_ != nullptr)
        << "Cannot load NeuronExecution_create!";
@@ -160,6 +189,23 @@ class NeuronAdapter final {
    return NeuronExecution_compute_;
  }
Neuron_getDeviceCount_Type Neuron_getDeviceCount() {
CHECK(Neuron_getDeviceCount_ != nullptr)
<< "Cannot load Neuron_getDeviceCount!";
return Neuron_getDeviceCount_;
}
Neuron_getDevice_Type Neuron_getDevice() {
CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!";
return Neuron_getDevice_;
}
NeuronDevice_getName_Type NeuronDevice_getName() {
CHECK(NeuronDevice_getName_ != nullptr)
<< "Cannot load NeuronDevice_getName!";
return NeuronDevice_getName_;
}
 private:
  NeuronAdapter();
  NeuronAdapter(const NeuronAdapter &) = delete;
@@ -176,16 +222,23 @@ class NeuronAdapter final {
  NeuronModel_setOperandSymmPerChannelQuantParams_Type
      NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr};
  NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr};
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension_{
nullptr};
  NeuronModel_identifyInputsAndOutputs_Type
      NeuronModel_identifyInputsAndOutputs_{nullptr};
  NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
  NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
  NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{
nullptr};
  NeuronExecution_create_Type NeuronExecution_create_{nullptr};
  NeuronExecution_free_Type NeuronExecution_free_{nullptr};
  NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
  NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr};
  NeuronExecution_compute_Type NeuronExecution_compute_{nullptr};
Neuron_getDeviceCount_Type Neuron_getDeviceCount_{nullptr};
Neuron_getDevice_Type Neuron_getDevice_{nullptr};
NeuronDevice_getName_Type NeuronDevice_getName_{nullptr};
};
}  // namespace lite
}  // namespace paddle
@@ -14,6 +14,8 @@ lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_br
lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_apu SRCS concat_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_transpose_op_apu SRCS conv_transpose_op.cc DEPS ${apu_subgraph_bridge_deps})
set(apu_subgraph_bridges
@@ -25,6 +27,8 @@ set(apu_subgraph_bridges
    subgraph_bridge_softmax_op_apu
    subgraph_bridge_fc_op_apu
    subgraph_bridge_pool_op_apu
    subgraph_bridge_conv_transpose_op_apu
    subgraph_bridge_concat_op_apu
    CACHE INTERNAL "apu_subgraph_bridges")
message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
int neuron_errCode;
VLOG(3) << "[APU] Converting [" << op_type << "]";
// Get input and output vars and op attributes
auto x_names = op_info->Input("X");
auto out_name = op_info->Output("Out").front();
auto axis = op_info->GetAttr<int>("axis");
auto num = x_names.size();
// Process data layout axis change
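  // (All inputs are relaid out NCHW -> NHWC with the {0, 2, 3, 1} transpose
  // below, so a concat axis given in NCHW coordinates maps to NHWC as
  // C(1) -> 3, H(2) -> 1, W(3) -> 2; the batch axis 0 is unchanged.)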
if (axis == 1)
axis = 3;
else if (axis == 2)
axis = 1;
else if (axis == 3)
axis = 2;
// Limitation:
// All input tensors of NEURON_TENSOR_QUANT8_ASYMM must
// have the same scale and zeroPoint as the output tensor
CHECK(op_info->HasOutputScale(out_name));
auto output_scale = op_info->GetOutputScale(out_name)[0];
// Traverse all of input nodes
std::vector<std::shared_ptr<Node>> input_nodes;
NeuronOperandType xType;
for (auto& x_name : x_names) {
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
std::shared_ptr<Node> x_node = nullptr;
CHECK(op_info->HasInputScale(x_name));
auto input_scale = op_info->GetInputScale(x_name)[0];
// Add x tensor type
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
xType.scale = input_scale;
xType.zeroPoint = 128;
xType.dimensionCount = x_dims.size();
std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3],
(uint32_t)x_dims[1]};
xType.dimensions = &dims_x[0];
if (graph->Has(x_name)) {
VLOG(3) << "Graph has " << x_name;
if (graph->IsInput(x_name)) {
VLOG(3) << x_name << "is input and already exist";
x_name = "transpose_" + x_name;
}
if (graph->IsOutput(x_name)) {
VLOG(3) << x_name << "is input and output node";
x_name = "transpose_" + x_name;
}
x_node = graph->Get(x_name);
} else {
// Add input operand
if (graph->IsInput(x_name)) {
// Insert transpose for NCHW -> NHWC
insert_transpose_node(ctx,
x_name,
"transpose_" + x_name,
{(uint32_t)x_dims[0],
(uint32_t)x_dims[1],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3]},
dims_x,
{0, 2, 3, 1},
xType.scale,
xType.zeroPoint);
// Change x_name because we add transpose op
x_name = "transpose_" + x_name;
x_node = graph->Get(x_name);
} else {
NeuronModel_addOperand(model, &xType);
x_node = graph->Add(x_name, dims_x);
}
} // End of else
if (x_node == nullptr) return subgraph::FAILED;
input_nodes.push_back(x_node);
VLOG(3) << "input node x: " << x_node->index()
<< ": input_scale: " << input_scale << " x_dims:" << x_dims[0]
<< ":" << x_dims[1] << ":" << x_dims
<< ", inType: " << xType.dimensions[0] << ":" << xType.dimensions[1]
<< ":" << xType.dimensions[2] << ":" << xType.dimensions[3];
} // End of for
if (input_nodes.size() != num) {
LOG(WARNING) << "Create input operand failed!";
return subgraph::FAILED;
}
// Add axis operand type
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {1};
// Add axis operand
std::shared_ptr<Node> axis_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // axis
axis_node = graph->Add(out_name + "_axis", dims_int32);
VLOG(3) << "axis:" << axis;
// Add out operand type
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = output_scale;
outType.zeroPoint = 128;
outType.dimensionCount = out_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3],
(uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0];
// Add out operand
std::shared_ptr<Node> out_node = nullptr;
if (graph->Has(out_name)) {
out_node = graph->Get(out_name);
} else {
if (graph->IsOutput(out_name)) {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add("transpose_" + out_name, dims_out);
} else {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add(out_name, dims_out);
}
}
VLOG(3) << "out node idx: " << out_node->index()
<< ": output_scle: " << outType.scale
<< ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Set axis value
int32_t axis_val[1] = {(int32_t)axis};
NeuronModel_setOperandValue(
model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex;
for (auto& node : input_nodes) {
addInIndex.push_back(node->index());
}
addInIndex.push_back(axis_node->index());
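  // NEURON_CONCATENATION expects the n input tensors followed by a single
  // scalar axis operand, and produces one output tensor.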
std::vector<uint32_t> addOutIndex = {out_node->index()};
neuron_errCode = NeuronModel_addOperation(model,
NEURON_CONCATENATION,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Add op fail:" << op_type;
return subgraph::FAILED;
}
if (graph->IsOutput(out_name)) {
// Insert transpose for NHWC -> NCHW
insert_transpose_node(ctx,
"transpose_" + out_name,
out_name,
dims_out,
{(uint32_t)out_dims[0],
(uint32_t)out_dims[1],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3]},
{0, 3, 1, 2},
outType.scale,
outType.zeroPoint);
out_node = graph->Get(out_name);
if (out_node == nullptr) return subgraph::FAILED;
}
return SUCCESS;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kAPU,
paddle::lite::subgraph::apu::ConcatConverter);
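The converters in this patch lean heavily on insert_transpose_node from lite/kernels/apu/bridges/utility.h, which is not part of this diff. Judging only from the call sites above, its signature is presumably along these lines (a sketch, not the actual declaration):

int insert_transpose_node(void* ctx,                       // subgraph Graph*
                          const std::string& input_name,   // operand to read
                          const std::string& output_name,  // operand to create
                          std::vector<uint32_t> input_shape,
                          std::vector<uint32_t> output_shape,
                          std::vector<int32_t> axis,       // e.g. {0, 2, 3, 1}
                          float scale,                     // quantization scale
                          int32_t zero_point);             // quantization zero point

It appears to add a transpose operation so that externally NCHW tensors can be consumed and produced by the NHWC-layout Neuron ops.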
@@ -73,7 +73,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK_EQ(strides.size(), 2L);
  CHECK_EQ(dilations.size(), 2L);
  bool is_depthwise_mode = ic == groups && oc == groups;
  VLOG(3) << "is_depthwise_mode: " << is_depthwise_mode;

  if (paddings.size() == 2L) {
    for (size_t i = 0; i < strides.size(); ++i) {
@@ -103,6 +103,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  auto filter_scale = op_info->GetInputScale(filter_name);
  CHECK(op_info->HasOutputScale(output_name));
  auto output_scale = op_info->GetOutputScale(output_name)[0];
  auto orig_output_scale = op_info->GetOutputScale(output_name)[0];

  VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups
          << " ,dilations: " << dilations[0] << ":" << dilations[1];
@@ -128,23 +129,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  std::shared_ptr<Node> input_node = nullptr;
  if (graph->Has(input_name)) {
    VLOG(3) << "Graph has " << input_name;
    // Input operand already exists
    if (graph->IsInput(input_name)) {
      VLOG(3) << input_name << " is an input and already exists";
      input_name = "transpose_" + input_name;
    }
    if (graph->IsOutput(input_name)) {
      VLOG(3) << input_name << " is both an input and an output node";
      input_name = "transpose_" + input_name;
    }
    input_node = graph->Get(input_name);
  } else {
    // Add input operand
    if (graph->IsInput(input_name)) {
      // Insert transpose for NCHW -> NHWC
      insert_transpose_node(ctx,
                            input_name,
                            "transpose_" + input_name,
                            {(uint32_t)input_dims[0],
                             (uint32_t)input_dims[1],
                             (uint32_t)input_dims[2],
                             (uint32_t)input_dims[3]},
                            dims_in,
                            {0, 2, 3, 1},
                            inType.scale,
                            inType.zeroPoint);
      input_name = "transpose_" + input_name;
      input_node = graph->Get(input_name);
      if (input_node == nullptr) return subgraph::FAILED;
@@ -153,7 +163,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
      input_node = graph->Add(input_name, dims_in);
    }
  }
  VLOG(3) << "input node idx: " << input_node->index()
          << ": input_scale: " << input_scale
          << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1]
          << ":" << inType.dimensions[2] << ":" << inType.dimensions[3];
@@ -161,8 +171,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  // Add bias type
  NeuronOperandType biasType;

  // Add filter type, filter data re-layout NCHW -> NHWC
  Tensor transpose_filter;
  std::vector<uint32_t> dims_filter;
@@ -233,10 +242,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    biasType.scale = 0;
  }

  auto precision = filter->precision();
  std::shared_ptr<Node> filter_node = nullptr;
  if (1 == filter_scale.size()) {
    NeuronModel_addOperand(model, &filterType);
    filter_node = graph->Add(filter_name, dims_filter);  // Operand 1: filter
    VLOG(3) << "filter node idx: " << filter_node->index()
            << " filter_scale[0]: " << filter_scale[0]
            << ": filterType: " << filterType.dimensions[0]
            << ":" << filterType.dimensions[1] << ":"
@@ -251,7 +261,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
      return subgraph::FAILED;
    }
  } else {
    NeuronModel_addOperand(model, &channelFilterType);  // Operand 1: filter
    filter_node = graph->Add(filter_name, dims_filter);
    VLOG(3) << "channel filter node idx: " << filter_node->index()
            << " ,scale_count:" << filter_scale.size()
@@ -280,7 +290,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  // Add biasType node value
  // A 1-D tensor, of shape [depth_out], specifying the bias.
  // For filter tensor of NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL, the bias
  // should be of NEURON_TENSOR_INT32, with zeroPoint of 0
  // and bias_scale of 0. The actual scale of each value 'i' is equal
  // to bias_scale[i] = input_scale * filter_scale[i].
  biasType.type = NEURON_TENSOR_INT32;
@@ -296,16 +306,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    for (int i = 0; i < bias_dims.size(); i++)
      dims_bias.push_back(bias_dims[i]);
    biasType.dimensions = &dims_bias[0];
    NeuronModel_addOperand(model, &biasType);  // Operand 2: bias
    bias_node = graph->Add(bias_name, dims_bias);
    VLOG(3) << "node idx: " << bias_node->index()
            << ": Bias name: " << bias_name
            << " ,bias scale: " << biasType.scale
            << " ,dimensions: " << bias_dims;
  } else {
    biasType.dimensionCount = 1;
    dims_bias = {(uint32_t)output_dims[1]};
    biasType.dimensions = &dims_bias[0];
    NeuronModel_addOperand(model, &biasType);  // Operand 2: bias
    bias_node = graph->Add(filter_name + "_default_bias", dims_bias);
    VLOG(3) << "node idx: " << bias_node->index()
            << ": Bias name: default_bias "
            << " ,bias scale: " << biasType.scale
@@ -318,39 +329,51 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  std::vector<uint32_t> dims_int32 = {1};

  std::shared_ptr<Node> paddingL_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 3: padding left
  paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32);

  std::shared_ptr<Node> paddingR_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 4: padding right
  paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32);

  std::shared_ptr<Node> paddingT_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 5: padding top
  paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32);

  std::shared_ptr<Node> paddingB_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 6: padding bottom
  paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32);

  std::shared_ptr<Node> strideW_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 7: stride width
  strideW_node = graph->Add(filter_name + "_stride_width", dims_int32);

  std::shared_ptr<Node> strideH_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 8: stride height
  strideH_node = graph->Add(filter_name + "_stride_height", dims_int32);

  std::shared_ptr<Node> dm_node = nullptr;
  if (is_depthwise_mode) {
    NeuronModel_addOperand(model,
                           &int32Type);  // Operand 9: depthwise multiplier
    dm_node = graph->Add(filter_name + "_dm", dims_int32);
  }

  std::shared_ptr<Node> fuse_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 9/10: fuse
  fuse_node = graph->Add(filter_name + "_fuse", dims_int32);
/* Check output scale */
if (is_depthwise_mode) {
for (auto s : filter_scale) {
if (output_scale < s * input_scale)
output_scale = s * input_scale + 0.000001;
}
#ifdef LITE_MEDIATEK_APU_ENABLE_REQUANT
output_scale = orig_output_scale;
#endif
}
  // Add output tensor type
  NeuronOperandType outType;
  outType.type = NEURON_TENSOR_QUANT8_ASYMM;
@@ -366,12 +389,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  if (graph->Has(output_name)) {
    output_node = graph->Get(output_name);
  } else {
    // Add output operand
    NeuronModel_addOperand(model, &outType);
    if (orig_output_scale != output_scale) {
      // Need to insert requant op; the result is requant_ -> transpose_ ->
      // output
      output_node = graph->Add("requant_" + output_name, dims_out);
    } else if (graph->IsOutput(output_name)) {
      // Need to insert transpose op, transpose_ -> output
      output_node = graph->Add("transpose_" + output_name, dims_out);
    } else {
      output_node = graph->Add(output_name, dims_out);
    }
  }
@@ -433,10 +461,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  // Add Stride
  int32_t stride_val[1];
  stride_val[0] = strides[1];  // Entry 1: width stride
  NeuronModel_setOperandValue(
      model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
  stride_val[0] = strides[0];  // Entry 0: height stride
  NeuronModel_setOperandValue(
      model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
@@ -460,7 +488,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
        model, dm_node->index(), &dm, sizeof(int32_t) * 1);
    VLOG(3) << "depthwise multiplier:" << dm;

    // Depthwise conv case
    NeuronModel_setOperandValue(
        model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
    std::vector<uint32_t> addInIndex = {
@@ -512,19 +540,46 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    return FAILED;
  }
// Check if Requant OP is needed
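  // (The depthwise path above may have raised output_scale so that it is at
  // least input_scale * filter_scale for every channel; when that happened,
  // a requant op is inserted here to bring the result back to the scale,
  // orig_output_scale, that the rest of the graph expects.)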
std::shared_ptr<Node> requant_node = nullptr;
if (orig_output_scale != output_scale) {
std::string requant_out_name = output_name;
VLOG(3) << "Insert requant output scale, orig:" << orig_output_scale
<< " ,output_scale:" << output_scale;
if (graph->IsOutput(output_name)) {
requant_out_name = "transpose_" + output_name;
}
insert_requant_node(ctx,
"requant_" + output_name,
requant_out_name,
dims_out,
dims_out,
output_scale,
orig_output_scale,
outType.zeroPoint);
requant_node = graph->Get(requant_out_name);
if (requant_node == nullptr) return subgraph::FAILED;
}
std::shared_ptr<Node> transpose_node = nullptr;
  if (graph->IsOutput(output_name)) {
    VLOG(3) << "Add output transpose:" << output_name;
    // Insert transpose for NHWC -> NCHW
    insert_transpose_node(ctx,
                          "transpose_" + output_name,
                          output_name,
                          dims_out,
                          {(uint32_t)output_dims[0],
                           (uint32_t)output_dims[1],
                           (uint32_t)output_dims[2],
                           (uint32_t)output_dims[3]},
                          {0, 3, 1, 2},
                          outType.scale,
                          outType.zeroPoint);
    transpose_node = graph->Get(output_name);
    if (transpose_node == nullptr) return subgraph::FAILED;
  }

  return REBUILD_WHEN_SHAPE_CHANGED;
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/operators/conv_op.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ConvTransposeConverter(void *ctx, OpLite *op, KernelBase *kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph *>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
int neuron_errCode;
VLOG(3) << "[APU] Converting [" << op_type << "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input, output and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
CHECK_EQ(input_dims.size(), 4);
auto filter_name = op_info->Input("Filter").front();
auto filter = scope->FindMutableTensor(filter_name);
auto filter_dims = filter->dims();
CHECK_EQ(filter_dims.size(), 4);
auto output_name = op_info->Output("Output").front();
auto strides = op_info->GetAttr<std::vector<int>>("strides");
CHECK_EQ(strides.size(), 2L);
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
  if (groups > 1) {
    LOG(WARNING) << "[APU] only supports groups == 1";
    return FAILED;
  }
bool with_act =
op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
std::string act_type =
with_act ? op_info->GetAttr<std::string>("act_type") : "";
float leaky_relu_alpha = act_type == "leaky_relu"
? op_info->GetAttr<float>("leaky_relu_alpha")
: 0.f;
auto fuse_relu =
op_info->HasAttr("fuse_relu") && op_info->GetAttr<bool>("fuse_relu");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
CHECK_EQ(dilations.size(), 2L);
std::string padding_algorithm =
op_info->HasAttr("padding_algorithm")
? op_info->GetAttr<std::string>("padding_algorithm")
: "";
if (paddings.size() == 2L) {
for (size_t i = 0; i < strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
CHECK_EQ(paddings.size(), 4L)
<< "[APU] Paddings size should be the same or twice as the input size."
<< paddings.size();
operators::UpdatePaddingAndDilation(&paddings,
&dilations,
strides,
padding_algorithm,
input_dims,
filter_dims);
std::vector<int> output_dims;
// Set output_dims: batches
output_dims.push_back(input_dims[0]);
  std::vector<int> output_size;
  if (op_info->HasAttr("output_size")) {
    output_size = op_info->GetAttr<std::vector<int>>("output_size");
  }
  if (output_size.size() == 2) {
    // Set output_dims: height, width
    output_dims.push_back(output_size[0]);
    output_dims.push_back(output_size[1]);
  } else {
    // Compute output size
    for (int i = 0; i < strides.size(); i++) {
      int kernel_ext = filter_dims[i + 2];
      int out_size = (input_dims[i + 2] - 1) * strides[i] + kernel_ext -
                     paddings[i * 2] - paddings[i * 2 + 1];
      output_dims.push_back(out_size);
    }
  }
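  // For example, the loop above maps input 14, stride 2, kernel 4 and
  // paddings 1 + 1 to an output size of (14 - 1) * 2 + 4 - 1 - 1 = 28.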
output_dims.push_back(filter_dims[1]);
CHECK(op_info->HasInputScale(input_name));
auto input_scale = op_info->GetInputScale(input_name)[0];
CHECK(op_info->HasInputScale(filter_name));
auto filter_scale = op_info->GetInputScale(filter_name);
CHECK(op_info->HasOutputScale(output_name));
auto output_scale = op_info->GetOutputScale(output_name)[0];
VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups
<< " ,dilations: " << dilations[0] << ":" << dilations[1];
VLOG(3) << "with_act: " << with_act << " ,act_type: " << act_type;
VLOG(3) << "input_dims: " << input_dims
<< " ,filter_scale size: " << filter_scale.size();
VLOG(3) << "filter_dims(Cin, Cout, H, W): " << filter_dims
<< " ,memory_size: " << filter->memory_size()
<< " ,data_size: " << filter->data_size();
// Add input tensor type
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = input_scale;
inType.zeroPoint = 128;
inType.dimensionCount = input_dims.size();
std::vector<uint32_t> dims_in = {(uint32_t)input_dims[0],
(uint32_t)input_dims[2],
(uint32_t)input_dims[3],
(uint32_t)input_dims[1]};
inType.dimensions = &dims_in[0];
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
VLOG(3) << "Graph has " << input_name;
// Input operand already created by previous OP
input_node = graph->Get(input_name);
} else {
// Add input operand
if (graph->IsInput(input_name)) {
// Insert transpose for NCHW -> NHWC
insert_transpose_node(ctx,
input_name,
"transpose_" + input_name,
{(uint32_t)input_dims[0],
(uint32_t)input_dims[1],
(uint32_t)input_dims[2],
(uint32_t)input_dims[3]},
dims_in,
{0, 2, 3, 1},
inType.scale,
inType.zeroPoint);
// Change input_name because we add transpose op
input_name = "transpose_" + input_name;
input_node = graph->Get(input_name);
if (input_node == nullptr) return subgraph::FAILED;
} else {
NeuronModel_addOperand(model, &inType);
input_node = graph->Add(input_name, dims_in);
}
}
VLOG(3) << "input node idx: " << input_node->index()
<< ": input_scale: " << input_scale
<< ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1]
<< ":" << inType.dimensions[2] << ":" << inType.dimensions[3];
// Add bias type
NeuronOperandType biasType;
// Add filter type
// Relay out filter (Cin,Cout,H,W) -> (depth_out, h, w, depth_in)
Tensor transpose_filter;
std::vector<uint32_t> dims_filter;
transpose_filter.Resize({(uint32_t)filter_dims[1],
(uint32_t)filter_dims[2],
(uint32_t)filter_dims[3],
(uint32_t)filter_dims[0]});
transposeAsym(filter->data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
{(uint32_t)filter_dims[0],
(uint32_t)filter_dims[1],
(uint32_t)filter_dims[2],
(uint32_t)filter_dims[3]},
{1, 2, 3, 0});
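  // transposeAsym presumably both permutes the int8 filter data into the
  // (Cout, H, W, Cin) order given by the {1, 2, 3, 0} axes and shifts each
  // value by +128 into uint8, i.e. symmetric int8 -> asymmetric uint8 with
  // zeroPoint 128.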
dims_filter = {(uint32_t)filter_dims[1],
(uint32_t)filter_dims[2],
(uint32_t)filter_dims[3],
(uint32_t)filter_dims[0]};
NeuronOperandType filterType;
filterType.type = NEURON_TENSOR_QUANT8_ASYMM;
filterType.scale = filter_scale[0];
filterType.zeroPoint = 128;
filterType.dimensionCount = filter_dims.size();
filterType.dimensions = &dims_filter[0];
biasType.scale = inType.scale * filterType.scale;
std::shared_ptr<Node> filter_node = nullptr;
NeuronModel_addOperand(model, &filterType);
filter_node = graph->Add(filter_name, dims_filter);
auto precision = filter->precision();
VLOG(3) << " filter node idx: " << filter_node->index()
<< " filter_scale[0]=" << filter_scale[0]
<< " filter memory_size=" << filter->memory_size()
<< " filter precision=" << PrecisionToStr(precision)
<< " :filterType: " << filterType.dimensions[0] << ":"
<< filterType.dimensions[2] << ":" << filterType.dimensions[2] << ":"
<< filterType.dimensions[3];
memcpy(filter->mutable_data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
filter->memory_size());
// Set filter value
neuron_errCode = NeuronModel_setOperandValue(
model, filter_node->index(), filter->raw_data(), filter->memory_size());
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode;
return subgraph::FAILED;
}
// Add biasType node value
// A 1-D tensor, of shape [depth_out], specifying the bias.
// For filter tensor of NEURON_TENSOR_QUANT8_ASYMM, the bias should be of
// NEURON_TENSOR_INT32 with zeroPoint of 0 and bias_scale ==
// input_scale * filter_scale
biasType.type = NEURON_TENSOR_INT32;
biasType.zeroPoint = 0;
std::vector<uint32_t> dims_bias;
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto channel_size = bias->dims().production();
CHECK_EQ(channel_size, filter_dims[1] * groups);
CHECK_EQ(bias_dims.size(), 1);
biasType.dimensionCount = bias_dims.size();
for (int i = 0; i < bias_dims.size(); i++)
dims_bias.push_back(bias_dims[i]);
biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(bias_name, dims_bias);
VLOG(3) << "node idx: " << bias_node->index()
<< ": Bias name: " << bias_name
<< " ,bias scale: " << biasType.scale
<< " ,dimensions: " << bias_dims
<< " ,channel_size:" << channel_size;
} else {
// Create default bias with value 0
biasType.dimensionCount = 1;
    dims_bias = {(uint32_t)output_dims[3]};  // depth_out (output_dims is NHWC)
biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(filter_name + "_default_bias", dims_bias);
VLOG(3) << "node idx: " << bias_node->index()
<< ": Bias name: default_bias "
<< " ,bias scale: " << biasType.scale
<< " ,dimensions: " << dims_bias.size();
}
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {1};
std::shared_ptr<Node> paddingL_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 3: padding left
paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32);
std::shared_ptr<Node> paddingR_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 4: padding right
paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32);
std::shared_ptr<Node> paddingT_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 5: padding top
paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32);
std::shared_ptr<Node> paddingB_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 6: padding bottom
paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32);
std::shared_ptr<Node> strideW_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 7: stride width
strideW_node = graph->Add(filter_name + "_stride_width", dims_int32);
std::shared_ptr<Node> strideH_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 8: stride height
strideH_node = graph->Add(filter_name + "_stride_height", dims_int32);
std::shared_ptr<Node> fuse_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 9: fuse
fuse_node = graph->Add(filter_name + "_fuse", dims_int32);
  NeuronOperandType boolType;
  boolType.type = NEURON_BOOL;
  boolType.dimensionCount = 0;  // Must be 0 for scalars.

  std::shared_ptr<Node> layout_node = nullptr;
  NeuronModel_addOperand(model, &boolType);  // Operand 10: NCHW layout flag
  layout_node = graph->Add(filter_name + "_layout", dims_int32);
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = output_scale;
outType.zeroPoint = 128;
outType.dimensionCount = output_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)output_dims[0],
(uint32_t)output_dims[1],
(uint32_t)output_dims[2],
(uint32_t)output_dims[3]};
outType.dimensions = &dims_out[0];
std::shared_ptr<Node> output_node = nullptr;
if (graph->Has(output_name)) {
output_node = graph->Get(output_name);
} else {
if (graph->IsOutput(output_name)) {
NeuronModel_addOperand(model, &outType);
output_node = graph->Add("transpose_" + output_name, dims_out);
} else {
NeuronModel_addOperand(model, &outType);
output_node = graph->Add(output_name, dims_out);
}
}
VLOG(3) << "output node idx: " << output_node->index()
<< ": output_scale: " << outType.scale
<< " ,outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Add bias value
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
int32_t *int32_bias_data =
reinterpret_cast<int32_t *>(bias->mutable_data<float>());
float2int32(
bias->data<float>(), input_scale, filter_scale, int32_bias_data);
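    // float2int32 presumably quantizes each float bias value b as
    // round(b / (input_scale * filter_scale[0])), matching
    // biasType.scale = inType.scale * filterType.scale set above.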
VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << ":"
<< int32_bias_data[1] << ":" << int32_bias_data[2] << ":"
<< int32_bias_data[3];
neuron_errCode = NeuronModel_setOperandValue(
model, bias_node->index(), bias->raw_data(), bias->memory_size());
} else {
auto int32_bias = std::make_shared<Tensor>();
int32_bias->Resize({1, output_dims[3]});
int32_bias->mutable_data<int32_t>();
VLOG(3) << "bais_default: " << int32_bias->memory_size();
memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size());
neuron_errCode = NeuronModel_setOperandValue(model,
bias_node->index(),
int32_bias->raw_data(),
int32_bias->memory_size());
bias_node->set_data(int32_bias);
}
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Set bias operand value fail:" << neuron_errCode;
return subgraph::FAILED;
}
VLOG(3) << "paddings: " << paddings[0] << ":" << paddings[1] << ":"
<< paddings[2] << ":" << paddings[3];
// Add padding value
int32_t padding_val[1];
padding_val[0] = paddings[2];
NeuronModel_setOperandValue(
model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[3];
NeuronModel_setOperandValue(
model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[0];
NeuronModel_setOperandValue(
model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[1];
NeuronModel_setOperandValue(
model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1);
VLOG(3) << " stride width:" << strides[1] << " height:" << strides[0];
// Add Stride
int32_t stride_val[1];
stride_val[0] = strides[1]; // entry 1: width stride
NeuronModel_setOperandValue(
model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
stride_val[0] = strides[0]; // entry 0: height stride
NeuronModel_setOperandValue(
model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
int32_t fuse_val[1] = {NEURON_FUSED_NONE};
if (act_type == "relu") {
fuse_val[0] = NEURON_FUSED_RELU;
} else if (act_type == "relu1") {
fuse_val[0] = NEURON_FUSED_RELU1;
} else if (act_type == "relu6") {
fuse_val[0] = NEURON_FUSED_RELU6;
} else if (!act_type.empty()) {
fuse_val[0] = NEURON_FUSED_NONE;
LOG(WARNING) << "Support act_type: " << act_type;
return FAILED;
}
NeuronModel_setOperandValue(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
bool layout_val[] = {false};
NeuronModel_setOperandValue(
model, layout_node->index(), layout_val, sizeof(bool) * 1);
std::vector<uint32_t> addInIndex = {
input_node->index(), // 0: input
filter_node->index(), // 1: filter
bias_node->index(), // 2: bias
paddingL_node->index(), // 3: padding left
paddingR_node->index(), // 4: padding right
paddingT_node->index(), // 5: padding top
paddingB_node->index(), // 6: padding bottom
strideW_node->index(), // 7: stride width
strideH_node->index(), // 8: stride height
fuse_node->index(), // 9: fuse
layout_node->index()}; // 10: layout
std::vector<uint32_t> addOutIndex = {output_node->index()};
neuron_errCode = NeuronModel_addOperation(model,
NEURON_TRANSPOSE_CONV_2D,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Add op fail:" << op_type;
return FAILED;
}
if (graph->IsOutput(output_name)) {
// Insert transpose for NHWC -> NCHW
insert_transpose_node(ctx,
"transpose_" + output_name,
output_name,
dims_out,
{(uint32_t)output_dims[0],
(uint32_t)output_dims[1],
(uint32_t)output_dims[2],
(uint32_t)output_dims[3]},
{0, 3, 1, 2},
outType.scale,
outType.zeroPoint);
output_node = graph->Get(output_name);
if (output_node == nullptr) return subgraph::FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(conv2d_transpose,
kAPU,
paddle::lite::subgraph::apu::ConvTransposeConverter);
@@ -29,28 +29,252 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  int neuron_errCode;
  VLOG(3) << "[APU] Converting [" + op_type + "]";

  // Get input and output vars and op attributes
  auto x_name = op_info->Input("X").front();
  auto x = scope->FindTensor(x_name);
  auto x_dims = x->dims();
  auto y_name = op_info->Input("Y").front();
  auto y = scope->FindTensor(y_name);
  auto y_dims = y->dims();
  auto out_name = op_info->Output("Out").front();
  auto out = scope->FindTensor(out_name);
  auto out_dims = out->dims();
  auto axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
auto x_shape = x_dims.Vectorize();
auto y_shape = y_dims.Vectorize();
// Two dimensions are compatible when:
// 1. they are equal, or
// 2. one of them is 1
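  // e.g. with axis = 1, x_dims = [1, 32, 28, 28] accepts y_dims = [32, 28, 28]
  // or [32, 1, 1], but rejects [16, 28, 28].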
for (int i = axis; i < x_shape.size(); i++) {
if (x_dims[i] != y_dims[i - axis]) {
// Input 1 compatible dimensions as input0
if (y_dims[i - axis] != 1) {
LOG(WARNING) << i << ":" << axis << ":" << y_dims[i - axis];
return FAILED;
}
}
} // End of for
int32_t fuse_val[1] = {NEURON_FUSED_NONE};
  // Act node
  if (op_type == "fusion_elementwise_add_activation" ||
      op_type == "fusion_elementwise_sub_activation" ||
      op_type == "fusion_elementwise_mul_activation" ||
      op_type == "fusion_elementwise_div_activation") {
    auto act_type = op_info->GetAttr<std::string>("act_type");
if (act_type == "relu") {
fuse_val[0] = NEURON_FUSED_RELU;
} else if (act_type == "relu1") {
fuse_val[0] = NEURON_FUSED_RELU1;
} else if (act_type == "relu6") {
fuse_val[0] = NEURON_FUSED_RELU6;
} else if (!act_type.empty()) {
fuse_val[0] = NEURON_FUSED_NONE;
LOG(WARNING) << "Support act_type: " << act_type;
return FAILED;
}
} // End of if
VLOG(3) << "x_name" << x_name;
CHECK(op_info->HasInputScale(x_name));
auto x_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasInputScale(y_name));
auto y_scale = op_info->GetInputScale(y_name)[0];
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
// Add x tensor type
NeuronOperandType xType;
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
xType.scale = x_scale;
xType.zeroPoint = 128;
xType.dimensionCount = x_dims.size();
std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3],
(uint32_t)x_dims[1]};
xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
VLOG(3) << "Graph has " << x_name;
if (graph->IsInput(x_name)) {
VLOG(3) << x_name << "is input and already exist";
x_name = "transpose_" + x_name;
}
if (graph->IsOutput(x_name)) {
VLOG(3) << x_name << "is input and output node";
x_name = "transpose_" + x_name;
}
x_node = graph->Get(x_name);
} else {
if (graph->IsInput(x_name)) {
insert_transpose_node(ctx,
x_name,
"transpose_" + x_name,
{(uint32_t)x_dims[0],
(uint32_t)x_dims[1],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3]},
dims_x,
{0, 2, 3, 1},
xType.scale,
xType.zeroPoint);
// Change x name after insert transpose op for x data relayout
x_name = "transpose_" + x_name;
x_node = graph->Get(x_name);
} else {
NeuronModel_addOperand(model, &xType);
x_node = graph->Add(x_name, dims_x);
}
} // End of else
VLOG(3) << "x node idx: " << x_node->index() << "x_dims: " << x_dims
<< ": x_scale: " << x_scale << ", xType: " << xType.dimensions[0]
<< ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":"
<< xType.dimensions[3];
// Add y tensor type
NeuronOperandType yType;
yType.type = NEURON_TENSOR_QUANT8_ASYMM;
yType.scale = y_scale;
yType.zeroPoint = 128;
yType.dimensionCount = y_dims.size();
std::vector<uint32_t> dims_y = {(uint32_t)y_dims[0],
(uint32_t)y_dims[2],
(uint32_t)y_dims[3],
(uint32_t)y_dims[1]};
yType.dimensions = &dims_y[0];
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
VLOG(3) << "Graph has " << y_name;
y_node = graph->Get(y_name);
} else {
if (graph->IsInput(y_name)) {
insert_transpose_node(ctx,
y_name,
"transpose_" + y_name,
{(uint32_t)y_dims[0],
(uint32_t)y_dims[1],
(uint32_t)y_dims[2],
(uint32_t)y_dims[3]},
dims_y,
{0, 2, 3, 1},
yType.scale,
yType.zeroPoint);
y_name = "transpose_" + y_name;
y_node = graph->Get(y_name);
} else {
NeuronModel_addOperand(model, &yType);
y_node = graph->Add(y_name, dims_y);
}
}
VLOG(3) << "y node idx: " << y_node->index() << "y_dims: " << y_dims
<< ": y_scale: " << y_scale << ", yType: " << yType.dimensions[0]
<< ":" << yType.dimensions[1] << ":" << yType.dimensions[2] << ":"
<< yType.dimensions[3];
// Add fuse operand type
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {1};
// Add fuse operand
std::shared_ptr<Node> fuse_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 2: fuse
fuse_node = graph->Add(out_name + "_fuse", dims_int32);
// Add out tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = out_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3],
(uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0];
std::shared_ptr<Node> out_node = nullptr;
if (graph->Has(out_name)) {
VLOG(3) << "Graph has " << out_name;
out_node = graph->Get(out_name);
} else {
if (graph->IsOutput(out_name)) {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add("transpose_" + out_name, dims_out);
} else {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add(out_name, dims_out);
}
}
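// When out is a graph output, the ADD result is produced in NHWC under the
// "transpose_" prefix and transposed back to NCHW after the op is added (see
// the insert_transpose_node call below).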
VLOG(3) << "out node idx: " << out_node->index() << ", out_dims: " << out_dims
<< ", out_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Set fuse value
NeuronModel_setOperandValue(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = {
x_node->index(), // 0: A tensor
y_node->index(), // 1: A tensor of the same OperandCode,
// and compatible dimensions as input 0
fuse_node->index()}; // 2: fuse
std::vector<uint32_t> addOutIndex = {out_node->index()};
if (op_type == "elementwise_add" ||
op_type == "fusion_elementwise_add_activation") {
neuron_errCode = NeuronModel_addOperation(model,
NEURON_ADD,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
} else {
LOG(WARNING) << "[APU] Unsupported op type: " << op_type;
return FAILED;
}
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "ADD op fail:" << op_type;
return FAILED;
}
if (graph->IsOutput(out_name)) {
// Insert transpose for NHWC -> NCHW
insert_transpose_node(ctx,
"transpose_" + out_name,
out_name,
dims_out,
{(uint32_t)out_dims[0],
(uint32_t)out_dims[1],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3]},
{0, 3, 1, 2},
outType.scale,
outType.zeroPoint);
out_node = graph->Get(out_name);
if (out_node == nullptr) return FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
@@ -67,3 +291,6 @@ REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
                         kAPU,
                         paddle::lite::subgraph::apu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation,
kAPU,
paddle::lite::subgraph::apu::ElementwiseConverter);
@@ -77,12 +77,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
inType.dimensions = &dims_in[0];
std::shared_ptr<Node> in_node = nullptr;
if (graph->Has(input_name)) {
in_node = graph->Get(input_name);
VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index();
} else {
NeuronModel_addOperand(model, &inType);  // Operand 0: input
in_node = graph->Add(input_name, dims_in);
}
VLOG(3) << "input_scale: " << input_scale
@@ -97,7 +95,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
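// Note: dims_w below swaps w_dims to (w_dims[1], w_dims[0]); this matches the
// usual NNAPI-style FULLY_CONNECTED weight layout of [num_units, input_size]
// (our reading, inferred from the swap rather than stated in this diff).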
wType.dimensionCount = w_dims.size();
std::vector<uint32_t> dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]};
wType.dimensions = &dims_w[0];
NeuronModel_addOperand(model, &wType);  // Operand 1: weight
std::shared_ptr<Node> w_node = nullptr;
w_node = graph->Add(w_name, dims_w);
VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0]
@@ -119,7 +117,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
biasType.dimensionCount = bias_dims.size();
std::vector<uint32_t> dims_bias = {(uint32_t)bias_dims[0]};
biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType);  // Operand 2: bias
bias_node = graph->Add(bias_name, dims_bias);
VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims
<< ", bias scale: " << biasType.scale
@@ -128,7 +126,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
biasType.dimensionCount = 1;
std::vector<uint32_t> dims_bias = {(uint32_t)n};
biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType);  // Operand 2: bias
bias_node = graph->Add(w_name + "_default_bias", dims_bias);
}
@@ -137,7 +135,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
fuseType.type = NEURON_INT32;
fuseType.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {0};
NeuronModel_addOperand(model, &fuseType);  // Operand 3: fuse
std::shared_ptr<Node> fuse_node = nullptr;
fuse_node = graph->Add(w_name + "_fuse", dims_int32);
@@ -147,12 +145,13 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = 2;
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
                                  (uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0];
VLOG(3) << "out_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << " : "
<< outType.dimensions[1];
NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_out);
@@ -190,29 +189,31 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronModel_setOperandValue(model,
                            bias_node->index(),
                            bias->raw_data(),
                            bias->memory_size());  // Operand 2: bias
} else {
auto int32_bias = std::make_shared<Tensor>();
int32_bias->Resize({1, out_dims[1]});
int32_bias->mutable_data<int32_t>();
memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size());
VLOG(3) << "default: " << int32_bias->memory_size();
neuron_errCode = NeuronModel_setOperandValue(
    model,
    bias_node->index(),
    int32_bias->raw_data(),
    int32_bias->memory_size());  // Operand 2: bias
bias_node->set_data(int32_bias);
}
// Add fuse value
int32_t fuse_val[1] = {0};
NeuronModel_setOperandValue(model,
                            fuse_node->index(),
                            fuse_val,
                            sizeof(int32_t) * 1);  // Operand 3: fuse
std::vector<uint32_t> addInIndex = {in_node->index(),     // 0: input
                                    w_node->index(),      // 1: weight
                                    bias_node->index(),   // 2: bias
                                    fuse_node->index()};  // 3: fuse
std::vector<uint32_t> addOutIndex = {out_node->index()};
neuron_errCode = NeuronModel_addOperation(model,
                                          NEURON_FULLY_CONNECTED,
...
@@ -28,7 +28,7 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
LOG(FATAL) << "[APU] Node " << name << " is redefined.";
return -1;
} else {
VLOG(5) << " Add: " << name << " : " << node->index();
auto ret = nodes_.insert(
    std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second);
...
@@ -22,3 +22,6 @@ USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU);
USE_SUBGRAPH_BRIDGE(fc, kAPU);
USE_SUBGRAPH_BRIDGE(pool2d, kAPU);
USE_SUBGRAPH_BRIDGE(softmax, kAPU);
USE_SUBGRAPH_BRIDGE(concat, kAPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kAPU);
USE_SUBGRAPH_BRIDGE(conv2d_transpose, kAPU);
@@ -47,14 +47,14 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
// Check pool mode
if ((pooling_type == "max") || (pooling_type == "avg")) {
} else {
LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type;
return FAILED;
}
// Check padding mode
int pad_mode = 0;
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
@@ -66,7 +66,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
pad_mode = 5;
}
// Check paddings and strides
if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
@@ -107,60 +107,59 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
VLOG(3) << "Graph has " << x_name;
x_node = graph->Get(x_name);
} else {
NeuronModel_addOperand(model, &xType);  // Operand 0: x
x_node = graph->Add(x_name, dims_x);
}
VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0] << ":"
<< xType.dimensions[1] << ":" << xType.dimensions[2] << ":"
<< xType.dimensions[3];
VLOG(3) << "ksize:" << ksize[0] << ":" << ksize[1];
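// The scalar operands added below follow the NNAPI-style explicit-padding
// pooling signature: input, pad left/right/top/bottom, stride w/h,
// filter w/h, fuse code (our reading of the operand comments that follow).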
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {0};
std::shared_ptr<Node> paddingL_node = nullptr;
NeuronModel_addOperand(model, &int32Type);  // Operand 1: padding left
paddingL_node = graph->Add(x_name + "_padding_left", dims_int32);
std::shared_ptr<Node> paddingR_node = nullptr;
NeuronModel_addOperand(model, &int32Type);  // Operand 2: padding right
paddingR_node = graph->Add(x_name + "_padding_right", dims_int32);
std::shared_ptr<Node> paddingT_node = nullptr;
NeuronModel_addOperand(model, &int32Type);  // Operand 3: padding top
paddingT_node = graph->Add(x_name + "_padding_top", dims_int32);
std::shared_ptr<Node> paddingB_node = nullptr;
NeuronModel_addOperand(model, &int32Type);  // Operand 4: padding bottom
paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32);
std::shared_ptr<Node> strideW_node = nullptr;
NeuronModel_addOperand(model, &int32Type);  // Operand 5: stride width
strideW_node = graph->Add(x_name + "_stride_width", dims_int32);
std::shared_ptr<Node> strideH_node = nullptr;
NeuronModel_addOperand(model, &int32Type);  // Operand 6: stride height
strideH_node = graph->Add(x_name + "_stride_height", dims_int32);
std::shared_ptr<Node> filterW_node = nullptr;
NeuronModel_addOperand(model, &int32Type);  // Operand 7: filter width
filterW_node = graph->Add(x_name + "_filter_width", dims_int32);
std::shared_ptr<Node> filterH_node = nullptr;
NeuronModel_addOperand(model, &int32Type);  // Operand 8: filter height
filterH_node = graph->Add(x_name + "_filter_height", dims_int32);
std::shared_ptr<Node> fuse_node = nullptr;
NeuronModel_addOperand(model, &int32Type);  // Operand 9: fuse
fuse_node = graph->Add(x_name + "_pool_fuse", dims_int32);
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
@@ -176,10 +175,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (graph->Has(out_name)) {
out_node = graph->Get(out_name);
} else {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add(out_name, dims_out);
}
VLOG(3) << "output_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
@@ -201,19 +200,21 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add Stride
int32_t stride_val[1];
stride_val[0] = strides[1];  // Entry 1: width stride
NeuronModel_setOperandValue(
    model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
stride_val[0] = strides[0];  // Entry 0: height stride
NeuronModel_setOperandValue(
    model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
// Add filter
int32_t filter_val[1];
filter_val[0] =
    global_pooling ? x_dims[3] : ksize[1];  // Entry 1: filter width
NeuronModel_setOperandValue(
    model, filterW_node->index(), filter_val, sizeof(int32_t) * 1);
filter_val[0] =
    global_pooling ? x_dims[2] : ksize[0];  // Entry 0: filter height
NeuronModel_setOperandValue(
    model, filterH_node->index(), filter_val, sizeof(int32_t) * 1);
...
@@ -64,12 +64,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index();
} else {
NeuronModel_addOperand(model, &xType);  // Operand 0: input
x_node = graph->Add(x_name, dims_x);
}
VLOG(3) << "input_scale size: " << input_scale
@@ -80,7 +78,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType betaType;
betaType.type = NEURON_FLOAT32;
betaType.dimensionCount = 0;
NeuronModel_addOperand(model, &betaType);  // Operand 1: beta
std::shared_ptr<Node> beta_node = nullptr;
beta_node = graph->Add(x_name + "_beta", dims_int32);
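// beta is the scaling factor applied to the logits (operand 1 of
// NEURON_SOFTMAX); the constant value itself is set in a part of the
// converter elided from this hunk.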
@@ -88,7 +86,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType axisType;
axisType.type = NEURON_INT32;
axisType.dimensionCount = 0;
NeuronModel_addOperand(model, &axisType);  // Operand 2: axis
std::shared_ptr<Node> axis_node = nullptr;
axis_node = graph->Add(x_name + "_axis", dims_int32);
@@ -99,7 +97,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
outType.zeroPoint = 128;
outType.dimensionCount = x_dims.size();
outType.dimensions = &dims_x[0];
NeuronModel_addOperand(model, &outType);  // Operand 3: output
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_x);
VLOG(3) << "out_scale: " << out_scale;
@@ -112,8 +110,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis_val[0] = axis;
NeuronModel_setOperandValue(
    model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = {x_node->index(),      // 0: input
                                    beta_node->index(),   // 1: beta
                                    axis_node->index()};  // 2: axis
std::vector<uint32_t> addOutIndex = {out_node->index()};
int neuron_errCode = NeuronModel_addOperation(model,
                                              NEURON_SOFTMAX,
...
@@ -39,22 +39,43 @@ bool HasInputArg(const OpInfo* op_info,
}
}

int insert_requant_node(void* ctx,
                        const std::string& input_name,
                        const std::string& output_name,
                        std::vector<uint32_t> input_shape,
                        std::vector<uint32_t> output_shape,
                        float scale_in,
                        float scale_out,
                        int32_t zeroPoint) {
int neuron_errCode;
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
uint32_t numDevices = 0;
CHECK_EQ(Neuron_getDeviceCount(&numDevices), NEURON_NO_ERROR);
CHECK_GT(numDevices, (uint32_t)0);
NeuronDevice* targetDevice = nullptr;
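// Enumerate the available Neuron devices and pick the MediaTek DSP
// ("mtk-dsp"), the device expected to accept the MTK_REQUANTIZE extension op
// registered below.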
for (uint32_t i = 0; i < numDevices; ++i) {
NeuronDevice* device = nullptr;
Neuron_getDevice(i, &device);
const char* name;
NeuronDevice_getName(device, &name);
if (0 == strcmp(name, "mtk-dsp")) {
targetDevice = device;
break;
}
}
if (targetDevice == nullptr) {
LOG(FATAL) << "Insert mtk_requant op fail!";
return -1;
}
// Add input
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = scale_in;
inType.zeroPoint = zeroPoint;
inType.dimensionCount = input_shape.size();
inType.dimensions = &input_shape[0];
@@ -64,15 +85,81 @@ void insert_transpose_node(void* ctx,
VLOG(3) << "Has " << input_name;
input_node = graph->Get(input_name);
} else {
neuron_errCode = NeuronModel_addOperand(model, &inType);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert mtk_requant op fail!";
return -1;
}
VLOG(3) << "Add " << input_name;
input_node = graph->Add(input_name, input_shape);
}
// Add output
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = scale_out;
outType.zeroPoint = zeroPoint;
outType.dimensionCount = output_shape.size();
outType.dimensions = &output_shape[0];
NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> output_node = nullptr;
output_node = graph->Add(output_name, output_shape);
std::vector<uint32_t> addInIndex = {input_node->index()};
std::vector<uint32_t> addOutIndex = {output_node->index()};
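// MTK_REQUANTIZE is a MediaTek vendor extension op registered through
// NeuronModel_addOperationExtension; judging by the operand types above, it
// rescales a quant8 tensor from scale_in to scale_out without changing shape.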
neuron_errCode = NeuronModel_addOperationExtension(model,
"MTK_REQUANTIZE",
"mediatek",
targetDevice,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert mtk_requant op fail!";
return -1;
}
return 0;
}
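// A minimal usage sketch of insert_requant_node (hypothetical tensor names
// and scales, for illustration only):
//   insert_requant_node(ctx, "conv_out", "requant_conv_out",
//                       dims, dims, /*scale_in=*/0.5f, /*scale_out=*/0.25f,
//                       /*zeroPoint=*/128);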
int insert_transpose_node(void* ctx,
const std::string& input_name,
const std::string& output_name,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape,
std::vector<int32_t> axis,
float scale,
int32_t zeroPoint) {
int neuron_errCode;
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
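// Emits a transpose op: the input tensor plus a constant 4-element perm
// operand (axis) produce an output with output_shape; used to relayout
// between NCHW and NHWC at subgraph boundaries.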
// Add input
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = scale;
inType.zeroPoint = zeroPoint;
inType.dimensionCount = input_shape.size();
inType.dimensions = &input_shape[0];
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
VLOG(5) << "Has " << input_name;
input_node = graph->Get(input_name);
} else {
neuron_errCode = NeuronModel_addOperand(model, &inType);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert transpose op fail!";
return -1;
}
VLOG(5) << "Add " << input_name;
input_node = graph->Add(input_name, input_shape);
}
// Add perm
NeuronOperandType permsType;
permsType.type = NEURON_TENSOR_INT32;
@@ -80,22 +167,22 @@ void insert_transpose_node(void* ctx,
uint32_t dims_perms[1] = {4};
permsType.dimensions = dims_perms;
neuron_errCode = NeuronModel_addOperand(model, &permsType);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert transpose op fail!";
return -1;
}
std::shared_ptr<Node> perms_node = nullptr;
perms_node = graph->Add(input_name + "_perms", {4});
VLOG(5) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":"
<< axis[3];
neuron_errCode = NeuronModel_setOperandValue(
    model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size());
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert transpose op fail!";
return -1;
}
// Add output
@@ -106,7 +193,7 @@ void insert_transpose_node(void* ctx,
outType.dimensionCount = output_shape.size();
outType.dimensions = &output_shape[0];
NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> output_node = nullptr;
output_node = graph->Add(output_name, output_shape);
@@ -123,8 +210,10 @@ void insert_transpose_node(void* ctx,
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert transpose op fail!";
}
return 0;
}
void transpose(const int8_t* input_data,
@@ -135,9 +224,9 @@ void transpose(const int8_t* input_data,
int new_index = -1;
int dim[4] = {0};
std::vector<uint32_t> shape = input_shape;
VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
<< ":" << input_shape[3];
VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
@@ -164,9 +253,9 @@ void transposeAsym(const int8_t* input_data,
int new_index = -1;
int dim[4] = {0};
std::vector<uint32_t> shape = input_shape;
VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
<< ":" << input_shape[3];
VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
@@ -177,8 +266,8 @@ void transposeAsym(const int8_t* input_data,
dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
// Per-layer quantized ops are asymmetric here, so shift the int8 value by 128
output_data[new_index] = input_data[old_index] + 128;
}
}
}
...
@@ -33,14 +33,23 @@ bool HasInputArg(const OpInfo* op_info,
                 const Scope* scope,
                 const std::string& argname);

int insert_requant_node(void* ctx,
                        const std::string& input_name,
                        const std::string& output_name,
                        std::vector<uint32_t> input_shape,
                        std::vector<uint32_t> output_shape,
                        float scale_in,
                        float scale_out,
                        int32_t zeroPoint);
int insert_transpose_node(void* ctx,
const std::string& input_name,
const std::string& output_name,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape,
std::vector<int32_t> axis,
float scale,
int32_t zeroPoint);
void transpose(const int8_t* input_data,
               uint8_t* output_data,
...
@@ -33,6 +33,14 @@ bool SubgraphEngine::BuildDeviceProgram() {
BuildOriginProgram();
}
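// Microsecond wall-clock timer; defined ahead of model creation (it used to
// sit just before Build) so both the create and build phases can be timed.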
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
unsigned int version;
Neuron_getVersion(&version);
VLOG(3) << "Neuron Adapter version: " << version;
@@ -108,18 +116,16 @@ bool SubgraphEngine::BuildDeviceProgram() {
}
VLOG(3) << "[APU] APU NIR model created!";
VLOG(1) << "[APU] APU NIR model created, Create cost "
<< GetCurrentUS() - start_time << " us";

start_time = GetCurrentUS();
compilation_ = lite::apu::Device::Global().Build(model_);
if (compilation_ == nullptr) {
LOG(WARNING) << "[APU] Build APU DLA model failed!";
return false;
}
VLOG(1) << "[APU] APU DLA model created, Build cost "
<< GetCurrentUS() - start_time << " us";
return true;
}
@@ -176,7 +182,7 @@ bool SubgraphEngine::LaunchDeviceProgram() {
}
}
NeuronExecution_free(run);
VLOG(1) << "[APU] Process cost " << GetCurrentUS() - start_time << " us";
return true;
}
...