diff --git a/lite/backends/apu/neuron_adapter.cc b/lite/backends/apu/neuron_adapter.cc index 953c92d1828848bd030a65cb2a8af0eac0674ca1..ff08507504b8bd7e5342c5705afb17550f37469e 100644 --- a/lite/backends/apu/neuron_adapter.cc +++ b/lite/backends/apu/neuron_adapter.cc @@ -82,16 +82,20 @@ void NeuronAdapter::InitFunctions() { PADDLE_DLSYM(NeuronModel_setOperandValue); PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams); PADDLE_DLSYM(NeuronModel_addOperation); + PADDLE_DLSYM(NeuronModel_addOperationExtension); PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs); PADDLE_DLSYM(NeuronCompilation_create); PADDLE_DLSYM(NeuronCompilation_free); PADDLE_DLSYM(NeuronCompilation_finish); + PADDLE_DLSYM(NeuronCompilation_createForDevices); PADDLE_DLSYM(NeuronExecution_create); PADDLE_DLSYM(NeuronExecution_free); PADDLE_DLSYM(NeuronExecution_setInput); PADDLE_DLSYM(NeuronExecution_setOutput); PADDLE_DLSYM(NeuronExecution_compute); - + PADDLE_DLSYM(Neuron_getDeviceCount); + PADDLE_DLSYM(Neuron_getDevice); + PADDLE_DLSYM(NeuronDevice_getName); #undef PADDLE_DLSYM } @@ -146,6 +150,25 @@ int NeuronModel_addOperation(NeuronModel* model, model, type, inputCount, inputs, outputCount, outputs); } +int NeuronModel_addOperationExtension(NeuronModel* model, + const char* name, + const char* vendor, + const NeuronDevice* device, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs) { + return paddle::lite::NeuronAdapter::Global() + ->NeuronModel_addOperationExtension()(model, + name, + vendor, + device, + inputCount, + inputs, + outputCount, + outputs); +} + int NeuronModel_identifyInputsAndOutputs(NeuronModel* model, uint32_t inputCount, const uint32_t* inputs, @@ -172,6 +195,15 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) { compilation); } +int NeuronCompilation_createForDevices(NeuronModel* model, + const NeuronDevice* const* devices, + uint32_t numDevices, + NeuronCompilation** compilation) { + return paddle::lite::NeuronAdapter::Global() + ->NeuronCompilation_createForDevices()( + model, devices, numDevices, compilation); +} + int NeuronExecution_create(NeuronCompilation* compilation, NeuronExecution** execution) { return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()( @@ -205,3 +237,18 @@ int NeuronExecution_compute(NeuronExecution* execution) { return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()( execution); } + +int Neuron_getDeviceCount(uint32_t* numDevices) { + return paddle::lite::NeuronAdapter::Global()->Neuron_getDeviceCount()( + numDevices); +} + +int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device) { + return paddle::lite::NeuronAdapter::Global()->Neuron_getDevice()(devIndex, + device); +} + +int NeuronDevice_getName(const NeuronDevice* device, const char** name) { + return paddle::lite::NeuronAdapter::Global()->NeuronDevice_getName()(device, + name); +} diff --git a/lite/backends/apu/neuron_adapter.h b/lite/backends/apu/neuron_adapter.h index c08db73279ea3969300c8f298016a976e30a7ac4..c1b9669a98626699b126913dcc840906de4de8e0 100644 --- a/lite/backends/apu/neuron_adapter.h +++ b/lite/backends/apu/neuron_adapter.h @@ -42,12 +42,25 @@ class NeuronAdapter final { const uint32_t *, uint32_t, const uint32_t *); + using NeuronModel_addOperationExtension_Type = int (*)(NeuronModel *, + const char *, + const char *, + const NeuronDevice *, + uint32_t, + const uint32_t *, + uint32_t, + const uint32_t *); using NeuronModel_identifyInputsAndOutputs_Type = int (*)( NeuronModel *, uint32_t, const 
uint32_t *, uint32_t, const uint32_t *); using NeuronCompilation_create_Type = int (*)(NeuronModel *, NeuronCompilation **); using NeuronCompilation_free_Type = void (*)(NeuronCompilation *); using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *); + using NeuronCompilation_createForDevices_Type = + int (*)(NeuronModel *, + const NeuronDevice *const *, + uint32_t, + NeuronCompilation **); using NeuronExecution_create_Type = int (*)(NeuronCompilation *, NeuronExecution **); using NeuronExecution_free_Type = void (*)(NeuronExecution *); @@ -59,6 +72,10 @@ class NeuronAdapter final { using NeuronExecution_setOutput_Type = int (*)( NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t); using NeuronExecution_compute_Type = int (*)(NeuronExecution *); + using Neuron_getDeviceCount_Type = int (*)(uint32_t *); + using Neuron_getDevice_Type = int (*)(uint32_t, NeuronDevice **); + using NeuronDevice_getName_Type = int (*)(const NeuronDevice *, + const char **); Neuron_getVersion_Type Neuron_getVersion() { CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!"; @@ -105,6 +122,12 @@ class NeuronAdapter final { return NeuronModel_addOperation_; } + NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() { + CHECK(NeuronModel_addOperationExtension_ != nullptr) + << "Cannot load NeuronModel_addOperationExtension!"; + return NeuronModel_addOperationExtension_; + } + NeuronModel_identifyInputsAndOutputs_Type NeuronModel_identifyInputsAndOutputs() { CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr) @@ -130,6 +153,12 @@ class NeuronAdapter final { return NeuronCompilation_finish_; } + NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() { + CHECK(NeuronCompilation_createForDevices_ != nullptr) + << "Cannot load NeuronCompilation_createForDevices!"; + return NeuronCompilation_createForDevices_; + } + NeuronExecution_create_Type NeuronExecution_create() { CHECK(NeuronExecution_create_ != nullptr) << "Cannot load NeuronExecution_create!"; @@ -160,6 +189,23 @@ class NeuronAdapter final { return NeuronExecution_compute_; } + Neuron_getDeviceCount_Type Neuron_getDeviceCount() { + CHECK(Neuron_getDeviceCount_ != nullptr) + << "Cannot load Neuron_getDeviceCount!"; + return Neuron_getDeviceCount_; + } + + Neuron_getDevice_Type Neuron_getDevice() { + CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!"; + return Neuron_getDevice_; + } + + NeuronDevice_getName_Type NeuronDevice_getName() { + CHECK(NeuronDevice_getName_ != nullptr) + << "Cannot load NeuronDevice_getName!"; + return NeuronDevice_getName_; + } + private: NeuronAdapter(); NeuronAdapter(const NeuronAdapter &) = delete; @@ -176,16 +222,23 @@ class NeuronAdapter final { NeuronModel_setOperandSymmPerChannelQuantParams_Type NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr}; NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr}; + NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension_{ + nullptr}; NeuronModel_identifyInputsAndOutputs_Type NeuronModel_identifyInputsAndOutputs_{nullptr}; NeuronCompilation_create_Type NeuronCompilation_create_{nullptr}; NeuronCompilation_free_Type NeuronCompilation_free_{nullptr}; NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr}; + NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{ + nullptr}; NeuronExecution_create_Type NeuronExecution_create_{nullptr}; NeuronExecution_free_Type NeuronExecution_free_{nullptr}; NeuronExecution_setInput_Type 
NeuronExecution_setInput_{nullptr}; NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr}; NeuronExecution_compute_Type NeuronExecution_compute_{nullptr}; + Neuron_getDeviceCount_Type Neuron_getDeviceCount_{nullptr}; + Neuron_getDevice_Type Neuron_getDevice_{nullptr}; + NeuronDevice_getName_Type NeuronDevice_getName_{nullptr}; }; } // namespace lite } // namespace paddle
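Together with NeuronCompilation_createForDevices, the three new device-query entry points let a caller pin compilation to a specific accelerator instead of relying on the runtime's default placement. A minimal sketch of the intended call pattern, using only the signatures loaded above; the device name "mtk-mdla" is a placeholder assumption, since real names depend on the platform's Neuron runtime (strcmp needs <cstring>):

  NeuronDevice* target = nullptr;
  uint32_t num_devices = 0;
  Neuron_getDeviceCount(&num_devices);
  for (uint32_t i = 0; i < num_devices; ++i) {
    NeuronDevice* device = nullptr;
    const char* name = nullptr;
    Neuron_getDevice(i, &device);
    NeuronDevice_getName(device, &name);
    if (name != nullptr && strcmp(name, "mtk-mdla") == 0) {  // placeholder name
      target = device;
      break;
    }
  }
  NeuronCompilation* compilation = nullptr;
  if (target != nullptr) {
    NeuronCompilation_createForDevices(model, &target, 1, &compilation);
  } else {
    NeuronCompilation_create(model, &compilation);  // fall back to default device selection
  }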
diff --git a/lite/kernels/apu/bridges/CMakeLists.txt b/lite/kernels/apu/bridges/CMakeLists.txt old mode 100644 new mode 100755 index 0b42af5a6fe79bbb8417c2a6a37a86c30f4a0f8b..609bf1b4b345f8eb7d14b9bb3291e6bc5bad2293 --- a/lite/kernels/apu/bridges/CMakeLists.txt +++ b/lite/kernels/apu/bridges/CMakeLists.txt @@ -14,6 +14,8 @@ lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_br lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_concat_op_apu SRCS concat_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_conv_transpose_op_apu SRCS conv_transpose_op.cc DEPS ${apu_subgraph_bridge_deps}) set(apu_subgraph_bridges @@ -25,6 +27,8 @@ set(apu_subgraph_bridges subgraph_bridge_softmax_op_apu subgraph_bridge_fc_op_apu subgraph_bridge_pool_op_apu + subgraph_bridge_conv_transpose_op_apu + subgraph_bridge_concat_op_apu CACHE INTERNAL "apu_subgraph_bridges") message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")

diff --git a/lite/kernels/apu/bridges/concat_op.cc b/lite/kernels/apu/bridges/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..26f62101ab435059cde043c807f92cb3ba43dd01 --- /dev/null +++ b/lite/kernels/apu/bridges/concat_op.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> +#include "lite/core/subgraph_bridge_registry.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast<Graph*>(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + int neuron_errCode; + VLOG(3) << "[APU] Converting [" << op_type << "]"; + + // Get input and output vars and op attributes + auto x_names = op_info->Input("X"); + auto out_name = op_info->Output("Out").front(); + auto axis = op_info->GetAttr<int>("axis"); + auto num = x_names.size(); + + // Remap the concat axis for the NCHW -> NHWC layout change + if (axis == 1) + axis = 3; + else if (axis == 2) + axis = 1; + else if (axis == 3) + axis = 2; + + // Limitation: + // All input tensors of NEURON_TENSOR_QUANT8_ASYMM must + // have the same scale and zeroPoint as the output tensor + CHECK(op_info->HasOutputScale(out_name)); + auto output_scale = op_info->GetOutputScale(out_name)[0]; + + // Traverse all input nodes + std::vector<std::shared_ptr<Node>> input_nodes; + NeuronOperandType xType; + for (auto& x_name : x_names) { + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + std::shared_ptr<Node> x_node = nullptr; + + CHECK(op_info->HasInputScale(x_name)); + auto input_scale = op_info->GetInputScale(x_name)[0]; + + // Add x tensor type + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = input_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3], + (uint32_t)x_dims[1]}; + xType.dimensions = &dims_x[0]; + if (graph->Has(x_name)) { + VLOG(3) << "Graph has " << x_name; + if (graph->IsInput(x_name)) { + VLOG(3) << x_name << " is an input and already exists"; + x_name = "transpose_" + x_name; + } + + if (graph->IsOutput(x_name)) { + VLOG(3) << x_name << " is both an input and an output node"; + x_name = "transpose_" + x_name; + } + x_node = graph->Get(x_name); + } else { + // Add input operand + if (graph->IsInput(x_name)) { + // Insert transpose for NCHW -> NHWC + insert_transpose_node(ctx, + x_name, + "transpose_" + x_name, + {(uint32_t)x_dims[0], + (uint32_t)x_dims[1], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3]}, + dims_x, + {0, 2, 3, 1}, + xType.scale, + xType.zeroPoint); + + // Change x_name because we added a transpose op + x_name = "transpose_" + x_name; + x_node = graph->Get(x_name); + } else { + NeuronModel_addOperand(model, &xType); + x_node = graph->Add(x_name, dims_x); + } + } // End of else + if (x_node == nullptr) return subgraph::FAILED; + input_nodes.push_back(x_node); + + VLOG(3) << "input node x: " << x_node->index() + << ": input_scale: " << input_scale << " x_dims:" << x_dims[0] + << ":" << x_dims[1] << ":" << x_dims[2] << ":" << x_dims[3] + << ", xType: " << xType.dimensions[0] << ":" << xType.dimensions[1] + << ":" << xType.dimensions[2] << ":" << xType.dimensions[3]; + } // End of for + + if (input_nodes.size() != num) { + LOG(WARNING) << "Create input operand failed!"; + return subgraph::FAILED; + } + + // Add axis operand type + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector<uint32_t> dims_int32 = {1}; + + // Add axis operand + std::shared_ptr<Node> axis_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // axis + axis_node = graph->Add(out_name + "_axis", dims_int32); + VLOG(3) << "axis: " << axis;
"axis:" << axis; + + // Add out operand type + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = output_scale; + outType.zeroPoint = 128; + outType.dimensionCount = out_dims.size(); + std::vector dims_out = {(uint32_t)out_dims[0], + (uint32_t)out_dims[2], + (uint32_t)out_dims[3], + (uint32_t)out_dims[1]}; + outType.dimensions = &dims_out[0]; + + // Add out operand + std::shared_ptr out_node = nullptr; + if (graph->Has(out_name)) { + out_node = graph->Get(out_name); + } else { + if (graph->IsOutput(out_name)) { + NeuronModel_addOperand(model, &outType); + out_node = graph->Add("transpose_" + out_name, dims_out); + } else { + NeuronModel_addOperand(model, &outType); + out_node = graph->Add(out_name, dims_out); + } + } + VLOG(3) << "out node idx: " << out_node->index() + << ": output_scle: " << outType.scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Set axis value + int32_t axis_val[1] = {(int32_t)axis}; + NeuronModel_setOperandValue( + model, axis_node->index(), axis_val, sizeof(int32_t) * 1); + + std::vector addInIndex; + for (auto& node : input_nodes) { + addInIndex.push_back(node->index()); + } + + addInIndex.push_back(axis_node->index()); + std::vector addOutIndex = {out_node->index()}; + neuron_errCode = NeuronModel_addOperation(model, + NEURON_CONCATENATION, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op fail:" << op_type; + return subgraph::FAILED; + } + + if (graph->IsOutput(out_name)) { + // Insert transpose for NHWC -> NCHW + insert_transpose_node(ctx, + "transpose_" + out_name, + out_name, + dims_out, + {(uint32_t)out_dims[0], + (uint32_t)out_dims[1], + (uint32_t)out_dims[2], + (uint32_t)out_dims[3]}, + {0, 3, 1, 2}, + outType.scale, + outType.zeroPoint); + out_node = graph->Get(out_name); + if (out_node == nullptr) return subgraph::FAILED; + } + + return SUCCESS; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(concat, + kAPU, + paddle::lite::subgraph::apu::ConcatConverter); diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc index 1c3020065ee4b16a56f95077b5906effd75a0249..bb60331e44d94afaffac2dd42020c4b4c7b4309d 100644 --- a/lite/kernels/apu/bridges/conv_op.cc +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -73,7 +73,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(strides.size(), 2L); CHECK_EQ(dilations.size(), 2L); bool is_depthwise_mode = ic == groups && oc == groups; - VLOG(3) << "is_depthwise_mode" << is_depthwise_mode; + VLOG(3) << "is_depthwise_mode: " << is_depthwise_mode; if (paddings.size() == 2L) { for (size_t i = 0; i < strides.size(); ++i) { @@ -103,6 +103,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto filter_scale = op_info->GetInputScale(filter_name); CHECK(op_info->HasOutputScale(output_name)); auto output_scale = op_info->GetOutputScale(output_name)[0]; + auto orig_output_scale = op_info->GetOutputScale(output_name)[0]; VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups << " ,dilations: " << dilations[0] << ":" << dilations[1]; @@ -128,23 +129,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::shared_ptr input_node = nullptr; if 
diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc index 1c3020065ee4b16a56f95077b5906effd75a0249..bb60331e44d94afaffac2dd42020c4b4c7b4309d 100644 --- a/lite/kernels/apu/bridges/conv_op.cc +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -73,7 +73,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(strides.size(), 2L); CHECK_EQ(dilations.size(), 2L); bool is_depthwise_mode = ic == groups && oc == groups; - VLOG(3) << "is_depthwise_mode" << is_depthwise_mode; + VLOG(3) << "is_depthwise_mode: " << is_depthwise_mode; if (paddings.size() == 2L) { for (size_t i = 0; i < strides.size(); ++i) { @@ -103,6 +103,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto filter_scale = op_info->GetInputScale(filter_name); CHECK(op_info->HasOutputScale(output_name)); auto output_scale = op_info->GetOutputScale(output_name)[0]; + auto orig_output_scale = op_info->GetOutputScale(output_name)[0]; VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups << " ,dilations: " << dilations[0] << ":" << dilations[1]; @@ -128,23 +129,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::shared_ptr<Node> input_node = nullptr; if (graph->Has(input_name)) { VLOG(3) << "Graph has " << input_name; - // input operand already exist + + if (graph->IsInput(input_name)) { + VLOG(3) << input_name << " is an input and already exists"; + input_name = "transpose_" + input_name; + } + + if (graph->IsOutput(input_name)) { + VLOG(3) << input_name << " is both an input and an output node"; + input_name = "transpose_" + input_name; + } input_node = graph->Get(input_name); } else { - // add input operand if (graph->IsInput(input_name)) { // Insert transpose for NCHW -> NHWC - insert_transpose_node( - ctx, - input_name, - "transpose_" + input_name, - {input_dims[0], input_dims[1], input_dims[2], input_dims[3]}, - dims_in, - {0, 2, 3, 1}, - inType.scale, - inType.zeroPoint); - - // change input_name + insert_transpose_node(ctx, + input_name, + "transpose_" + input_name, + {(uint32_t)input_dims[0], + (uint32_t)input_dims[1], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3]}, + dims_in, + {0, 2, 3, 1}, + inType.scale, + inType.zeroPoint); + input_name = "transpose_" + input_name; input_node = graph->Get(input_name); if (input_node == nullptr) return subgraph::FAILED; @@ -153,7 +163,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { input_node = graph->Add(input_name, dims_in); } } - VLOG(3) << "input node idx" << input_node->index() + VLOG(3) << "input node idx: " << input_node->index() << ": input_scale: " << input_scale << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1] << ":" << inType.dimensions[2] << ":" << inType.dimensions[3]; @@ -161,8 +171,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add bias type NeuronOperandType biasType; - // Add filter type - // filter NCHW -> NHWC + // Add filter type, filter data re-layout NCHW -> NHWC Tensor transpose_filter; std::vector<uint32_t> dims_filter; @@ -233,10 +242,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { biasType.scale = 0; } + auto precision = filter->precision(); std::shared_ptr<Node> filter_node = nullptr; if (1 == filter_scale.size()) { - NeuronModel_addOperand(model, &filterType); // 1: filter - filter_node = graph->Add(filter_name, dims_filter); + NeuronModel_addOperand(model, &filterType); + filter_node = graph->Add(filter_name, dims_filter); // Operand 1: filter VLOG(3) << "filter node idx: " << filter_node->index() << " filter_scale[0]: " << filter_scale[0] << ": filterType: " << filterType.dimensions[0] << ":" << filterType.dimensions[1] << ":" @@ -251,7 +261,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { return subgraph::FAILED; } } else { - NeuronModel_addOperand(model, &channelFilterType); // 1: filter + NeuronModel_addOperand(model, &channelFilterType); // Operand 1: filter filter_node = graph->Add(filter_name, dims_filter); VLOG(3) << "channel filter node idx: " << filter_node->index() << " ,scale_count:" << filter_scale.size() @@ -280,7 +290,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add biasType node value // A 1-D tensor, of shape [depth_out], specifying the bias. // For filter tensor of NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL, the bias - // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 + // should be of NEURON_TENSOR_INT32, with zeroPoint of 0 // and bias_scale of 0. The actual scale of each value 'i' is equal // to bias_scale[i] = input_scale * filter_scale[i].
biasType.type = NEURON_TENSOR_INT32; @@ -296,16 +306,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { for (int i = 0; i < bias_dims.size(); i++) dims_bias.push_back(bias_dims[i]); biasType.dimensions = &dims_bias[0]; - NeuronModel_addOperand(model, &biasType); // 2: bias + NeuronModel_addOperand(model, &biasType); // Operand 2: bias bias_node = graph->Add(bias_name, dims_bias); - VLOG(3) << "node idx" << bias_node->index() << ": Bias name: " << bias_name + VLOG(3) << "node idx: " << bias_node->index() + << ": Bias name: " << bias_name << " ,bias scale: " << biasType.scale << " ,dimensions: " << bias_dims; } else { biasType.dimensionCount = 1; dims_bias = {(uint32_t)output_dims[1]}; biasType.dimensions = &dims_bias[0]; - NeuronModel_addOperand(model, &biasType); // 2: bias + NeuronModel_addOperand(model, &biasType); // Operand 2: bias bias_node = graph->Add(filter_name + "_default_bias", dims_bias); VLOG(3) << "node idx" << bias_node->index() << ": Bias name: default_bias " << " ,bias scale: " << biasType.scale @@ -318,39 +329,51 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::vector<uint32_t> dims_int32 = {1}; std::shared_ptr<Node> paddingL_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 3: padding left + NeuronModel_addOperand(model, &int32Type); // Operand 3: padding left paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32); std::shared_ptr<Node> paddingR_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 4: padding right + NeuronModel_addOperand(model, &int32Type); // Operand 4: padding right paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32); std::shared_ptr<Node> paddingT_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 5: padding top + NeuronModel_addOperand(model, &int32Type); // Operand 5: padding top paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32); std::shared_ptr<Node> paddingB_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 6: padding bottom + NeuronModel_addOperand(model, &int32Type); // Operand 6: padding bottom paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32); std::shared_ptr<Node> strideW_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 7: stride width + NeuronModel_addOperand(model, &int32Type); // Operand 7: stride width strideW_node = graph->Add(filter_name + "_stride_width", dims_int32); std::shared_ptr<Node> strideH_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 8: stride height + NeuronModel_addOperand(model, &int32Type); // Operand 8: stride height strideH_node = graph->Add(filter_name + "_stride_height", dims_int32); std::shared_ptr<Node> dm_node = nullptr; if (is_depthwise_mode) { - NeuronModel_addOperand(model, &int32Type); // 9: depthwise multiplier + NeuronModel_addOperand(model, + &int32Type); // Operand 9: depthwise multiplier dm_node = graph->Add(filter_name + "_dm", dims_int32); } std::shared_ptr<Node> fuse_node = nullptr; - NeuronModel_addOperand(model, &int32Type); // 9/10: fuse + NeuronModel_addOperand(model, &int32Type); // Operand 9/10: fuse fuse_node = graph->Add(filter_name + "_fuse", dims_int32); + /* Check output scale: for depthwise conv, output_scale must exceed + input_scale * filter_scale; otherwise requantization is inserted later */ + if (is_depthwise_mode) { + for (auto s : filter_scale) { + if (output_scale < s * input_scale) + output_scale = s * input_scale + 0.000001; + } +#ifdef LITE_MEDIATEK_APU_ENABLE_REQUANT + output_scale = orig_output_scale; +#endif + } + // Add output tensor type NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; @@ -366,12 +389,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (graph->Has(output_name)) { output_node = graph->Get(output_name); } else { - // add output operand - if (graph->IsOutput(output_name)) { - NeuronModel_addOperand(model, &outType); // output + // Add output operand + NeuronModel_addOperand(model, &outType); + + if (orig_output_scale != output_scale) { + // Need to insert requant op, the result is requant_ -> transpose_ -> + // output + output_node = graph->Add("requant_" + output_name, dims_out); + } else if (graph->IsOutput(output_name)) { + // Need to insert transpose op, transpose_ -> output output_node = graph->Add("transpose_" + output_name, dims_out); } else { - NeuronModel_addOperand(model, &outType); // output output_node = graph->Add(output_name, dims_out); } } @@ -433,10 +461,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add Stride int32_t stride_val[1]; - stride_val[0] = strides[1]; // width + stride_val[0] = strides[1]; // Entry 1: width stride NeuronModel_setOperandValue( model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); - stride_val[0] = strides[0]; // height + stride_val[0] = strides[0]; // Entry 0: height stride NeuronModel_setOperandValue( model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); @@ -460,7 +488,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { model, dm_node->index(), &dm, sizeof(int32_t) * 1); VLOG(3) << "depthwise multiplier:" << dm; - // Depthwise conv + // Depthwise conv case NeuronModel_setOperandValue( model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); std::vector<uint32_t> addInIndex = { @@ -512,19 +540,46 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { return FAILED; } + // Check if a Requant OP is needed + std::shared_ptr<Node> requant_node = nullptr; + if (orig_output_scale != output_scale) { + std::string requant_out_name = output_name; + VLOG(3) << "Insert requant output scale, orig:" << orig_output_scale + << " ,output_scale:" << output_scale; + if (graph->IsOutput(output_name)) { + requant_out_name = "transpose_" + output_name; + } + + insert_requant_node(ctx, + "requant_" + output_name, + requant_out_name, + dims_out, + dims_out, + output_scale, + orig_output_scale, + outType.zeroPoint); + + requant_node = graph->Get(requant_out_name); + if (requant_node == nullptr) return subgraph::FAILED; + } + + std::shared_ptr<Node> transpose_node = nullptr; if (graph->IsOutput(output_name)) { + VLOG(3) << "Add output transpose:" << output_name; // Insert transpose for NHWC -> NCHW - insert_transpose_node( - ctx, - "transpose_" + output_name, - output_name, - dims_out, - {output_dims[0], output_dims[1], output_dims[2], output_dims[3]}, - {0, 3, 1, 2}, - outType.scale, - outType.zeroPoint); - output_node = graph->Get(output_name); - if (output_node == nullptr) return subgraph::FAILED; + insert_transpose_node(ctx, + "transpose_" + output_name, + output_name, + dims_out, + {(uint32_t)output_dims[0], + (uint32_t)output_dims[1], + (uint32_t)output_dims[2], + (uint32_t)output_dims[3]}, + {0, 3, 1, 2}, + outType.scale, + outType.zeroPoint); + transpose_node = graph->Get(output_name); + if (transpose_node == nullptr) return subgraph::FAILED; } return REBUILD_WHEN_SHAPE_CHANGED;
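The depthwise scale check above guards the Neuron runtime constraint that a quantized depthwise convolution needs output_scale >= input_scale * filter_scale[c] for every channel c; when the model's own output scale violates that, the op runs at a safe scale and a requant node restores the original scale afterwards. A standalone restatement of the adjustment, under the assumption that the epsilon only needs to break the equality:

  // Example with assumed values: input_scale = 0.02,
  // filter_scale = {0.5, 1.2}, output_scale = 0.015.
  // Channel 0: 0.02 * 0.5 = 0.010 -> fine.
  // Channel 1: 0.02 * 1.2 = 0.024 -> violated, so the op runs at
  // 0.024 + 1e-6 and the inserted requant node restores 0.015.
  float SafeDepthwiseOutputScale(float output_scale, float input_scale,
                                 const std::vector<float>& filter_scale) {
    for (float s : filter_scale) {
      if (output_scale < s * input_scale) output_scale = s * input_scale + 1e-6f;
    }
    return output_scale;
  }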
diff --git a/lite/kernels/apu/bridges/conv_transpose_op.cc b/lite/kernels/apu/bridges/conv_transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..386c89c128e476611ebde4b337823775b5ae01a9 --- /dev/null +++ b/lite/kernels/apu/bridges/conv_transpose_op.cc @@ -0,0 +1,488 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> +#include "lite/core/subgraph_bridge_registry.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" + +#include "lite/operators/conv_op.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ConvTransposeConverter(void *ctx, OpLite *op, KernelBase *kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast<Graph *>(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + int neuron_errCode; + VLOG(3) << "[APU] Converting [" << op_type << "]"; + + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr<bool>("enable_int8")); + + // Get input, output and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + CHECK_EQ(input_dims.size(), 4); + + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + CHECK_EQ(filter_dims.size(), 4); + + auto output_name = op_info->Output("Output").front(); + + auto strides = op_info->GetAttr<std::vector<int>>("strides"); + CHECK_EQ(strides.size(), 2L); + auto paddings = op_info->GetAttr<std::vector<int>>("paddings"); + auto groups = op_info->GetAttr<int>("groups"); + if (groups > 1) { + LOG(WARNING) << "[APU] Only groups == 1 is supported"; + return FAILED; + } + + bool with_act = + op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act"); + std::string act_type = + with_act ? op_info->GetAttr<std::string>("act_type") : ""; + float leaky_relu_alpha = act_type == "leaky_relu" + ? op_info->GetAttr<float>("leaky_relu_alpha") + : 0.f; + auto fuse_relu = + op_info->HasAttr("fuse_relu") && op_info->GetAttr<bool>("fuse_relu"); + + auto dilations = op_info->GetAttr<std::vector<int>>("dilations"); + CHECK_EQ(dilations.size(), 2L); + std::string padding_algorithm = + op_info->HasAttr("padding_algorithm") + ? op_info->GetAttr<std::string>("padding_algorithm") + : ""; + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + + CHECK_EQ(paddings.size(), 4L) + << "[APU] Paddings size should be the same or twice as the input size."
+ << paddings.size(); + + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + std::vector<int64_t> output_dims; + // Set output_dims: batches + output_dims.push_back(input_dims[0]); + + std::vector<int> output_size; + if (op_info->HasAttr("output_size")) { + output_size = op_info->GetAttr<std::vector<int>>("output_size"); + } + + if (output_size.size() == 2) { + // Take height and width directly from the output_size attribute + output_dims.push_back(output_size[0]); + output_dims.push_back(output_size[1]); + } else { + // Compute output size: out = (in - 1) * stride + kernel - pad_begin - pad_end + for (int i = 0; i < strides.size(); i++) { + int kernel_ext = filter_dims[i + 2]; + int out_size = (input_dims[i + 2] - 1) * strides[i] + kernel_ext - + paddings[i * 2] - paddings[i * 2 + 1]; + output_dims.push_back(out_size); + } + } + output_dims.push_back(filter_dims[1]); + + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(filter_name)); + auto filter_scale = op_info->GetInputScale(filter_name); + CHECK(op_info->HasOutputScale(output_name)); + auto output_scale = op_info->GetOutputScale(output_name)[0]; + + VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups + << " ,dilations: " << dilations[0] << ":" << dilations[1]; + VLOG(3) << "with_act: " << with_act << " ,act_type: " << act_type; + VLOG(3) << "input_dims: " << input_dims + << " ,filter_scale size: " << filter_scale.size(); + VLOG(3) << "filter_dims(Cin, Cout, H, W): " << filter_dims + << " ,memory_size: " << filter->memory_size() + << " ,data_size: " << filter->data_size(); + + // Add input tensor type + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = input_scale; + inType.zeroPoint = 128; + inType.dimensionCount = input_dims.size(); + std::vector<uint32_t> dims_in = {(uint32_t)input_dims[0], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3], + (uint32_t)input_dims[1]}; + inType.dimensions = &dims_in[0]; + + std::shared_ptr<Node> input_node = nullptr; + if (graph->Has(input_name)) { + VLOG(3) << "Graph has " << input_name; + // Input operand already created by a previous op + input_node = graph->Get(input_name); + } else { + // Add input operand + if (graph->IsInput(input_name)) { + // Insert transpose for NCHW -> NHWC + insert_transpose_node(ctx, + input_name, + "transpose_" + input_name, + {(uint32_t)input_dims[0], + (uint32_t)input_dims[1], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3]}, + dims_in, + {0, 2, 3, 1}, + inType.scale, + inType.zeroPoint); + + // Change input_name because we added a transpose op + input_name = "transpose_" + input_name; + input_node = graph->Get(input_name); + if (input_node == nullptr) return subgraph::FAILED; + } else { + NeuronModel_addOperand(model, &inType); + input_node = graph->Add(input_name, dims_in); + } + } + + VLOG(3) << "input node idx: " << input_node->index() + << ": input_scale: " << input_scale + << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1] + << ":" << inType.dimensions[2] << ":" << inType.dimensions[3]; + + // Add bias type + NeuronOperandType biasType; + + // Add filter type + // Re-lay out the filter (Cin, Cout, H, W) -> (depth_out, h, w, depth_in) + Tensor transpose_filter; + std::vector<uint32_t> dims_filter; + transpose_filter.Resize({(uint32_t)filter_dims[1], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}); + + transposeAsym(filter->data<int8_t>(), + transpose_filter.mutable_data<uint8_t>(), + {(uint32_t)filter_dims[0], + (uint32_t)filter_dims[1], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3]}, + {1, 2, 3, 0});
+ dims_filter = {(uint32_t)filter_dims[1], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}; + + NeuronOperandType filterType; + filterType.type = NEURON_TENSOR_QUANT8_ASYMM; + filterType.scale = filter_scale[0]; + filterType.zeroPoint = 128; + filterType.dimensionCount = filter_dims.size(); + filterType.dimensions = &dims_filter[0]; + biasType.scale = inType.scale * filterType.scale; + + std::shared_ptr<Node> filter_node = nullptr; + NeuronModel_addOperand(model, &filterType); + filter_node = graph->Add(filter_name, dims_filter); + auto precision = filter->precision(); + VLOG(3) << " filter node idx: " << filter_node->index() + << " filter_scale[0]=" << filter_scale[0] + << " filter memory_size=" << filter->memory_size() + << " filter precision=" << PrecisionToStr(precision) + << " :filterType: " << filterType.dimensions[0] << ":" + << filterType.dimensions[1] << ":" << filterType.dimensions[2] << ":" + << filterType.dimensions[3]; + + memcpy(filter->mutable_data<int8_t>(), + transpose_filter.mutable_data<uint8_t>(), + filter->memory_size()); + + // Set filter value + neuron_errCode = NeuronModel_setOperandValue( + model, filter_node->index(), filter->raw_data(), filter->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set filter operand value failed:" << neuron_errCode; + return subgraph::FAILED; + } + + // Add biasType node value + // A 1-D tensor, of shape [depth_out], specifying the bias. + // For filter tensor of NEURON_TENSOR_QUANT8_ASYMM, the bias should be of + // NEURON_TENSOR_INT32 with zeroPoint of 0 and bias_scale == + // input_scale * filter_scale + biasType.type = NEURON_TENSOR_INT32; + biasType.zeroPoint = 0; + std::vector<uint32_t> dims_bias; + std::shared_ptr<Node> bias_node = nullptr; + + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto channel_size = bias->dims().production(); + CHECK_EQ(channel_size, filter_dims[1] * groups); + CHECK_EQ(bias_dims.size(), 1); + + biasType.dimensionCount = bias_dims.size(); + for (int i = 0; i < bias_dims.size(); i++) + dims_bias.push_back(bias_dims[i]); + biasType.dimensions = &dims_bias[0]; + NeuronModel_addOperand(model, &biasType); // Operand 2: bias + bias_node = graph->Add(bias_name, dims_bias); + VLOG(3) << "node idx: " << bias_node->index() + << ": Bias name: " << bias_name + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << bias_dims + << " ,channel_size:" << channel_size; + + } else { + // Create a default bias filled with zeros; depth_out is output_dims[3] + // since output_dims is laid out as NHWC here + biasType.dimensionCount = 1; + dims_bias = {(uint32_t)output_dims[3]}; + biasType.dimensions = &dims_bias[0]; + NeuronModel_addOperand(model, &biasType); // Operand 2: bias + bias_node = graph->Add(filter_name + "_default_bias", dims_bias); + VLOG(3) << "node idx: " << bias_node->index() + << ": Bias name: default_bias " + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << dims_bias.size(); + } + + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector<uint32_t> dims_int32 = {1}; + + std::shared_ptr<Node> paddingL_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 3: padding left + paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32); + + std::shared_ptr<Node> paddingR_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 4: padding right + paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32);
+ + std::shared_ptr<Node> paddingT_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 5: padding top + paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32); + + std::shared_ptr<Node> paddingB_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 6: padding bottom + paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32); + + std::shared_ptr<Node> strideW_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 7: stride width + strideW_node = graph->Add(filter_name + "_stride_width", dims_int32); + + std::shared_ptr<Node> strideH_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 8: stride height + strideH_node = graph->Add(filter_name + "_stride_height", dims_int32); + + std::shared_ptr<Node> fuse_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 9: fuse + fuse_node = graph->Add(filter_name + "_fuse", dims_int32); + + NeuronOperandType boolType; + boolType.type = NEURON_BOOL; + boolType.dimensionCount = 0; // Must be 0 for scalars. + std::shared_ptr<Node> layout_node = nullptr; + NeuronModel_addOperand(model, &boolType); // Operand 10: layout + layout_node = graph->Add(filter_name + "_layout", dims_int32); + + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = output_scale; + outType.zeroPoint = 128; + outType.dimensionCount = output_dims.size(); + std::vector<uint32_t> dims_out = {(uint32_t)output_dims[0], + (uint32_t)output_dims[1], + (uint32_t)output_dims[2], + (uint32_t)output_dims[3]}; + outType.dimensions = &dims_out[0]; + std::shared_ptr<Node> output_node = nullptr; + if (graph->Has(output_name)) { + output_node = graph->Get(output_name); + } else { + if (graph->IsOutput(output_name)) { + NeuronModel_addOperand(model, &outType); + output_node = graph->Add("transpose_" + output_name, dims_out); + } else { + NeuronModel_addOperand(model, &outType); + output_node = graph->Add(output_name, dims_out); + } + } + VLOG(3) << "output node idx: " << output_node->index() + << ": output_scale: " << outType.scale + << " ,outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Add bias value + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + + int32_t *int32_bias_data = + reinterpret_cast<int32_t *>(bias->mutable_data<float>()); + float2int32( + bias->data<float>(), input_scale, filter_scale, int32_bias_data); + + VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << ":" + << int32_bias_data[1] << ":" << int32_bias_data[2] << ":" + << int32_bias_data[3]; + + neuron_errCode = NeuronModel_setOperandValue( + model, bias_node->index(), bias->raw_data(), bias->memory_size()); + } else { + auto int32_bias = std::make_shared<Tensor>(); + int32_bias->Resize({1, output_dims[3]}); + int32_bias->mutable_data<int32_t>(); + VLOG(3) << "bias_default: " << int32_bias->memory_size(); + memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size()); + neuron_errCode = NeuronModel_setOperandValue(model, + bias_node->index(), + int32_bias->raw_data(), + int32_bias->memory_size()); + bias_node->set_data(int32_bias); + } + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set bias operand value failed:" << neuron_errCode; + return subgraph::FAILED; + } + + VLOG(3) << "paddings: " << paddings[0] << ":" << paddings[1] << ":" + << paddings[2] << ":" << paddings[3]; + // Add padding value
+ int32_t padding_val[1]; + padding_val[0] = paddings[2]; + NeuronModel_setOperandValue( model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[3]; + NeuronModel_setOperandValue( model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[0]; + NeuronModel_setOperandValue( model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[1]; + NeuronModel_setOperandValue( model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1); + + VLOG(3) << " stride width:" << strides[1] << " height:" << strides[0]; + + // Add Stride + int32_t stride_val[1]; + stride_val[0] = strides[1]; // Entry 1: width stride + NeuronModel_setOperandValue( model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); + stride_val[0] = strides[0]; // Entry 0: height stride + NeuronModel_setOperandValue( model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); + + int32_t fuse_val[1] = {NEURON_FUSED_NONE}; + if (act_type == "relu") { + fuse_val[0] = NEURON_FUSED_RELU; + } else if (act_type == "relu1") { + fuse_val[0] = NEURON_FUSED_RELU1; + } else if (act_type == "relu6") { + fuse_val[0] = NEURON_FUSED_RELU6; + } else if (!act_type.empty()) { + fuse_val[0] = NEURON_FUSED_NONE; + LOG(WARNING) << "Unsupported act_type: " << act_type; + return FAILED; + } + + NeuronModel_setOperandValue( model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + + bool layout_val[] = {false}; + NeuronModel_setOperandValue( model, layout_node->index(), layout_val, sizeof(bool) * 1); + + std::vector<uint32_t> addInIndex = { + input_node->index(), // 0: input + filter_node->index(), // 1: filter + bias_node->index(), // 2: bias + paddingL_node->index(), // 3: padding left + paddingR_node->index(), // 4: padding right + paddingT_node->index(), // 5: padding top + paddingB_node->index(), // 6: padding bottom + strideW_node->index(), // 7: stride width + strideH_node->index(), // 8: stride height + fuse_node->index(), // 9: fuse + layout_node->index()}; // 10: layout + + std::vector<uint32_t> addOutIndex = {output_node->index()}; + neuron_errCode = NeuronModel_addOperation(model, + NEURON_TRANSPOSE_CONV_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op failed:" << op_type; + return FAILED; + } + + if (graph->IsOutput(output_name)) { + // Insert transpose for NHWC -> NCHW + insert_transpose_node(ctx, + "transpose_" + output_name, + output_name, + dims_out, + {(uint32_t)output_dims[0], + (uint32_t)output_dims[1], + (uint32_t)output_dims[2], + (uint32_t)output_dims[3]}, + {0, 3, 1, 2}, + outType.scale, + outType.zeroPoint); + output_node = graph->Get(output_name); + if (output_node == nullptr) return subgraph::FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(conv2d_transpose, + kAPU, + paddle::lite::subgraph::apu::ConvTransposeConverter);
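When no output_size attribute is given, this converter derives the transposed-convolution output extent from the input size, stride, kernel extent, and the two paddings. A worked restatement of that formula with assumed values:

  // out = (in - 1) * stride + kernel - pad_begin - pad_end
  // e.g. in = 16, stride = 2, kernel = 4, pads = 1/1:
  //      out = (16 - 1) * 2 + 4 - 1 - 1 = 32
  int DeconvOutputSize(int in, int stride, int kernel, int pad_begin, int pad_end) {
    return (in - 1) * stride + kernel - pad_begin - pad_end;
  }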
diff --git a/lite/kernels/apu/bridges/elementwise_ops.cc b/lite/kernels/apu/bridges/elementwise_ops.cc index 964e81eb6aba26c44dd1b3cd0658984792c6259f..af8f76c68e20e1206bf16450cd04f5ecf5cf7bb9 100644 --- a/lite/kernels/apu/bridges/elementwise_ops.cc +++ b/lite/kernels/apu/bridges/elementwise_ops.cc @@ -29,28 +29,252 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto op_info = op->op_info(); auto op_type = op_info->Type(); auto scope = op->scope(); - VLOG(3) << "[APU] Converting " + op_type + "..."; + int neuron_errCode; + VLOG(3) << "[APU] Converting [" + op_type + "]"; // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); - auto x = scope->FindMutableTensor(x_name); + auto x = scope->FindTensor(x_name); auto x_dims = x->dims(); auto y_name = op_info->Input("Y").front(); - auto y = scope->FindMutableTensor(y_name); + auto y = scope->FindTensor(y_name); auto y_dims = y->dims(); auto out_name = op_info->Output("Out").front(); - auto out = scope->FindMutableTensor(out_name); + auto out = scope->FindTensor(out_name); auto out_dims = out->dims(); + auto axis = op_info->GetAttr<int>("axis"); + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + + auto x_shape = x_dims.Vectorize(); + auto y_shape = y_dims.Vectorize(); + + // Two dimensions are compatible when: + // 1. they are equal, or + // 2. the dimension from Y is 1 + for (int i = axis; i < x_shape.size(); i++) { + if (x_dims[i] != y_dims[i - axis]) { + // Input 1's dimensions must be broadcast-compatible with input 0's + if (y_dims[i - axis] != 1) { + LOG(WARNING) << "Incompatible dims: " << i << ":" << axis << ":" + << y_dims[i - axis]; + return FAILED; + } + } + } // End of for + int32_t fuse_val[1] = {NEURON_FUSED_NONE}; // Act node if (op_type == "fusion_elementwise_add_activation" || op_type == "fusion_elementwise_sub_activation" || op_type == "fusion_elementwise_mul_activation" || op_type == "fusion_elementwise_div_activation") { auto act_type = op_info->GetAttr<std::string>("act_type"); + + if (act_type == "relu") { + fuse_val[0] = NEURON_FUSED_RELU; + } else if (act_type == "relu1") { + fuse_val[0] = NEURON_FUSED_RELU1; + } else if (act_type == "relu6") { + fuse_val[0] = NEURON_FUSED_RELU6; + } else if (!act_type.empty()) { + fuse_val[0] = NEURON_FUSED_NONE; + LOG(WARNING) << "Unsupported act_type: " << act_type; + return FAILED; + } + } // End of if + VLOG(3) << "x_name: " << x_name; + + CHECK(op_info->HasInputScale(x_name)); + auto x_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasInputScale(y_name)); + auto y_scale = op_info->GetInputScale(y_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; + + // Add x tensor type + NeuronOperandType xType; + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = x_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3], + (uint32_t)x_dims[1]}; + xType.dimensions = &dims_x[0]; + + std::shared_ptr<Node> x_node = nullptr; + if (graph->Has(x_name)) { + VLOG(3) << "Graph has " << x_name; + if (graph->IsInput(x_name)) { + VLOG(3) << x_name << " is an input and already exists"; + x_name = "transpose_" + x_name; + } + + if (graph->IsOutput(x_name)) { + VLOG(3) << x_name << " is both an input and an output node"; + x_name = "transpose_" + x_name; + } + x_node = graph->Get(x_name); + } else { + if (graph->IsInput(x_name)) { + insert_transpose_node(ctx, + x_name, + "transpose_" + x_name, + {(uint32_t)x_dims[0], + (uint32_t)x_dims[1], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3]}, + dims_x, + {0, 2, 3, 1}, + xType.scale, + xType.zeroPoint); + + // Change x name after inserting a transpose op for the x data relayout + x_name = "transpose_" + x_name; + x_node = graph->Get(x_name); + } else { + NeuronModel_addOperand(model, &xType); + x_node = graph->Add(x_name, dims_x); + } + } // End of else + VLOG(3) << "x node idx: " << x_node->index() << ", x_dims: " << x_dims + << ", x_scale: " << x_scale << ", xType: " << xType.dimensions[0] + << ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":" + << xType.dimensions[3];
+ << ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":" + << xType.dimensions[3]; + + // Add y tensor type + NeuronOperandType yType; + yType.type = NEURON_TENSOR_QUANT8_ASYMM; + yType.scale = y_scale; + yType.zeroPoint = 128; + yType.dimensionCount = y_dims.size(); + std::vector dims_y = {(uint32_t)y_dims[0], + (uint32_t)y_dims[2], + (uint32_t)y_dims[3], + (uint32_t)y_dims[1]}; + yType.dimensions = &dims_y[0]; + + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + VLOG(3) << "Graph has " << y_name; + y_node = graph->Get(y_name); + } else { + if (graph->IsInput(y_name)) { + insert_transpose_node(ctx, + y_name, + "transpose_" + y_name, + {(uint32_t)y_dims[0], + (uint32_t)y_dims[1], + (uint32_t)y_dims[2], + (uint32_t)y_dims[3]}, + dims_y, + {0, 2, 3, 1}, + yType.scale, + yType.zeroPoint); + + y_name = "transpose_" + y_name; + y_node = graph->Get(y_name); + } else { + NeuronModel_addOperand(model, &yType); + y_node = graph->Add(y_name, dims_y); + } + } + VLOG(3) << "y node idx: " << y_node->index() << "y_dims: " << y_dims + << ": y_scale: " << y_scale << ", yType: " << yType.dimensions[0] + << ":" << yType.dimensions[1] << ":" << yType.dimensions[2] << ":" + << yType.dimensions[3]; + + // Add fuse operand type + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector dims_int32 = {1}; + + // Add fuse operand + std::shared_ptr fuse_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // Operand 2: fuse + fuse_node = graph->Add(out_name + "_fuse", dims_int32); + + // Add out tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale; + outType.zeroPoint = 128; + outType.dimensionCount = out_dims.size(); + std::vector dims_out = {(uint32_t)out_dims[0], + (uint32_t)out_dims[2], + (uint32_t)out_dims[3], + (uint32_t)out_dims[1]}; + outType.dimensions = &dims_out[0]; + + std::shared_ptr out_node = nullptr; + if (graph->Has(out_name)) { + VLOG(3) << "Graph has " << out_name; + out_node = graph->Get(out_name); + } else { + if (graph->IsOutput(out_name)) { + NeuronModel_addOperand(model, &outType); + out_node = graph->Add("transpose_" + out_name, dims_out); + } else { + NeuronModel_addOperand(model, &outType); + out_node = graph->Add(out_name, dims_out); + } + } + VLOG(3) << "out node idx: " << out_node->index() << "out_dims: " << out_dims + << ": out_scale: " << out_scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Set fuse value + NeuronModel_setOperandValue( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + + std::vector addInIndex = { + x_node->index(), // 0: A tensor + y_node->index(), // 1: A tensor of the same OperandCode, + // and compatible dimensions as input 0 + fuse_node->index()}; // 2: fuse + + std::vector addOutIndex = {out_node->index()}; + if (op_type == "elementwise_add" || + op_type == "fusion_elementwise_add_activation") { + neuron_errCode = NeuronModel_addOperation(model, + NEURON_ADD, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } else { + LOG(WARNING) << "[APU] Unsupported op type: " << op_type; + return FAILED; + } + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "ADD op fail:" << op_type; + return FAILED; + } + + if (graph->IsOutput(out_name)) { + // Insert transpose for NHWC -> NCHW + insert_transpose_node(ctx, + "transpose_" + out_name, + out_name, + dims_out, + 
diff --git a/lite/kernels/apu/bridges/fc_op.cc b/lite/kernels/apu/bridges/fc_op.cc index 5bee94424402c52b61bdd478488a55210f9b4000..ac0d27bc7bb950f764626d509238db18857a7e64 100644 --- a/lite/kernels/apu/bridges/fc_op.cc +++ b/lite/kernels/apu/bridges/fc_op.cc @@ -77,12 +77,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { inType.dimensions = &dims_in[0]; std::shared_ptr<Node> in_node = nullptr; if (graph->Has(input_name)) { - // input operand already exist in_node = graph->Get(input_name); VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index(); } else { - // add input operand - NeuronModel_addOperand(model, &inType); // 0: input + NeuronModel_addOperand(model, &inType); // Operand 0: input in_node = graph->Add(input_name, dims_in); } VLOG(3) << "input_scale: " << input_scale @@ -97,7 +95,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { wType.dimensionCount = w_dims.size(); std::vector<uint32_t> dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]}; wType.dimensions = &dims_w[0]; - NeuronModel_addOperand(model, &wType); // 1: weight + NeuronModel_addOperand(model, &wType); // Operand 1: weight std::shared_ptr<Node> w_node = nullptr; w_node = graph->Add(w_name, dims_w); VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0] @@ -119,7 +117,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { biasType.dimensionCount = bias_dims.size(); std::vector<uint32_t> dims_bias = {(uint32_t)bias_dims[0]}; biasType.dimensions = &dims_bias[0]; - NeuronModel_addOperand(model, &biasType); // 2: bias + NeuronModel_addOperand(model, &biasType); // Operand 2: bias bias_node = graph->Add(bias_name, dims_bias); VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims << ", bias scale: " << biasType.scale @@ -128,7 +126,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { biasType.dimensionCount = 1; std::vector<uint32_t> dims_bias = {(uint32_t)n}; biasType.dimensions = &dims_bias[0]; - NeuronModel_addOperand(model, &biasType); // 2: bias + NeuronModel_addOperand(model, &biasType); // Operand 2: bias bias_node = graph->Add(w_name + "_default_bias", dims_bias); } @@ -137,7 +135,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { fuseType.type = NEURON_INT32; fuseType.dimensionCount = 0; std::vector<uint32_t> dims_int32 = {0}; - NeuronModel_addOperand(model, &fuseType); // 3: fuse + NeuronModel_addOperand(model, &fuseType); // Operand 3: fuse std::shared_ptr<Node> fuse_node = nullptr; fuse_node = graph->Add(w_name + "_fuse", dims_int32); @@ -147,12 +145,13 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { outType.scale = out_scale; outType.zeroPoint = 128; outType.dimensionCount = 2; - std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0], out_dims[1]}; + std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0], + (uint32_t)out_dims[1]}; outType.dimensions = &dims_out[0]; VLOG(3) << "out_scale: " << out_scale << ", outType: " << outType.dimensions[0] << " : " << outType.dimensions[1]; - NeuronModel_addOperand(model, &outType); // output + NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> out_node = nullptr; out_node = graph->Add(out_name, dims_out); @@ -190,29 +189,31 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronModel_setOperandValue(model, bias_node->index(), bias->raw_data(), - bias->memory_size()); // 2: bias + bias->memory_size()); // Operand 2: bias } else { auto int32_bias = std::make_shared<Tensor>(); int32_bias->Resize({1, out_dims[1]}); int32_bias->mutable_data<int32_t>(); memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size()); VLOG(3) << "default: " << int32_bias->memory_size(); - neuron_errCode = - NeuronModel_setOperandValue(model, - bias_node->index(), - int32_bias->raw_data(), - int32_bias->memory_size()); // 2: bias + neuron_errCode = NeuronModel_setOperandValue( + model, + bias_node->index(), + int32_bias->raw_data(), + int32_bias->memory_size()); // Operand 2: bias bias_node->set_data(int32_bias); } // Add fuse value int32_t fuse_val[1] = {0}; - NeuronModel_setOperandValue( - model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); // 3: fuse - - std::vector<uint32_t> addInIndex = {in_node->index(), - w_node->index(), - bias_node->index(), - fuse_node->index()}; + NeuronModel_setOperandValue(model, + fuse_node->index(), + fuse_val, + sizeof(int32_t) * 1); // Operand 3: fuse + + std::vector<uint32_t> addInIndex = {in_node->index(), // 0: input + w_node->index(), // 1: weight + bias_node->index(), // 2: bias + fuse_node->index()}; // 3: fuse std::vector<uint32_t> addOutIndex = {out_node->index()}; neuron_errCode = NeuronModel_addOperation(model, NEURON_FULLY_CONNECTED,

diff --git a/lite/kernels/apu/bridges/graph.cc b/lite/kernels/apu/bridges/graph.cc old mode 100644 new mode 100755 index 515853aa26a1d84339c61047b5d3be20478b5ca3..ee7c92d2c2b9399b44fffd2fe8ad80618f3de526 --- a/lite/kernels/apu/bridges/graph.cc +++ b/lite/kernels/apu/bridges/graph.cc @@ -28,7 +28,7 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) { LOG(FATAL) << "[APU] Node " << name << " is redefined."; return -1; } else { - VLOG(3) << " Add: " << name << " : " << node->index(); + VLOG(5) << " Add: " << name << " : " << node->index(); auto ret = nodes_.insert( std::make_pair(name, std::vector<std::shared_ptr<Node>>())); CHECK(ret.second);

diff --git a/lite/kernels/apu/bridges/paddle_use_bridges.h b/lite/kernels/apu/bridges/paddle_use_bridges.h old mode 100644 new mode 100755 index e3e68afc6c7c18d2b8d68361ac09de2abf2b684c..264ca8160ae4343eda7b8c7424cf26c0257512d8 --- a/lite/kernels/apu/bridges/paddle_use_bridges.h +++ b/lite/kernels/apu/bridges/paddle_use_bridges.h @@ -22,3 +22,6 @@ USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU); USE_SUBGRAPH_BRIDGE(fc, kAPU); USE_SUBGRAPH_BRIDGE(pool2d, kAPU); USE_SUBGRAPH_BRIDGE(softmax, kAPU); +USE_SUBGRAPH_BRIDGE(concat, kAPU); +USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kAPU); +USE_SUBGRAPH_BRIDGE(conv2d_transpose, kAPU);
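Both the fc and conv_transpose bridges above quantize the float bias into the int32 operand with bias_scale = input_scale * weight_scale, as the comments in conv_op.cc spell out. The float2int32 helper they call lives in bridges/utility.h and its exact signature is not shown in this diff, so the following is only a hedged sketch of the computation it plausibly performs:

  // Assumed shape of the helper: one int32 value per output channel,
  // rounding bias[i] at scale input_scale * weight_scale[i] (a single
  // per-tensor scale degenerates to weight_scale.size() == 1).
  // Needs <cmath> for std::round and <vector> for std::vector.
  void Float2Int32Sketch(const float* bias, float input_scale,
                         const std::vector<float>& weight_scale,
                         size_t channel_count, int32_t* out) {
    for (size_t i = 0; i < channel_count; ++i) {
      float bias_scale = input_scale * weight_scale[i % weight_scale.size()];
      out[i] = static_cast<int32_t>(std::round(bias[i] / bias_scale));
    }
  }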
+USE_SUBGRAPH_BRIDGE(concat, kAPU);
+USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kAPU);
+USE_SUBGRAPH_BRIDGE(conv2d_transpose, kAPU);
diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc
index e2555180446920b670d98ebc3d82aa492ed244f4..20691ee737ec47528b800367dca8d615f0b878a6 100644
--- a/lite/kernels/apu/bridges/pool_op.cc
+++ b/lite/kernels/apu/bridges/pool_op.cc
@@ -47,14 +47,14 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
   std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
 
-  // pool mode
+  // Check pool mode
   if ((pooling_type == "max") || (pooling_type == "avg")) {
   } else {
     LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type;
     return FAILED;
   }
 
-  // pad mode
+  // Check padding mode
   int pad_mode = 0;
   std::string padding_algorithm("");
   if (op_info->HasAttr("padding_algorithm")) {
@@ -66,7 +66,7 @@
     pad_mode = 5;
   }
 
-  // paddings and strides
+  // Check paddings and strides
   if (paddings.size() == 2L) {
     for (size_t i = 0; i < 2L; ++i) {
       int copy_pad = *(paddings.begin() + 2 * i);
@@ -107,60 +107,59 @@
   xType.dimensions = &dims_x[0];
   std::shared_ptr<Node> x_node = nullptr;
   if (graph->Has(x_name)) {
-    LOG(INFO) << "Graph has " << x_name;
-    // input operand already exist
+    VLOG(3) << "Graph has " << x_name;
     x_node = graph->Get(x_name);
   } else {
-    // add input operand
-    NeuronModel_addOperand(model, &xType);  // 0: x
+    NeuronModel_addOperand(model, &xType);  // Operand 0: x
     x_node = graph->Add(x_name, dims_x);
   }
   VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0]
           << ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":"
           << xType.dimensions[3];
 
+  VLOG(3) << "ksize:" << ksize[0] << ":" << ksize[1];
+
   NeuronOperandType int32Type;
   int32Type.type = NEURON_INT32;
   int32Type.dimensionCount = 0;
   std::vector<uint32_t> dims_int32 = {0};
 
   std::shared_ptr<Node> paddingL_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 1: padding left
+  NeuronModel_addOperand(model, &int32Type);  // Operand 1: padding left
   paddingL_node = graph->Add(x_name + "_padding_left", dims_int32);
 
   std::shared_ptr<Node> paddingR_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 2: padding right
+  NeuronModel_addOperand(model, &int32Type);  // Operand 2: padding right
   paddingR_node = graph->Add(x_name + "_padding_right", dims_int32);
 
   std::shared_ptr<Node> paddingT_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 3: padding top
+  NeuronModel_addOperand(model, &int32Type);  // Operand 3: padding top
   paddingT_node = graph->Add(x_name + "_padding_top", dims_int32);
 
   std::shared_ptr<Node> paddingB_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 4: padding bottom
+  NeuronModel_addOperand(model, &int32Type);  // Operand 4: padding bottom
   paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32);
 
   std::shared_ptr<Node> strideW_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 5: stride width
+  NeuronModel_addOperand(model, &int32Type);  // Operand 5: stride width
   strideW_node = graph->Add(x_name + "_stride_width", dims_int32);
 
   std::shared_ptr<Node> strideH_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 6: stride height
+  NeuronModel_addOperand(model, &int32Type);  // Operand 6: stride height
   strideH_node = graph->Add(x_name + "_stride_height", dims_int32);
 
   std::shared_ptr<Node> filterW_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 7: filter width
+  NeuronModel_addOperand(model, &int32Type);  // Operand 7: filter width
   filterW_node = graph->Add(x_name + "_filter_width", dims_int32);
 
   std::shared_ptr<Node> filterH_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 8: filter height
+  NeuronModel_addOperand(model, &int32Type);  // Operand 8: filter height
   filterH_node = graph->Add(x_name + "_filter_height", dims_int32);
 
   std::shared_ptr<Node> fuse_node = nullptr;
-  NeuronModel_addOperand(model, &int32Type);  // 9: fuse
-  fuse_node = graph->Add(x_name + "_fuse", dims_int32);
+  NeuronModel_addOperand(model, &int32Type);  // Operand 9: fuse
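+  // Key the node as x_name + "_pool_fuse" so it cannot collide with an
+  // existing x_name + "_fuse" node (Graph::Add treats redefinition as fatal).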
+  fuse_node = graph->Add(x_name + "_pool_fuse", dims_int32);
 
-  // Add out type
   // Add output tensor type
   NeuronOperandType outType;
   outType.type = NEURON_TENSOR_QUANT8_ASYMM;
@@ -176,10 +175,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (graph->Has(out_name)) {
     out_node = graph->Get(out_name);
   } else {
-    NeuronModel_addOperand(model, &outType);  // out
+    NeuronModel_addOperand(model, &outType);
     out_node = graph->Add(out_name, dims_out);
   }
-  VLOG(3) << "output_scale: " << x_scale
+  VLOG(3) << "output_scale: " << out_scale
           << ", outType: " << outType.dimensions[0] << ":"
           << outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
           << outType.dimensions[3];
@@ -201,19 +200,21 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
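+  // Note: Paddle orders strides and ksize as {height, width}.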
   // Add Stride
   int32_t stride_val[1];
-  stride_val[0] = strides[1];  // width
+  stride_val[0] = strides[1];  // Entry 1: width stride
   NeuronModel_setOperandValue(
       model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
-  stride_val[0] = strides[0];  // height
+  stride_val[0] = strides[0];  // Entry 0: height stride
   NeuronModel_setOperandValue(
       model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
 
   // Add filter
   int32_t filter_val[1];
-  filter_val[0] = global_pooling ? x_dims[3] : ksize[1];  // width
+  filter_val[0] =
+      global_pooling ? x_dims[3] : ksize[1];  // Entry 1: filter width
   NeuronModel_setOperandValue(
       model, filterW_node->index(), filter_val, sizeof(int32_t) * 1);
-  filter_val[0] = global_pooling ? x_dims[2] : ksize[0];  // height
+  filter_val[0] =
+      global_pooling ? x_dims[2] : ksize[0];  // Entry 0: filter height
   NeuronModel_setOperandValue(
       model, filterH_node->index(), filter_val, sizeof(int32_t) * 1);
diff --git a/lite/kernels/apu/bridges/softmax_op.cc b/lite/kernels/apu/bridges/softmax_op.cc
index 4b2a465cd6e48d9d387f0b2195b04728890601ca..177f778ea7dbfc77f389a76ed236a975a9cfe314 100644
--- a/lite/kernels/apu/bridges/softmax_op.cc
+++ b/lite/kernels/apu/bridges/softmax_op.cc
@@ -64,12 +64,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   xType.dimensions = &dims_x[0];
   std::shared_ptr<Node> x_node = nullptr;
   if (graph->Has(x_name)) {
-    // input operand already exist
     x_node = graph->Get(x_name);
     VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index();
   } else {
-    // add input operand
-    NeuronModel_addOperand(model, &xType);  // 0: input
+    NeuronModel_addOperand(model, &xType);  // Operand 0: input
     x_node = graph->Add(x_name, dims_x);
   }
   VLOG(3) << "input_scale size: " << input_scale
@@ -80,7 +78,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   NeuronOperandType betaType;
   betaType.type = NEURON_FLOAT32;
   betaType.dimensionCount = 0;
-  NeuronModel_addOperand(model, &betaType);  // 1: beta
+  NeuronModel_addOperand(model, &betaType);  // Operand 1: beta
   std::shared_ptr<Node> beta_node = nullptr;
   beta_node = graph->Add(x_name + "_beta", dims_int32);
 
@@ -88,7 +86,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   NeuronOperandType axisType;
   axisType.type = NEURON_INT32;
   axisType.dimensionCount = 0;
-  NeuronModel_addOperand(model, &axisType);  // 2: axis
+  NeuronModel_addOperand(model, &axisType);  // Operand 2: axis
   std::shared_ptr<Node> axis_node = nullptr;
   axis_node = graph->Add(x_name + "_axis", dims_int32);
 
@@ -99,7 +97,7 @@
   outType.zeroPoint = 128;
   outType.dimensionCount = x_dims.size();
   outType.dimensions = &dims_x[0];
-  NeuronModel_addOperand(model, &outType);  // 3: output
+  NeuronModel_addOperand(model, &outType);  // Operand 3: output
   std::shared_ptr<Node> out_node = nullptr;
   out_node = graph->Add(out_name, dims_x);
   VLOG(3) << "out_scale: " << out_scale;
@@ -112,8 +110,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   axis_val[0] = axis;
   NeuronModel_setOperandValue(
       model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
-  std::vector<uint32_t> addInIndex = {
-      x_node->index(), beta_node->index(), axis_node->index()};
+  std::vector<uint32_t> addInIndex = {x_node->index(),     // 0: input
+                                      beta_node->index(),  // 1: beta
+                                      axis_node->index()};  // 2: axis
   std::vector<uint32_t> addOutIndex = {out_node->index()};
   int neuron_errCode = NeuronModel_addOperation(model,
                                                 NEURON_SOFTMAX,
diff --git a/lite/kernels/apu/bridges/utility.cc b/lite/kernels/apu/bridges/utility.cc
index c91e81476e519a28ebf851f42f2916c9d7c38dd8..f9cd04b71805bc29a7da4450d1f9235c5cf5d64a 100644
--- a/lite/kernels/apu/bridges/utility.cc
+++ b/lite/kernels/apu/bridges/utility.cc
@@ -39,22 +39,43 @@ bool HasInputArg(const OpInfo* op_info,
     }
   }
 
-void insert_transpose_node(void* ctx,
-                           const std::string& input_name,
-                           const std::string& output_name,
-                           std::vector<uint32_t> input_shape,
-                           std::vector<uint32_t> output_shape,
-                           std::vector<int32_t> axis,
-                           float scale,
-                           int32_t zeroPoint) {
+int insert_requant_node(void* ctx,
+                        const std::string& input_name,
+                        const std::string& output_name,
+                        std::vector<uint32_t> input_shape,
+                        std::vector<uint32_t> output_shape,
+                        float scale_in,
+                        float scale_out,
+                        int32_t zeroPoint) {
   int neuron_errCode;
   auto graph = static_cast<Graph*>(ctx);
   auto model = graph->model();
 
+  uint32_t numDevices = 0;
+  CHECK_EQ(Neuron_getDeviceCount(&numDevices), NEURON_NO_ERROR);
+  CHECK_GT(numDevices, (uint32_t)0);
+
+  NeuronDevice* targetDevice = nullptr;
+
+  for (uint32_t i = 0; i < numDevices; ++i) {
+    NeuronDevice* device = nullptr;
+    Neuron_getDevice(i, &device);
+    const char* name;
+    NeuronDevice_getName(device, &name);
+    if (0 == strcmp(name, "mtk-dsp")) {
+      targetDevice = device;
+      break;
+    }
+  }
+  if (targetDevice == nullptr) {
+    LOG(FATAL) << "Failed to insert mtk_requant op!";
+    return -1;
+  }
+
   // Add input
   NeuronOperandType inType;
   inType.type = NEURON_TENSOR_QUANT8_ASYMM;
-  inType.scale = scale;
+  inType.scale = scale_in;
   inType.zeroPoint = zeroPoint;
   inType.dimensionCount = input_shape.size();
   inType.dimensions = &input_shape[0];
@@ -64,15 +85,81 @@ void insert_transpose_node(void* ctx,
     VLOG(3) << "Has " << input_name;
     input_node = graph->Get(input_name);
   } else {
-    neuron_errCode = NeuronModel_addOperand(model, &inType);  // input
+    neuron_errCode = NeuronModel_addOperand(model, &inType);
    if (NEURON_NO_ERROR != neuron_errCode) {
-      LOG(WARNING) << "Insert transpose op fail!";
-      return;
+      LOG(FATAL) << "Failed to insert mtk_requant op!";
+      return -1;
     }
     VLOG(3) << "Add " << input_name;
     input_node = graph->Add(input_name, input_shape);
   }
 
+  // Add output
+  NeuronOperandType outType;
+  outType.type = NEURON_TENSOR_QUANT8_ASYMM;
+  outType.scale = scale_out;
+  outType.zeroPoint = zeroPoint;
+  outType.dimensionCount = output_shape.size();
+  outType.dimensions = &output_shape[0];
+
+  NeuronModel_addOperand(model, &outType);
+  std::shared_ptr<Node> output_node = nullptr;
+  output_node = graph->Add(output_name, output_shape);
+
+  std::vector<uint32_t> addInIndex = {input_node->index()};
+
+  std::vector<uint32_t> addOutIndex = {output_node->index()};
+
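+  // MTK_REQUANTIZE is a MediaTek vendor extension op, so it is added with
+  // NeuronModel_addOperationExtension and pinned to the mtk-dsp device
+  // located above.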
+  neuron_errCode = NeuronModel_addOperationExtension(model,
+                                                     "MTK_REQUANTIZE",
+                                                     "mediatek",
+                                                     targetDevice,
+                                                     addInIndex.size(),
+                                                     &addInIndex[0],
+                                                     addOutIndex.size(),
+                                                     &addOutIndex[0]);
+  if (NEURON_NO_ERROR != neuron_errCode) {
+    LOG(FATAL) << "Failed to insert mtk_requant op!";
+    return -1;
+  }
+
+  return 0;
+}
+
+int insert_transpose_node(void* ctx,
+                          const std::string& input_name,
+                          const std::string& output_name,
+                          std::vector<uint32_t> input_shape,
+                          std::vector<uint32_t> output_shape,
+                          std::vector<int32_t> axis,
+                          float scale,
+                          int32_t zeroPoint) {
+  int neuron_errCode;
+  auto graph = static_cast<Graph*>(ctx);
+  auto model = graph->model();
+
+  // Add input
+  NeuronOperandType inType;
+  inType.type = NEURON_TENSOR_QUANT8_ASYMM;
+  inType.scale = scale;
+  inType.zeroPoint = zeroPoint;
+  inType.dimensionCount = input_shape.size();
+  inType.dimensions = &input_shape[0];
+
+  std::shared_ptr<Node> input_node = nullptr;
+  if (graph->Has(input_name)) {
+    VLOG(5) << "Has " << input_name;
+    input_node = graph->Get(input_name);
+  } else {
+    neuron_errCode = NeuronModel_addOperand(model, &inType);
+    if (NEURON_NO_ERROR != neuron_errCode) {
+      LOG(FATAL) << "Failed to insert transpose op!";
+      return -1;
+    }
+    VLOG(5) << "Add " << input_name;
+    input_node = graph->Add(input_name, input_shape);
+  }
+
   // Add perm
   NeuronOperandType permsType;
   permsType.type = NEURON_TENSOR_INT32;
@@ -80,22 +167,22 @@ void insert_transpose_node(void* ctx,
   uint32_t dims_perms[1] = {4};
   permsType.dimensions = dims_perms;
 
-  neuron_errCode = NeuronModel_addOperand(model, &permsType);  // perm
+  neuron_errCode = NeuronModel_addOperand(model, &permsType);
   if (NEURON_NO_ERROR != neuron_errCode) {
-    LOG(WARNING) << "Insert transpose op fail!";
-    return;
+    LOG(FATAL) << "Failed to insert transpose op!";
+    return -1;
   }
   std::shared_ptr<Node> perms_node = nullptr;
   perms_node = graph->Add(input_name + "_perms", {4});
 
-  VLOG(3) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":"
+  VLOG(5) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":"
           << axis[3];
-  //                              &axis[0], sizeof(int32_t) * axis.size());
+
   neuron_errCode = NeuronModel_setOperandValue(
       model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size());
   if (NEURON_NO_ERROR != neuron_errCode) {
-    LOG(WARNING) << "Insert transpose op fail!";
-    return;
+    LOG(FATAL) << "Failed to insert transpose op!";
+    return -1;
   }
 
   // Add output
@@ -106,7 +193,7 @@ void insert_transpose_node(void* ctx,
   outType.dimensionCount = output_shape.size();
   outType.dimensions = &output_shape[0];
 
-  NeuronModel_addOperand(model, &outType);  // output
+  NeuronModel_addOperand(model, &outType);
   std::shared_ptr<Node> output_node = nullptr;
   output_node = graph->Add(output_name, output_shape);
 
@@ -123,8 +210,10 @@ void insert_transpose_node(void* ctx,
                                      &addOutIndex[0]);
 
   if (NEURON_NO_ERROR != neuron_errCode) {
-    LOG(WARNING) << "Insert transpose op fail!";
+    LOG(FATAL) << "Failed to insert transpose op!";
   }
+
+  return 0;
 }
 
 void transpose(const int8_t* input_data,
@@ -135,9 +224,9 @@
   int new_index = -1;
   int dim[4] = {0};
   std::vector<uint32_t> shape = input_shape;
-  VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
+  VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
           << ":" << input_shape[3];
-  VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
+  VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
   for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
     for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
       for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
@@ -164,9 +253,9 @@ void transposeAsym(const int8_t* input_data,
   int new_index = -1;
   int dim[4] = {0};
   std::vector<uint32_t> shape = input_shape;
-  VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
+  VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
           << ":" << input_shape[3];
-  VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
+  VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
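+  // Scatter each int8 element to its permuted position as asymmetric uint8.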
axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { @@ -177,8 +266,8 @@ void transposeAsym(const int8_t* input_data, dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; - - output_data[new_index] = input_data[old_index] + 128; // per layer + // Per layer op is asym op and need to add 128 + output_data[new_index] = input_data[old_index] + 128; } } } diff --git a/lite/kernels/apu/bridges/utility.h b/lite/kernels/apu/bridges/utility.h old mode 100644 new mode 100755 index 01752d181964bfb0e19f4319b52727b1ab541ee7..ff9c75711c22cebc15f8b0f3b14d11dc8e6c62f1 --- a/lite/kernels/apu/bridges/utility.h +++ b/lite/kernels/apu/bridges/utility.h @@ -33,14 +33,23 @@ bool HasInputArg(const OpInfo* op_info, const Scope* scope, const std::string& argname); -void insert_transpose_node(void* ctx, - const std::string& input_name, - const std::string& output_name, - std::vector input_shape, - std::vector output_shape, - std::vector axis, - float scale, - int32_t zeroPoint); +int insert_requant_node(void* ctx, + const std::string& input_name, + const std::string& output_name, + std::vector input_shape, + std::vector output_shape, + float scale_in, + float scale_out, + int32_t zeroPoint); + +int insert_transpose_node(void* ctx, + const std::string& input_name, + const std::string& output_name, + std::vector input_shape, + std::vector output_shape, + std::vector axis, + float scale, + int32_t zeroPoint); void transpose(const int8_t* input_data, uint8_t* output_data, diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc old mode 100644 new mode 100755 index 698536743d3225aaf2ebd4e3a6a75ee3f3c1ef1f..5e86514478f421ece6642afdd0bfaab4025420bb --- a/lite/kernels/apu/subgraph_compute.cc +++ b/lite/kernels/apu/subgraph_compute.cc @@ -33,6 +33,14 @@ bool SubgraphEngine::BuildDeviceProgram() { BuildOriginProgram(); } + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + + auto start_time = GetCurrentUS(); + unsigned int version; Neuron_getVersion(&version); VLOG(3) << "Neuron Adapter version: " << version; @@ -108,18 +116,16 @@ bool SubgraphEngine::BuildDeviceProgram() { } VLOG(3) << "[APU] APU NIR model created!"; - auto GetCurrentUS = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; - }; - auto start_time = GetCurrentUS(); + VLOG(1) << "[APU] APU NIR model created, Create cost " + << GetCurrentUS() - start_time << " us"; + + start_time = GetCurrentUS(); compilation_ = lite::apu::Device::Global().Build(model_); if (compilation_ == nullptr) { LOG(WARNING) << "[APU] Build APU DLA model failed!"; return false; } - VLOG(3) << "[APU] APU DLA model created, Build cost " + VLOG(1) << "[APU] APU DLA model created, Build cost " << GetCurrentUS() - start_time << " us"; return true; } @@ -176,7 +182,7 @@ bool SubgraphEngine::LaunchDeviceProgram() { } } NeuronExecution_free(run); - VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; + VLOG(1) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; return true; }