Commit a5a63ad4 authored by M MyPandaShaoxiang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into fpga_patch

test=develop
......@@ -108,7 +108,7 @@ void yolobox(lite::Tensor* X,
auto anchors_data = anchors.data();
const float* X_data = X->data<float>();
float* ImgSize_data = ImgSize->mutable_data<float>();
int* ImgSize_data = ImgSize->mutable_data<int>();
float* Boxes_data = Boxes->mutable_data<float>();
......@@ -116,8 +116,8 @@ void yolobox(lite::Tensor* X,
float box[4];
for (int i = 0; i < n; i++) {
int img_height = static_cast<int>(ImgSize_data[2 * i]);
int img_width = static_cast<int>(ImgSize_data[2 * i + 1]);
int img_height = ImgSize_data[2 * i];
int img_width = ImgSize_data[2 * i + 1];
for (int j = 0; j < an_num; j++) {
for (int k = 0; k < h; k++) {
......
......@@ -45,7 +45,8 @@ inline ActivationType GetActivationType(const std::string &type) {
} else if (type == "identity" || type == "") {
return ActivationType::kIdentity;
}
PADDLE_ENFORCE(false, "Not support type %s", type);
LOG(ERROR) << "Not support type " << type;
// PADDLE_ENFORCE(false, "Not support type %s", type);
// PADDLE_THROW("Not support type %s.", type);
}
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/eigen.h"
#include "lite/fluid/lod.h"
// #include "lite/fluid/lod.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
......
# lite_cc_library(activation_compute_x86 SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_op)
add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops)
# lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps})
......@@ -18,13 +18,15 @@ add_kernel(reshape_compute_x86 X86 basic SRCS reshape_compute.cc DEPS ${lite_ker
# lite_cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
# lite_cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(uniform_random_compute_x86 SRCS uniform_random_compute.cc DEPS ${lite_kernel_deps} )
add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps} blas math_function sequence2batch gru_compute)
#add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps})
add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
# lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
# lite_cc_test(test_pool2d_compute_x86 SRCS pool_compute_test.cc DEPS pool_compute_x86)
# lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
# lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86)
# lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x86)
# lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
# lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
# lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
......@@ -49,4 +51,7 @@ lite_cc_test(test_sequence_pool_compute_x86 SRCS sequence_pool_compute_test.cc D
lite_cc_test(test_shape_compute_x86 SRCS shape_compute_test.cc DEPS shape_compute_x86)
lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86)
lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS activation_compute_x86)
lite_cc_test(test_sequence_expand_as_compute_x86 SRCS sequence_expand_as_compute_test.cc DEPS sequence_expand_as_compute_x86)
lite_cc_test(test_gru_compute_x86 SRCS gru_compute_test.cc DEPS gru_compute_x86)
lite_cc_test(test_matmul_compute_x86 SRCS matmul_compute_test.cc DEPS matmul_compute_x86)
......@@ -12,94 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/activation_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename Functor>
void Activate(const platform::CPUDeviceContext& context,
const framework::LoDTensor* X,
framework::LoDTensor* Out) {
using T = typename Functor::ELEMENT_TYPE;
auto* place = context.eigen_device();
auto x =
framework::EigenVector<T>::Flatten(paddle::operators::detail::Ref(X));
auto out =
framework::EigenVector<T>::Flatten(paddle::operators::detail::Ref(Out));
Functor()(*place, x, out);
}
template <typename Functor>
void ActivateGrad(const platform::CPUDeviceContext& context,
const framework::LoDTensor* X,
const framework::LoDTensor* Out,
const framework::LoDTensor* Out_grad,
framework::LoDTensor* X_grad) {
using T = typename Functor::ELEMENT_TYPE;
auto* place = context.eigen_device();
auto x =
framework::EigenVector<T>::Flatten(paddle::operators::detail::Ref(X));
auto out =
framework::EigenVector<T>::Flatten(paddle::operators::detail::Ref(Out));
auto x_grad = framework::EigenVector<T>::Flatten(
paddle::operators::detail::Ref(X_grad));
auto out_grad = framework::EigenVector<T>::Flatten(
paddle::operators::detail::Ref(Out_grad));
Functor()(*place, x, out, out_grad, x_grad);
}
template <typename T>
class SquareCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::ActivationParam>();
CHECK(context.x86_device_context());
param.Out->template mutable_data<T>();
Activate<paddle::operators::SquareFunctor<T>>(*context.x86_device_context(),
&param.X->raw_tensor(),
&param.Out->raw_tensor());
}
virtual ~SquareCompute() = default;
};
template <typename T>
class SquareGradCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationGradParam;
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::ActivationGradParam>();
CHECK(context.x86_device_context());
param.X_grad->template mutable_data<T>();
ActivateGrad<paddle::operators::SquareGradFunctor<T>>(
*context.x86_device_context(),
&param.X->raw_tensor(),
&param.Out->raw_tensor(),
&param.Out_grad->raw_tensor(),
&param.X_grad->raw_tensor());
}
virtual ~SquareGradCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
#include "lite/kernels/x86/activation_compute.h"
// float
REGISTER_LITE_KERNEL(square,
......@@ -112,16 +25,13 @@ REGISTER_LITE_KERNEL(square,
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
REGISTER_LITE_KERNEL(square_grad,
// float
REGISTER_LITE_KERNEL(relu,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SquareGradCompute<float>,
paddle::lite::kernels::x86::ReluCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput(paddle::framework::GradVarName("Out"),
{LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput(paddle::framework::GradVarName("X"),
{LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <utility>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/fluid/eigen.h"
#include "lite/operators/activation_ops.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
enum ActBwdOpFwdDeps {
kNoDeps = 0x00, // Do not need any forward input/output
kDepX = 0x01, // Only need forward input X
kDepOut = 0x02, // Only need forward output Out
// Never add kDepXOut, because Out can be always calculated
// by forward input X in backward part.
// FIXME(zjl): but in MKLDNN abs, X and Out are all needed...
// Developers should not rely on this enum value!
kDepXOut = 0x03
};
template <typename T>
struct BaseActivationFunctor {
using ELEMENT_TYPE = T;
using AttrPair = std::vector<std::pair<const char*, float*>>;
AttrPair GetAttrs() { return AttrPair(); }
/* NOTE(*): Out can reuse X's memory if X is not needed by its gradient.
For example, sigmoid's gradient does not involve x, so its output can
reuse the input memory; but abs's gradient does use x, so it cannot be
computed in place.
*/
bool Inplace() const { return false; }
};
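// Flattens X and Out into 1-D Eigen vectors and applies the element-wise
// Functor on the X86 Eigen device; returns false if X or Out is null.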
template <typename Functor>
bool Activate(const lite::Tensor* X, lite::Tensor* Out) {
using T = typename Functor::ELEMENT_TYPE;
auto place = lite::fluid::EigenDeviceType<TARGET(kX86)>();
CHECK_OR_FALSE(X)
CHECK_OR_FALSE(Out)
auto x = lite::fluid::EigenVector<T>::Flatten(*X);
auto out = lite::fluid::EigenVector<T>::Flatten(*Out);
Functor()(place, x, out);
return true;
}
// square(x) = x^2
template <typename T>
struct SquareFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.square();
}
};
template <typename T>
class SquareCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ActivationParam>();
param.Out->template mutable_data<T>();
Activate<SquareFunctor<T>>(param.X, param.Out);
}
virtual ~SquareCompute() = default;
};
// relu(x) = max(x, 0)
template <typename T>
struct ReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.cwiseMax(static_cast<T>(0));
}
};
template <typename T>
class ReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ActivationParam>();
param.Out->template mutable_data<T>();
Activate<ReluFunctor<T>>(param.X, param.Out);
}
virtual ~ReluCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/activation_compute.cc"
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(relu_x86, retrive_op) {
auto relu =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("relu");
ASSERT_FALSE(relu.empty());
ASSERT_TRUE(relu.front());
}
TEST(relu_x86, init) {
ReluCompute<float> relu;
ASSERT_EQ(relu.precision(), PRECISION(kFloat));
ASSERT_EQ(relu.target(), TARGET(kX86));
}
TEST(relu_x86, run_test) {
lite::Tensor x, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
int sign = i % 2 == 0 ? 1 : -1;
x_data[i] = static_cast<float>(i * sign);
}
ReluCompute<float> relu;
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<X86Context>();
relu.SetContext(std::move(ctx));
relu.SetParam(param);
relu.Run();
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/gru_compute.h"
DEFINE_int32(paddle_num_threads,
1,
"Number of threads for each paddle instance.");
REGISTER_LITE_KERNEL(gru,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::GRUCompute<float>,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("H0", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Weight", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Batch_gate", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Batch_reset_hidden_prev",
{LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Batch_hidden", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/backends/x86/math/blas.h"
#include "lite/backends/x86/math/detail/gru_cpu_kernel.h"
#include "lite/backends/x86/math/detail/gru_kernel.h"
#include "lite/backends/x86/math/gru_compute.h"
#include "lite/backends/x86/math/math_function.h"
#include "lite/backends/x86/math/sequence2batch.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/types.h"
#include "lite/fluid/eigen.h"
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
using Tensor = lite::Tensor;
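// Copies the rows of src into dst in the order given by index_lod; used to
// reorder the initial state h0 to match the batch layout produced by
// LoDTensor2BatchFunctor.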
template <typename T>
inline void ReorderInitState(const lite::Context<TARGET(kX86)>& context,
const Tensor& src,
const std::vector<uint64_t>& index_lod,
Tensor* dst,
bool indexed_src) {
lite::x86::math::CopyMatrixRowsFunctor<TARGET(kX86), T> row_shuffle;
dst->Resize(src.dims());
dst->mutable_data<T>();
row_shuffle(context, src, index_lod, dst, indexed_src);
}
template <typename T>
class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::GRUParam>();
bool origin_mode = param.origin_mode;
bool is_reverse = param.is_reverse;
auto* input = param.input;
auto* h0 = param.h0;
auto* weight = param.weight;
const T* weight_data = weight->data<T>();
auto* bias = param.bias;
auto* batch_gate = param.batch_gate;
batch_gate->mutable_data<T>();
auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev;
batch_reset_hidden_prev->mutable_data<T>();
auto* batch_hidden = param.batch_hidden;
batch_hidden->mutable_data<T>();
auto* hidden = param.hidden;
hidden->mutable_data<T>();
auto hidden_dims = hidden->dims();
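// Reorder the LoD (sequence) input into batch form so that all sequences can
// be advanced one time step at a time.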
lite::x86::math::LoDTensor2BatchFunctor<TARGET(kX86), T> to_batch;
to_batch(context, *input, batch_gate, true, is_reverse);
if (bias) {
lite::x86::math::RowwiseAdd<TARGET(kX86), T> add_bias;
add_bias(context, *batch_gate, *bias, batch_gate);
}
int frame_size = hidden_dims[1];
lite::x86::math::GRUMetaValue<T> gru_value;
gru_value.gate_weight = const_cast<T*>(weight_data);
gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0;
std::vector<size_t> order(batch_gate->lod()[2]);
if (h0) {
// Since the batch computation for GRU reorders the input sequences
// according to their length, the initial state h0 also needs to be
// reordered accordingly.
ReorderInitState<T>(context, *h0, order, &ordered_h0, true);
gru_value.prev_out_value = ordered_h0.mutable_data<T>();
} else {
gru_value.prev_out_value = nullptr;
}
auto batch_starts = batch_gate->lod()[0];
size_t seq_len = batch_starts.size() - 1;
auto active_node =
lite::x86::math::detail::GetActivationType(param.activation);
auto active_gate =
lite::x86::math::detail::GetActivationType(param.gate_activation);
#ifdef PADDLE_WITH_MKLML
// use MKL packed to speedup GEMM
if (FLAGS_paddle_num_threads >= 4) {
auto blas = lite::x86::math::GetBlas<TARGET(kX86), T>(context);
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix,
1 /*height of C*/,
frame_size * 2 /*width of weight*/,
frame_size /*height of height*/);
CHECK(packed_gate);
blas.GEMM_PACK(CblasBMatrix,
CblasNoTrans,
1 /*cur bs?*/,
frame_size * 2,
frame_size,
T(1.0),
gru_value.gate_weight,
frame_size * 2,
packed_gate);
T* packed_state = blas.GEMM_ALLOC(CblasBMatrix,
1 /*height of C*/,
frame_size /*width of weight*/,
frame_size /*height of height*/);
CHECK(packed_state);
blas.GEMM_PACK(CblasBMatrix,
CblasNoTrans,
1 /*cur bs?*/,
frame_size,
frame_size,
T(1.0),
gru_value.state_weight,
frame_size,
packed_state);
for (size_t n = 0; n < seq_len; n++) {
int64_t bstart = static_cast<int64_t>(batch_starts[n]);
int64_t bend = static_cast<int64_t>(batch_starts[n + 1]);
int64_t cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice<T>(bstart, bend);
Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice<T>(bstart, bend);
Tensor hidden_t = batch_hidden->Slice<T>(bstart, bend);
gru_value.output_value = hidden_t.mutable_data<T>();
gru_value.gate_value = gate_t.mutable_data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.mutable_data<T>();
if (gru_value.prev_out_value) {
blas.GEMM_COMPUTE(CblasNoTrans,
CblasPacked,
cur_batch_size,
frame_size * 2,
frame_size,
gru_value.prev_out_value,
frame_size,
packed_gate,
frame_size * 2,
T(1),
gru_value.gate_value,
frame_size * 3);
}
lite::x86::math::detail::forward_final_output(
lite::x86::math::detail::forward::gru_finalOutput<T>(),
gru_value,
frame_size,
cur_batch_size,
active_node,
origin_mode);
gru_value.prev_out_value = gru_value.output_value;
}
blas.GEMM_FREE(packed_gate);
blas.GEMM_FREE(packed_state);
} else {
#endif
for (size_t n = 0; n < seq_len; n++) {
int64_t bstart = static_cast<int64_t>(batch_starts[n]);
int64_t bend = static_cast<int64_t>(batch_starts[n + 1]);
int64_t cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice<T>(bstart, bend);
Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice<T>(bstart, bend);
Tensor hidden_t = batch_hidden->Slice<T>(bstart, bend);
gru_value.output_value = hidden_t.mutable_data<T>();
gru_value.gate_value = gate_t.mutable_data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.mutable_data<T>();
lite::x86::math::GRUUnitFunctor<TARGET(kX86), T>::compute(
context,
gru_value,
frame_size,
cur_batch_size,
active_node,
active_gate,
origin_mode);
gru_value.prev_out_value = gru_value.output_value;
}
#ifdef PADDLE_WITH_MKLML
}
#endif
lite::x86::math::Batch2LoDTensorFunctor<TARGET(kX86), T> to_seq;
batch_hidden->set_lod(batch_gate->lod());
to_seq(context, *batch_hidden, hidden);
}
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/gru_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(gru_x86, retrive_op) {
auto gru =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("gru");
ASSERT_FALSE(gru.empty());
ASSERT_TRUE(gru.front());
}
TEST(gru_x86, init) {
GRUCompute<float> gru;
ASSERT_EQ(gru.precision(), PRECISION(kFloat));
ASSERT_EQ(gru.target(), TARGET(kX86));
}
TEST(gru_x86, run_test) {
lite::Tensor input, h0, weight, bias;
lite::Tensor batch_gate, batch_reset_hidden_prev, batch_hidden, hidden;
constexpr int batch_size = 9;
std::vector<int64_t> input_shape{batch_size, 15};
input.Resize(lite::DDim(input_shape));
std::vector<int64_t> weight_shape{5, 15};
weight.Resize(lite::DDim(weight_shape));
std::vector<int64_t> h0_shape{3, 5};
h0.Resize(lite::DDim(h0_shape));
std::vector<int64_t> bias_shape{1, 15};
bias.Resize(lite::DDim(bias_shape));
std::vector<int64_t> batch_gate_shape{batch_size, 15};
batch_gate.Resize(lite::DDim(batch_gate_shape));
std::vector<int64_t> batch_reset_hidden_prev_shape{batch_size, 5};
batch_reset_hidden_prev.Resize(lite::DDim(batch_reset_hidden_prev_shape));
std::vector<int64_t> batch_hidden_shape{batch_size, 5};
batch_hidden.Resize(lite::DDim(batch_hidden_shape));
std::vector<int64_t> hidden_shape{batch_size, 5};
hidden.Resize(lite::DDim(hidden_shape));
std::vector<std::vector<uint64_t>> lod{{0, 2, 6, 9}};
input.set_lod(lod);
auto input_data = input.mutable_data<float>();
auto weight_data = weight.mutable_data<float>();
auto h0_data = h0.mutable_data<float>();
auto bias_data = bias.mutable_data<float>();
for (int64_t i = 0; i < input.dims().production(); i++) {
input_data[i] = static_cast<float>(0);
}
for (int64_t i = 0; i < weight.dims().production(); i++) {
weight_data[i] = static_cast<float>(0);
}
for (int64_t i = 0; i < h0.dims().production(); i++) {
h0_data[i] = static_cast<float>(0);
}
for (int64_t i = 0; i < bias.dims().production(); i++) {
bias_data[i] = static_cast<float>(0);
}
// ReluCompute relu;
GRUCompute<float> gru;
operators::GRUParam param;
param.input = &input;
param.h0 = &h0;
param.weight = &weight;
param.bias = &bias;
param.batch_gate = &batch_gate;
param.batch_reset_hidden_prev = &batch_reset_hidden_prev;
param.batch_hidden = &batch_hidden;
param.hidden = &hidden;
param.gate_activation = "sigmoid";
param.activation = "tanh";
param.is_reverse = false;
param.origin_mode = false;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<X86Context>();
gru.SetContext(std::move(ctx));
gru.SetParam(param);
gru.Run();
auto batch_gate_data = batch_gate.mutable_data<float>();
auto batch_reset_hidden_prev_data =
batch_reset_hidden_prev.mutable_data<float>();
auto batch_hidden_data = batch_hidden.mutable_data<float>();
auto hidden_data = hidden.mutable_data<float>();
std::vector<float> batch_gate_out{
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0};
std::vector<float> batch_reset_hidden_prev_out{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
std::vector<float> batch_hidden_out{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
std::vector<float> hidden_out{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
LOG(INFO) << "output: ";
for (int i = 0; i < batch_gate.dims().production(); i++) {
LOG(INFO) << batch_gate_data[i];
EXPECT_NEAR(batch_gate_data[i], batch_gate_out[i], 1e-3);
}
for (int i = 0; i < batch_reset_hidden_prev.dims().production(); i++) {
LOG(INFO) << batch_reset_hidden_prev_data[i];
EXPECT_NEAR(
batch_reset_hidden_prev_data[i], batch_reset_hidden_prev_out[i], 1e-3);
}
for (int i = 0; i < batch_hidden.dims().production(); i++) {
LOG(INFO) << batch_hidden_data[i];
EXPECT_NEAR(batch_hidden_data[i], batch_hidden_out[i], 1e-3);
}
for (int i = 0; i < hidden.dims().production(); i++) {
LOG(INFO) << hidden_data[i];
EXPECT_NEAR(hidden_data[i], hidden_out[i], 1e-3);
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(gru, kX86, kFloat, kNCHW, def);
......@@ -12,11 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/relu_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/x86/activation_compute.h"
namespace paddle {
namespace lite {
......@@ -64,6 +64,8 @@ TEST(relu_x86, run_test) {
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
int sign = i % 2 == 0 ? 1 : 0;
ASSERT_EQ(out_data[i], i * sign);
}
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/sequence_expand_as_compute.h"
REGISTER_LITE_KERNEL(sequence_expand_as,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SequenceExpandAsCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/types.h"
#include "lite/fluid/eigen.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
using Tensor = lite::Tensor;
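// Expands each row of x according to ref_lod: row i is copied
// (ref_lod[i + 1] - ref_lod[i]) times into the corresponding rows of out.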
template <typename T>
struct SequenceExpandFunctor {
void operator()(const Tensor &x,
const std::vector<size_t> &ref_lod, /*expand referenced lod*/
Tensor *out) {
int64_t height = x.dims()[0];
int64_t width = x.data_size() / height;
const T *in_data = x.data<T>();
T *out_data = out->mutable_data<T, T>();
for (int h_id = 0; h_id < height; ++h_id) {
size_t span = ref_lod[h_id + 1] - ref_lod[h_id];
if (span == 0) continue;
const T *src = in_data + h_id * width;
for (int64_t w_id = 0; w_id < width; ++w_id) {
T ele = src[w_id];
size_t offset = ref_lod[h_id] * width;
for (size_t k = 0; k < span; ++k) {
out_data[offset + k * width + w_id] = ele;
}
}
}
}
};
template <typename T>
class SequenceExpandAsCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
void Run() override {
auto &param = *param_.get_mutable<operators::SequenceExpandAsParam>();
auto *x = param.x;
auto *y = param.y;
auto *out = param.out;
auto &y_lod = y->lod();
CHECK_EQ(y_lod.size(), 1);
CHECK_GT(y_lod[0].size(), 1);
out->mutable_data<T, T>();
SequenceExpandFunctor<T> seq_expand_functor;
seq_expand_functor(*x, y_lod[0], out);
}
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/sequence_expand_as_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(sequence_expand_as_x86, retrive_op) {
auto sequence_expand_as =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"sequence_expand_as");
ASSERT_FALSE(sequence_expand_as.empty());
ASSERT_TRUE(sequence_expand_as.front());
}
TEST(sequence_expand_as_x86, init) {
SequenceExpandAsCompute<float> sequence_expand_as;
ASSERT_EQ(sequence_expand_as.precision(), PRECISION(kFloat));
ASSERT_EQ(sequence_expand_as.target(), TARGET(kX86));
}
TEST(sequence_expand_as_x86, run_test) {
lite::Tensor x, y, out;
std::vector<int64_t> x_shape{4, 1};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> y_shape{1, 5};
y.Resize(lite::DDim(y_shape));
std::vector<int64_t> out_shape{8, 1};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto y_data = y.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < y.dims().production(); i++) {
y_data[i] = static_cast<float>(i);
}
std::vector<std::vector<uint64_t>> lod{{0, 3, 6, 7, 8}};
y.set_lod(lod);
// MulCompute mul;
SequenceExpandAsCompute<float> sequence_expand_as;
operators::SequenceExpandAsParam param;
param.x = &x;
param.y = &y;
param.out = &out;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<X86Context>();
sequence_expand_as.SetContext(std::move(ctx));
sequence_expand_as.SetParam(param);
sequence_expand_as.Run();
auto out_data = out.mutable_data<float>();
int index = 1;
int lod_sum = lod[0][index];
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
if (i >= lod_sum) {
index++;
lod_sum = lod[0][index];
}
ASSERT_EQ(out_data[i], x_data[index - 1]);
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(sequence_expand_as, kX86, kFloat, kNCHW, def);
......@@ -71,6 +71,7 @@ add_operator(roi_align_op basic SRCS roi_align_op.cc DEPS ${op_DEPS})
add_operator(box_clip_op basic SRCS box_clip_op.cc DEPS ${op_DEPS})
add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS})
add_operator(fake_quantize_range_abs_max_op basic SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS})
add_operator(sequence_expand_as_op_lite basic SRCS sequence_expand_as_op.cc DEPS ${op_DEPS})
add_operator(range_op basic SRCS range_op.cc DEPS ${op_DEPS})
add_operator(assign_value_op basic SRCS assign_value_op.cc DEPS ${op_DEPS})
......
......@@ -682,6 +682,12 @@ struct SequenceExpandParam {
int ref_level{-1};
};
struct SequenceExpandAsParam {
const lite::Tensor* x{nullptr};
const lite::Tensor* y{nullptr};
lite::Tensor* out{nullptr};
};
struct ReduceMaxParam {
const lite::Tensor* X{};
lite::Tensor* Out{};
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/sequence_expand_as_op.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool SequenceExpandAsOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.x)
CHECK_OR_FALSE(param_.y)
CHECK_OR_FALSE(param_.out)
auto x_dims = param_.x->dims();
CHECK_EQ_OR_FALSE(x_dims.size(), 2)
auto y_lod = param_.y->lod();
CHECK_EQ_OR_FALSE(y_lod.size(), 1)
CHECK_EQ_OR_FALSE(static_cast<size_t>(x_dims[0]), y_lod[0].size() - 1)
return true;
}
bool SequenceExpandAsOpLite::InferShape() const {
auto x_dims = param_.x->dims();
auto y_lod = param_.y->lod();
auto out_dims = x_dims;
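// When y has a non-trivial level-0 LoD, the output's first dimension is the
// sum of the span lengths in that LoD; otherwise it falls back to x's first
// dimension.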
int64_t out_first_dim = 0;
if (y_lod[0].size() <= 1) {
out_first_dim = x_dims[0];
} else {
for (size_t i = 1; i < y_lod[0].size(); ++i) {
out_first_dim += (y_lod[0][i] - y_lod[0][i - 1]);
}
}
out_dims[0] = out_first_dim;
param_.out->Resize(out_dims);
param_.out->set_lod(y_lod);
return true;
}
bool SequenceExpandAsOpLite::AttachImpl(const cpp::OpDesc &op_desc,
lite::Scope *scope) {
auto x = op_desc.Input("X").front();
auto y = op_desc.Input("Y").front();
auto out = op_desc.Output("Out").front();
param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
param_.y = scope->FindVar(y)->GetMutable<lite::Tensor>();
param_.out = scope->FindVar(out)->GetMutable<lite::Tensor>();
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(sequence_expand_as,
paddle::lite::operators::SequenceExpandAsOpLite)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class SequenceExpandAsOpLite : public OpLite {
public:
SequenceExpandAsOpLite() {}
explicit SequenceExpandAsOpLite(const std::string &op_type)
: OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "sequence_expand_as"; }
private:
mutable SequenceExpandAsParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -31,6 +31,19 @@ bool YoloBoxOp::CheckShape() const {
CHECK_OR_FALSE(ImgSize);
CHECK_OR_FALSE(Boxes);
CHECK_OR_FALSE(Scores);
auto dim_x = X->dims();
auto dim_imgsize = ImgSize->dims();
std::vector<int> anchors = param_.anchors;
int anchor_num = anchors.size() / 2;
auto class_num = param_.class_num;
CHECK_OR_FALSE(dim_x.size() == 4);
CHECK_OR_FALSE(dim_x[1] == anchor_num * (5 + class_num));
CHECK_OR_FALSE(dim_imgsize[0] == dim_x[0]);
CHECK_OR_FALSE(dim_imgsize[1] == 2);
CHECK_OR_FALSE(anchors.size() > 0 && anchors.size() % 2 == 0);
CHECK_OR_FALSE(class_num > 0);
return true;
}
bool YoloBoxOp::InferShape() const {
......
......@@ -101,7 +101,7 @@ class YoloBoxComputeTester : public arena::TestCase {
float conf_thresh_ = 0.f;
int downsample_ratio_ = 0;
DDim _dims0_{{1, 2, 2, 1}};
DDim _dims0_{{1, 255, 13, 13}};
DDim _dims1_{{1, 2}};
public:
......@@ -115,7 +115,10 @@ class YoloBoxComputeTester : public arena::TestCase {
anchors_(anchors),
class_num_(class_num),
conf_thresh_(conf_thresh),
downsample_ratio_(downsample_ratio) {}
downsample_ratio_(downsample_ratio) {
int anchor_num = anchors_.size() / 2;
_dims0_[1] = anchor_num * (5 + class_num);
}
void RunBaseline(Scope* scope) override {
const lite::Tensor* X = scope->FindTensor(input0_);
......@@ -149,14 +152,14 @@ class YoloBoxComputeTester : public arena::TestCase {
auto anchors_data = anchors.data();
const float* in_data = in->data<float>();
const float* imgsize_data = imgsize->data<float>();
const int* imgsize_data = imgsize->data<int>();
float* boxes_data = boxes->mutable_data<float>();
float* scores_data = scores->mutable_data<float>();
float box[4];
for (int i = 0; i < n; i++) {
int img_height = static_cast<int>(imgsize_data[2 * i]);
int img_width = static_cast<int>(imgsize_data[2 * i + 1]);
int img_height = imgsize_data[2 * i];
int img_width = imgsize_data[2 * i + 1];
for (int j = 0; j < an_num; j++) {
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
......@@ -218,7 +221,7 @@ class YoloBoxComputeTester : public arena::TestCase {
}
std::vector<int> data1(_dims1_.production());
for (int i = 0; i < _dims1_.production(); i++) {
data1[i] = i + 8;
data1[i] = 608;
}
SetCommonTensor(input0_, _dims0_, data0.data());
SetCommonTensor(input1_, _dims1_, data1.data());
......@@ -227,10 +230,9 @@ class YoloBoxComputeTester : public arena::TestCase {
void test_yolobox(Place place) {
for (int class_num : {1, 2, 3, 4}) {
for (float conf_thresh : {0.5, 0.2, 0.7}) {
for (int downsample_ratio : {1, 2, 3}) {
std::vector<int> anchor({1, 2, 3, 4});
for (float conf_thresh : {0.01, 0.2, 0.7}) {
for (int downsample_ratio : {16, 32}) {
std::vector<int> anchor({10, 13, 16, 30});
std::unique_ptr<arena::TestCase> tester(new YoloBoxComputeTester(
place, "def", anchor, class_num, conf_thresh, downsample_ratio));
arena::Arena arena(std::move(tester), place, 2e-5);
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "common/type_define.h"
#include "cstring"
#include "io/paddle_inference_api.h"
......
......@@ -18,6 +18,7 @@
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/type_define.h"
#include "framework/tensor.h"
#ifdef PADDLE_MOBILE_FPGA
#include <fpga/common/fpga_common.h>
......@@ -35,7 +36,9 @@ PaddleMobilePredictor<Device, T>::PaddleMobilePredictor(
template <typename Device, typename T>
bool PaddleMobilePredictor<Device, T>::Init(const PaddleMobileConfig &config) {
paddle_mobile_.reset(new PaddleMobile<Device, T>());
PaddleMobileConfigInternal configInternal;
configInternal.load_when_predict = config.load_when_predict;
paddle_mobile_.reset(new PaddleMobile<Device, T>(configInternal));
#ifdef PADDLE_MOBILE_CL
paddle_mobile_->SetCLPath(config.cl_path);
#endif
......@@ -135,14 +138,14 @@ bool PaddleMobilePredictor<Device, T>::Run(
void ConvertPaddleTensors(const PaddleTensor &src, framework::Tensor *des) {
des->Resize(framework::make_ddim(src.shape));
des->external_data = src.data.data();
des->set_type(src.dtypeid);
des->set_type(static_cast<kTypeId_t>(static_cast<int>(src.dtypeid)));
des->layout =
src.layout == LAYOUT_HWC ? framework::LAYOUT_HWC : framework::LAYOUT_CHW;
}
void ConvertTensors(const framework::Tensor &src, PaddleTensor *des) {
des->shape = framework::vectorize2int(src.dims());
des->dtypeid = src.type();
des->dtypeid = static_cast<PaddlekTypeId_t>(static_cast<int>(src.type()));
des->layout = src.layout == framework::LAYOUT_HWC ? LAYOUT_HWC : LAYOUT_CHW;
auto num = src.numel();
......@@ -164,7 +167,8 @@ void PaddleMobilePredictor<Device, T>::FeedPaddleTensors(
auto num = inputs.size();
std::vector<framework::Tensor> tensors(num, framework::Tensor());
for (int i = 0; i < num; i++) {
if (inputs[i].dtypeid == type_id<int8_t>().hash_code()) {
if (static_cast<kTypeId_t>(static_cast<int>(inputs[i].dtypeid)) ==
type_id<int8_t>().hash_code()) {
tensors[i].init(type_id<int8_t>().hash_code());
} else {
tensors[i].init(type_id<float>().hash_code());
......
......@@ -25,7 +25,6 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>
#include "common/type_define.h"
namespace paddle_mobile {
......@@ -86,6 +85,56 @@ class PaddleBuf {
bool memory_owned_{true};
};
typedef enum {
paddle_void = 0,
paddle_float,
paddle_int,
paddle_uint16_t,
paddle_double,
paddle_int64_t,
paddle_size_t,
paddle_int16_t,
paddle_int8_t,
paddle_uint8_t,
paddle_bool,
paddle_string,
paddle_floats = 100,
paddle_ints,
paddle_int64_ts,
paddle_size_ts,
paddle_bools,
paddle_strings,
paddle_const_float = 200,
paddle_const_int,
paddle_block = 300,
paddle_tensor,
paddle_lod_tensor,
paddle_blocks,
paddle_tensors,
paddle_lod_tensors,
paddle_p_block = 400,
paddle_p_tensor,
paddle_p_lod_tensor,
paddle_p_blocks,
paddle_p_tensors,
paddle_p_lod_tensors,
paddle_scopes = 500,
paddle_selected_rows,
paddle_dim0 = 600,
paddle_dim1,
paddle_dim2,
paddle_dim3,
paddle_dim4,
paddle_dim5,
paddle_dim6,
paddle_dim7,
paddle_dim8,
paddle_dim9,
#ifdef PADDLE_MOBILE_CL
paddle_cl_image,
#endif
} PaddlekTypeId_t;
struct PaddleTensor {
PaddleTensor() = default;
std::string name; // variable name.
......@@ -93,7 +142,7 @@ struct PaddleTensor {
std::vector<int> lod;
PaddleBuf data; // blob of data.
PaddleDType dtype;
kTypeId_t dtypeid;
PaddlekTypeId_t dtypeid;
LayoutType layout;
};
......@@ -166,6 +215,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config {
bool quantification = false;
bool lod_mode = false;
int thread_num = 1;
bool load_when_predict = false;
std::string cl_path;
struct PaddleModelMemoryPack memory_pack;
};
......
......@@ -61,7 +61,7 @@ build_for_android() {
elif [ "${PLATFORM}" = "arm-v8a" ]; then
ABI="arm64-v8a"
ARM_PLATFORM="V8"
CXX_FLAGS="-march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog"
CXX_FLAGS="-march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog -fuse-ld=gold"
else
echo "unknown platform!"
exit -1
......