diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 6ef44fb127afbe56207136116fbc09e933b1c7f0..d60fdd90e2a2a438a461a2d6fd7462ccfe5576d8 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1181,9 +1181,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     }
     BuildPtenKernelContext(*runtime_ctx, dev_ctx);
     (*pt_kernel_)(pt_kernel_context_.get());
-
     WriteBackToOutputs(runtime_ctx);
-
     pt_kernel_context_->ClearData();
   } else {
     (*kernel_func_)(
@@ -1814,45 +1812,31 @@ void OperatorWithKernel::BuildPtenKernelContext(
     size_t start_idx =
         (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second);
     size_t end_idx = start_idx + ins_vector.size();
-
-    // The current size of input/output in pt_kernel_context_ is at least equal
-    // the start_idx. For the reason of reusing the allocted of inputs or
-    // outputs in pt_kernel_context_, the current size of input/output can be
-    // greater then the index of which the tensort wanted to set to, so it will
-    // use ReMakePtenDenseTensorFromVar to make pten tensor.
-    if (pt_kernel_context_->InputsSize() == start_idx) {
-      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
-      for (auto* var : ins_vector) {
-        tmp_inputs.emplace_back(
-            experimental::MakePtenTensorBaseFromVar(*var, in_def));
-      }
-      pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs));
-    } else if (pt_kernel_context_->InputsSize() > start_idx) {
-      size_t input_size = pt_kernel_context_->InputsSize();
-      for (size_t j = 0; j < ins_vector.size(); ++j) {
-        if (input_size > start_idx + j) {
+    auto current_vector_size = pt_kernel_context_->InputsSize();
+
+    // If the memory needed is less than the current memory allocated, we will
+    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
+    // Otherwise, we will create new storage.
+    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
+      if (current_vector_size > start_idx + offset) {
+        auto& input_ptr =
+            pt_kernel_context_->MutableInputPtrAt(start_idx + offset);
+        if (input_ptr == nullptr) {
+          input_ptr = experimental::MakePtenTensorBaseFromVar(
+              *ins_vector[offset], in_def);
+        } else {
           experimental::ReMakePtenDenseTensorFromVar(
-              *ins_vector[j], in_def,
+              *ins_vector[offset], in_def,
               pt_kernel_context_->MutableInputAt<pten::DenseTensor>(start_idx +
-                                                                    j));
-          // TODO(chentianyu03): When multi input kernel, open this code
-          /*
-        } else {
-          pt_kernel_context_->EmplaceBackInputWithoutSetRange(
-              experimental::MakePtenTensorBaseFromVar(*ins_vector[j],
-                                                      in_def));
-          */
+                                                                    offset));
         }
+      } else {
+        pt_kernel_context_->EmplaceBackInputWithoutSetRange(
+            experimental::MakePtenTensorBaseFromVar(*ins_vector[offset],
+                                                    in_def));
       }
-      pt_kernel_context_->MutableInputRangeAt(i) =
-          std::make_pair(start_idx, end_idx);
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Error start index when trying to set new tensor to inputs, start "
-          "index is `%d`, but current pt_kernel_context_.inputs.size() is "
-          "`%d`.",
-          start_idx, pt_kernel_context_->InputsSize()));
     }
+    pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
@@ -1862,46 +1846,25 @@ void OperatorWithKernel::BuildPtenKernelContext(
     size_t start_idx =
         (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second);
     size_t end_idx = start_idx + outs_vector.size();
-
-    // The current size of input/output in pt_kernel_context_ is at least equal
-    // the start_idx. For the reason of reusing the allocted of inputs or
-    // outputs in pt_kernel_context_, the current size of input/output can be
-    // greater then the index of which the tensort wanted to set to, so it will
-    // use ReMakePtenDenseTensorFromVar to make pten tensor.
-    if (pt_kernel_context_->OutputsSize() == start_idx) {
-      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
-      for (auto* var : outs_vector) {
-        tmp_outputs.emplace_back(
-            experimental::MakePtenTensorBaseFromVar(var, out_def));
-      }
-      pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs));
-    } else if (pt_kernel_context_->OutputsSize() > start_idx) {
-      size_t output_size = pt_kernel_context_->OutputsSize();
-      for (size_t j = 0; j < outs_vector.size(); ++j) {
-        if (output_size > start_idx + j) {
-          experimental::ReMakePtenDenseTensorFromVar(
-              outs_vector[j], out_def,
-              pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
-                                                                     j));
-
-          // TODO(chentianyu03): When multi output kernel, open this code
-          /*
-        } else {
-          pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
-              experimental::MakePtenTensorBaseFromVar(outs_vector[j],
-                                                      out_def));
-          */
-        }
+    auto current_vector_size = pt_kernel_context_->OutputsSize();
+
+    // If the memory needed is less than the current memory allocated, we will
+    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
+    // Otherwise, we will create new storage.
+    for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
+      if (current_vector_size > start_idx + offset) {
+        experimental::ReMakePtenDenseTensorFromVar(
+            outs_vector[offset], out_def,
+            pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
+                                                                   offset));
+      } else {
+        pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
+            experimental::MakePtenTensorBaseFromVar(outs_vector[offset],
+                                                    out_def));
       }
-      pt_kernel_context_->MutableOutputRangeAt(i) =
-          std::make_pair(start_idx, end_idx);
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Error start index when trying to set new tensor to inputs, start "
-          "index is `%d`, but current pt_kernel_context_.outputs.size() is "
-          "`%d`.",
-          start_idx, pt_kernel_context_->OutputsSize()));
     }
+    pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx),
+                                          i);
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 604f9d2be9e487237789243e0a44486af7a75918..8e61b7d2eed880d58c6bdd832f9e95ff4bff6d2c 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -299,44 +299,28 @@ static void BuildDygraphPtenKernelContext(
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
     size_t end_idx = start_idx + ins_vector.size();
-
-    // The current size of input/output in pt_kernel_context_ is at least equal
-    // the start_idx. For the reason of reusing the allocted of inputs or
-    // outputs in pt_kernel_context_, the current size of input/output can be
-    // greater then the index of which the tensort wanted to set to, so it will
-    // use ReMakePtenDenseTensorFromVar to make pten tensor.
-    if (kernel_ctx->InputsSize() == start_idx) {
-      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
-      for (const auto& var : ins_vector) {
-        const auto& variable = var->Var();
-        tmp_inputs.emplace_back(
-            experimental::MakePtenTensorBaseFromVar(variable, in_def));
-      }
-      kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs));
-    } else if (kernel_ctx->InputsSize() > start_idx) {
-      size_t input_size = kernel_ctx->InputsSize();
-      for (size_t j = 0; j < ins_vector.size(); ++j) {
-        if (input_size > start_idx + j) {
+    auto current_vector_size = kernel_ctx->InputsSize();
+
+    // If the memory needed is less than the current memory allocated, we will
+    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
+    // Otherwise, we will create new storage.
+    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
+      const auto& variable = ins_vector[offset]->Var();
+      if (current_vector_size > start_idx + offset) {
+        auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset);
+        if (input_ptr == nullptr) {
+          input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def);
+        } else {
           experimental::ReMakePtenDenseTensorFromVar(
-              ins_vector[j]->Var(), in_def,
-              kernel_ctx->MutableInputAt<pten::DenseTensor>(start_idx + j));
-          // TODO(chentianyu03): When multi input kernel, open this code
-          /*
-        } else {
-          kernel_ctx->EmplaceBackInputWithoutSetRange(
-              experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(),
-                                                      in_def));
-          */
+              variable, in_def, kernel_ctx->MutableInputAt<pten::DenseTensor>(
+                                    start_idx + offset));
         }
+      } else {
+        kernel_ctx->EmplaceBackInputWithoutSetRange(
+            experimental::MakePtenTensorBaseFromVar(variable, in_def));
       }
-      kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx);
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Error start index when trying to set new tensor to inputs, start "
-          "index is `%d`, but current pt_kernel_context_.inputs.size() is "
-          "`%d`.",
-          start_idx, kernel_ctx->InputsSize()));
     }
+    kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
@@ -345,44 +329,22 @@ static void BuildDygraphPtenKernelContext(
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
     size_t end_idx = start_idx + outs_vector.size();
-
-    // The current size of input/output in pt_kernel_context_ is at least equal
-    // the start_idx. For the reason of reusing the allocted of inputs or
-    // outputs in pt_kernel_context_, the current size of input/output can be
-    // greater then the index of which the tensort wanted to set to, so it will
-    // use ReMakePtenDenseTensorFromVar to make pten tensor.
-    if (kernel_ctx->OutputsSize() == start_idx) {
-      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
-      for (auto& var : outs_vector) {
-        auto* variable = var->MutableVar();
-        tmp_outputs.emplace_back(
-            experimental::MakePtenTensorBaseFromVar(variable, out_def));
-      }
-      kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs));
-    } else if (kernel_ctx->OutputsSize() > start_idx) {
-      size_t output_size = kernel_ctx->OutputsSize();
-      for (size_t j = 0; j < outs_vector.size(); ++j) {
-        if (output_size > i + j) {
-          experimental::ReMakePtenDenseTensorFromVar(
-              outs_vector[j]->MutableVar(), out_def,
-              kernel_ctx->MutableOutputAt<pten::DenseTensor>(i + j));
-          // TODO(chentianyu03): When multi output kernel, open this code
-          /*
-        } else {
-          kernel_ctx->EmplaceBackOutputWithoutSetRange(
-              experimental::MakePtenTensorBaseFromVar(
-                  outs_vector[j]->MutableVar(), out_def));
-          */
-        }
+    auto current_vector_size = kernel_ctx->OutputsSize();
+    // If the memory needed is less than the current memory allocated, we will
+    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
+    // Otherwise, we will create new storage.
+    for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
+      if (current_vector_size > start_idx + offset) {
+        experimental::ReMakePtenDenseTensorFromVar(
+            outs_vector[offset]->MutableVar(), out_def,
+            kernel_ctx->MutableOutputAt<pten::DenseTensor>(start_idx + offset));
+      } else {
+        kernel_ctx->EmplaceBackOutputWithoutSetRange(
+            experimental::MakePtenTensorBaseFromVar(
+                outs_vector[offset]->MutableVar(), out_def));
       }
-      kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx);
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Error start index when trying to set new tensor to inputs, start "
-          "index is `%d`, but current pt_kernel_context_.outputs.size() is "
-          "`%d`.",
-          start_idx, kernel_ctx->OutputsSize()));
     }
+    kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index 4f4d673dfe6c4bc0a5c21d3282a18e9435ee914a..8a87a5b735e99eb13114e9bd60777aff7e18ac7f 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -104,14 +104,18 @@ class KernelContext {
     return static_cast<const TensorType&>(*(inputs_.at(idx)));
   }
 
+  std::shared_ptr<TensorBase>& MutableInputPtrAt(size_t idx) {
+    return inputs_.at(idx);
+  }
+
   template <typename T>
-  std::vector<T> InputBetween(size_t start, size_t end) const {
+  std::vector<T> MoveInputsBetween(size_t start, size_t end) {
     std::vector<T> v;
     for (size_t i = start; i < end; ++i) {
       auto t = std::dynamic_pointer_cast<T>(inputs_.at(i));
       v.emplace_back(std::move(*t.get()));
+      inputs_.at(i) = nullptr;
     }
-
     return v;
   }
 
@@ -123,12 +127,32 @@ class KernelContext {
     return output_range_.at(idx);
   }
 
-  std::pair<int, int>& MutableInputRangeAt(size_t idx) {
-    return input_range_[idx];
+  void AssignInputRange(std::pair<int, int>&& range, size_t idx) {
+    if (idx < input_range_.size()) {
+      input_range_[idx] = range;
+    } else if (idx == input_range_.size()) {
+      input_range_.emplace_back(range);
+    } else {
+      PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+          "Invalid idx when trying to set InputRange, "
+          "index is `%d`, it is greater than the size(%d) of InputRange.",
+          idx,
+          input_range_.size()));
+    }
   }
 
-  std::pair<int, int>& MutableOutputRangeAt(size_t idx) {
-    return output_range_[idx];
+  void AssignOutputRange(std::pair<int, int>&& range, size_t idx) {
+    if (idx < output_range_.size()) {
+      output_range_[idx] = range;
+    } else if (idx == output_range_.size()) {
+      output_range_.emplace_back(range);
+    } else {
+      PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+          "Invalid idx when trying to set OutputRange, "
+          "index is `%d`, it is greater than the size(%d) of OutputRange.",
+          idx,
+          output_range_.size()));
+    }
   }
 
   template <typename TensorType>
@@ -165,8 +189,10 @@ class KernelContext {
   // Only deal with DenseTensor now
   void ClearData() {
     for (auto& in : inputs_) {
-      CompatibleDenseTensorUtils::ClearStorage(
-          static_cast<DenseTensor*>(in.get()));
+      if (in) {
+        CompatibleDenseTensorUtils::ClearStorage(
+            static_cast<DenseTensor*>(in.get()));
+      }
     }
     for (auto& out : outputs_) {
       CompatibleDenseTensorUtils::ClearStorage(
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
index 7e6be1c391400685b5e4d28f4e2a64f75f4a4fae..dcfc8c55644d99c9c2094f259e1e003167b08557 100644
--- a/paddle/pten/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -88,26 +88,26 @@ using XPUContext = paddle::platform::XPUDeviceContext;
     }                                                                      \
   }
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type)        \
-  template <typename... Tail>                                              \
-  struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> {      \
-    template <int dev_ctx_idx,                                             \
-              int in_idx,                                                  \
-              int attr_idx,                                                \
-              int out_idx,                                                 \
-              typename... PreviousArgs>                                    \
-    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {      \
-      static_assert(attr_idx == 0,                                         \
-                    "Kernel's Input should appear before Attributes.");    \
-      static_assert(out_idx == 0,                                          \
-                    "Kernel's Input should appear before Outputs.");       \
-      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);         \
-      std::vector<tensor_type> arg = std::move(                            \
-          ctx->InputBetween<tensor_type>(range.first, range.second));      \
-      KernelCallHelper<Tail...>::                                          \
-          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(    \
-              ctx, pargs..., arg);                                         \
-    }                                                                      \
+#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type)         \
+  template <typename... Tail>                                               \
+  struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> {       \
+    template <int dev_ctx_idx,                                              \
+              int in_idx,                                                   \
+              int attr_idx,                                                 \
+              int out_idx,                                                  \
+              typename... PreviousArgs>                                     \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {       \
+      static_assert(attr_idx == 0,                                          \
+                    "Kernel's Input should appear before Attributes.");     \
+      static_assert(out_idx == 0,                                           \
+                    "Kernel's Input should appear before Outputs.");        \
+      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);          \
+      std::vector<tensor_type> arg = std::move(                             \
+          ctx->MoveInputsBetween<tensor_type>(range.first, range.second));  \
+      KernelCallHelper<Tail...>::                                           \
+          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(     \
+              ctx, pargs..., arg);                                          \
+    }                                                                       \
   }
 
 #define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type)            \
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index 8f559b01b3bcb37275a4c0f24146b0fa323c35f1..7693e204eaa0912376e36d2a22d1e4d39fded5d9 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CPUContext& dev_ctx,
                           const std::vector<int64_t>& shape,
                           DenseTensor* out) {
   auto out_meta = InferMetaFromVecValue(x.meta(), shape);
-  if (&x == out) {
+  if (x.data() == out->data() && x.numel() == out->numel()) {
     out->Resize(out_meta.dims);
     return;
   }
@@ -185,3 +185,34 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
                                 CPU,
                                 ANY,
                                 pten::ReshapeFromVectorValWithXShape) {}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
+                                CPU,
+                                ANY,
+                                pten::ReshapeFromDT) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
+                                CPU,
+                                ANY,
+                                pten::ReshapeFromDTWithXShape) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
+                                CPU,
+                                ANY,
+                                pten::ReshapeFromVectorDT) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
+                                CPU,
+                                ANY,
+                                pten::ReshapeFromVectorDTWithXShape) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu
index 22ada75304f24559c559b9fe10ce32477608a44b..1a1d5cef300d4b3551c0f0009d7152ed0109219a 100644
--- a/paddle/pten/kernels/cuda/manipulation.cu
+++ b/paddle/pten/kernels/cuda/manipulation.cu
@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CUDAContext& dev_ctx,
                           const std::vector<int64_t>& shape,
                           DenseTensor* out) {
   auto out_meta = InferMetaFromVecValue(x.meta(), shape);
-  if (&x == out) {
+  if (x.data() == out->data() && x.numel() == out->numel()) {
     out->Resize(out_meta.dims);
     return;
   }
@@ -193,3 +193,35 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
                                 CUDA,
                                 ANY,
                                 pten::ReshapeFromVectorValWithXShape) {}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
+                                CUDA,
+                                ANY,
+                                pten::ReshapeFromDT) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
+                                CUDA,
+                                ANY,
+                                pten::ReshapeFromDTWithXShape) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
+                                CUDA,
+                                ANY,
+                                pten::ReshapeFromVectorDT) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
+                                CUDA,
+                                ANY,
+                                pten::ReshapeFromVectorDTWithXShape) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
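
The BuildPtenKernelContext and BuildDygraphPtenKernelContext hunks above all apply the same reuse-or-grow binding scheme: slots below the context's current size are reused in place (or refilled when a previous kernel run moved their contents out via MoveInputsBetween and left a nullptr), slots past the end are appended, and the argument's (start, end) range is then recorded with AssignInputRange/AssignOutputRange. The standalone C++ sketch below illustrates that scheme in isolation; Slot, MakeSlot, RemakeInPlace, Context, and BindInputs are hypothetical stand-ins for pten::TensorBase, MakePtenTensorBaseFromVar, ReMakePtenDenseTensorFromVar, pten::KernelContext, and the binding loops, not Paddle APIs.

// Minimal sketch of the reuse-or-grow pattern; all names are hypothetical.
#include <iostream>
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>

struct Slot {
  int payload = 0;
};

struct Context {
  std::vector<std::shared_ptr<Slot>> inputs;
  std::vector<std::pair<size_t, size_t>> input_range;

  // Mirrors AssignInputRange: overwrite an existing range entry or append
  // the next one; skipping ahead indicates a caller bug.
  void AssignInputRange(std::pair<size_t, size_t> range, size_t idx) {
    if (idx < input_range.size()) {
      input_range[idx] = range;
    } else if (idx == input_range.size()) {
      input_range.emplace_back(range);
    } else {
      throw std::out_of_range("range index skips ahead of the vector");
    }
  }
};

// Stand-in for MakePtenTensorBaseFromVar: allocate a fresh slot.
std::shared_ptr<Slot> MakeSlot(int payload) {
  return std::make_shared<Slot>(Slot{payload});
}

// Stand-in for ReMakePtenDenseTensorFromVar: rebuild inside existing storage.
void RemakeInPlace(Slot* slot, int payload) { slot->payload = payload; }

// The reuse-or-grow loop: slots below the current size are reused (or
// refilled if a previous run moved them out and left nullptr); the rest
// are appended, and the argument's range is recorded afterwards.
void BindInputs(Context* ctx, const std::vector<int>& vars, size_t arg_idx,
                size_t start_idx) {
  const size_t current_size = ctx->inputs.size();
  for (size_t offset = 0; offset < vars.size(); ++offset) {
    if (current_size > start_idx + offset) {
      auto& slot = ctx->inputs[start_idx + offset];
      if (slot == nullptr) {
        slot = MakeSlot(vars[offset]);  // slot was consumed earlier
      } else {
        RemakeInPlace(slot.get(), vars[offset]);  // reuse the allocation
      }
    } else {
      ctx->inputs.emplace_back(MakeSlot(vars[offset]));  // grow
    }
  }
  ctx->AssignInputRange({start_idx, start_idx + vars.size()}, arg_idx);
}

int main() {
  Context ctx;
  BindInputs(&ctx, {1, 2, 3}, /*arg_idx=*/0, /*start_idx=*/0);  // grows
  ctx.inputs[1] = nullptr;  // simulate MoveInputsBetween consuming a slot
  BindInputs(&ctx, {4, 5, 6}, /*arg_idx=*/0, /*start_idx=*/0);  // reuses
  for (const auto& s : ctx.inputs) std::cout << s->payload << ' ';
  std::cout << '\n';  // prints: 4 5 6
}

Under these assumptions, the second BindInputs call never reallocates the shared_ptr vector, which is the point of the change: the kernel context survives across runs, and ClearData's added nullptr check covers exactly the slots that MoveInputsBetween emptied.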