未验证 提交 c1fd1b1c 编写于 作者: Y YuanRisheng 提交者: GitHub

[PTen]Make inplace_op and vector<DenseTensor> input compatible with old architecture (#37674)

* add inplace op adaptation

* optimize inplace logic and fix bugs when run kernel that has args of vector<DenseTensor>

* refactor logic that transform variable to densetensor

* update func name
上级 f306965d
......@@ -1181,9 +1181,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
BuildPtenKernelContext(*runtime_ctx, dev_ctx);
(*pt_kernel_)(pt_kernel_context_.get());
WriteBackToOutputs(runtime_ctx);
pt_kernel_context_->ClearData();
} else {
(*kernel_func_)(
......@@ -1814,45 +1812,31 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t start_idx =
(i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second);
size_t end_idx = start_idx + ins_vector.size();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or
// outputs in pt_kernel_context_, the current size of input/output can be
// greater then the index of which the tensort wanted to set to, so it will
// use ReMakePtenDenseTensorFromVar to make pten tensor.
if (pt_kernel_context_->InputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
for (auto* var : ins_vector) {
tmp_inputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(*var, in_def));
}
pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs));
} else if (pt_kernel_context_->InputsSize() > start_idx) {
size_t input_size = pt_kernel_context_->InputsSize();
for (size_t j = 0; j < ins_vector.size(); ++j) {
if (input_size > start_idx + j) {
auto current_vector_size = pt_kernel_context_->InputsSize();
// If the memory needed is less than the current memory allocated, we will
// reuse the current memory by using ReMakePtenDenseTensorFromVar.
// Otherwise, we will create new storage.
for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
if (current_vector_size > start_idx + offset) {
auto& input_ptr =
pt_kernel_context_->MutableInputPtrAt(start_idx + offset);
if (input_ptr == nullptr) {
input_ptr = experimental::MakePtenTensorBaseFromVar(
*ins_vector[offset], in_def);
} else {
experimental::ReMakePtenDenseTensorFromVar(
*ins_vector[j], in_def,
*ins_vector[offset], in_def,
pt_kernel_context_->MutableInputAt<pten::DenseTensor>(start_idx +
j));
// TODO(chentianyu03): When multi input kernel, open this code
/*
} else {
pt_kernel_context_->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(*ins_vector[j],
in_def));
*/
offset));
}
} else {
pt_kernel_context_->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(*ins_vector[offset],
in_def));
}
pt_kernel_context_->MutableInputRangeAt(i) =
std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.inputs.size() is "
"`%d`.",
start_idx, pt_kernel_context_->InputsSize()));
}
pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
for (size_t i = 0; i < output_names.size(); ++i) {
......@@ -1862,46 +1846,25 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t start_idx =
(i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second);
size_t end_idx = start_idx + outs_vector.size();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or
// outputs in pt_kernel_context_, the current size of input/output can be
// greater then the index of which the tensort wanted to set to, so it will
// use ReMakePtenDenseTensorFromVar to make pten tensor.
if (pt_kernel_context_->OutputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
for (auto* var : outs_vector) {
tmp_outputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(var, out_def));
}
pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs));
} else if (pt_kernel_context_->OutputsSize() > start_idx) {
size_t output_size = pt_kernel_context_->OutputsSize();
for (size_t j = 0; j < outs_vector.size(); ++j) {
if (output_size > start_idx + j) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[j], out_def,
pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
j));
// TODO(chentianyu03): When multi output kernel, open this code
/*
} else {
pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(outs_vector[j],
out_def));
*/
}
auto current_vector_size = pt_kernel_context_->OutputsSize();
// If the memory needed is less than the current memory allocated, we will
// reuse the current memory by using ReMakePtenDenseTensorFromVar.
// Otherwise, we will create new storage.
for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
if (current_vector_size > start_idx + offset) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[offset], out_def,
pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
offset));
} else {
pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(outs_vector[offset],
out_def));
}
pt_kernel_context_->MutableOutputRangeAt(i) =
std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.outputs.size() is "
"`%d`.",
start_idx, pt_kernel_context_->OutputsSize()));
}
pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx),
i);
}
for (size_t i = 0; i < attr_names.size(); ++i) {
......
......@@ -299,44 +299,28 @@ static void BuildDygraphPtenKernelContext(
size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
size_t end_idx = start_idx + ins_vector.size();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or
// outputs in pt_kernel_context_, the current size of input/output can be
// greater then the index of which the tensort wanted to set to, so it will
// use ReMakePtenDenseTensorFromVar to make pten tensor.
if (kernel_ctx->InputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
for (const auto& var : ins_vector) {
const auto& variable = var->Var();
tmp_inputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(variable, in_def));
}
kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs));
} else if (kernel_ctx->InputsSize() > start_idx) {
size_t input_size = kernel_ctx->InputsSize();
for (size_t j = 0; j < ins_vector.size(); ++j) {
if (input_size > start_idx + j) {
auto current_vector_size = kernel_ctx->InputsSize();
// If the memory needed is less than the current memory allocated, we will
// reuse the current memory by using ReMakePtenDenseTensorFromVar.
// Otherwise, we will create new storage.
for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
const auto& variable = ins_vector[offset]->Var();
if (current_vector_size > start_idx + offset) {
auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset);
if (input_ptr == nullptr) {
input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def);
} else {
experimental::ReMakePtenDenseTensorFromVar(
ins_vector[j]->Var(), in_def,
kernel_ctx->MutableInputAt<pten::DenseTensor>(start_idx + j));
// TODO(chentianyu03): When multi input kernel, open this code
/*
} else {
kernel_ctx->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(),
in_def));
*/
variable, in_def, kernel_ctx->MutableInputAt<pten::DenseTensor>(
start_idx + offset));
}
} else {
kernel_ctx->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(variable, in_def));
}
kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.inputs.size() is "
"`%d`.",
start_idx, kernel_ctx->InputsSize()));
}
kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
for (size_t i = 0; i < output_names.size(); ++i) {
......@@ -345,44 +329,22 @@ static void BuildDygraphPtenKernelContext(
size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
size_t end_idx = start_idx + outs_vector.size();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or
// outputs in pt_kernel_context_, the current size of input/output can be
// greater then the index of which the tensort wanted to set to, so it will
// use ReMakePtenDenseTensorFromVar to make pten tensor.
if (kernel_ctx->OutputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
for (auto& var : outs_vector) {
auto* variable = var->MutableVar();
tmp_outputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(variable, out_def));
}
kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs));
} else if (kernel_ctx->OutputsSize() > start_idx) {
size_t output_size = kernel_ctx->OutputsSize();
for (size_t j = 0; j < outs_vector.size(); ++j) {
if (output_size > i + j) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[j]->MutableVar(), out_def,
kernel_ctx->MutableOutputAt<pten::DenseTensor>(i + j));
// TODO(chentianyu03): When multi output kernel, open this code
/*
} else {
kernel_ctx->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(
outs_vector[j]->MutableVar(), out_def));
*/
}
auto current_vector_size = kernel_ctx->OutputsSize();
// If the memory needed is less than the current memory allocated, we will
// reuse the current memory by using ReMakePtenDenseTensorFromVar.
// Otherwise, we will create new storage.
for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
if (current_vector_size > start_idx + offset) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[offset]->MutableVar(), out_def,
kernel_ctx->MutableOutputAt<pten::DenseTensor>(start_idx + offset));
} else {
kernel_ctx->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(
outs_vector[offset]->MutableVar(), out_def));
}
kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.outputs.size() is "
"`%d`.",
start_idx, kernel_ctx->OutputsSize()));
}
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
}
for (size_t i = 0; i < attr_names.size(); ++i) {
......
......@@ -104,14 +104,18 @@ class KernelContext {
return static_cast<const TensorType&>(*(inputs_.at(idx)));
}
// Returns a mutable reference to the shared_ptr slot holding input `idx`,
// letting callers replace or null-out the tensor in place. Out-of-range
// indices throw std::out_of_range via vector::at.
std::shared_ptr<TensorBase>& MutableInputPtrAt(size_t idx) {
return inputs_.at(idx);
}
template <typename TensorType>
std::vector<TensorType> InputBetween(size_t start, size_t end) const {
std::vector<TensorType> MoveInputsBetween(size_t start, size_t end) {
// Moves (not copies) the inputs in the half-open range [start, end) out of
// the context and returns them by value; each consumed slot is reset to
// nullptr so later passes (e.g. ClearData) can skip already-moved entries.
std::vector<TensorType> v;
for (size_t i = start; i < end; ++i) {
// NOTE(review): dynamic_pointer_cast yields nullptr when inputs_.at(i)
// is not actually a TensorType; the unchecked dereference below would
// then be UB — presumably callers guarantee the stored type matches the
// kernel signature. TODO: confirm.
auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i));
v.emplace_back(std::move(*t.get()));
inputs_.at(i) = nullptr;
}
return v;
}
......@@ -123,12 +127,32 @@ class KernelContext {
return output_range_.at(idx);
}
std::pair<int, int>& MutableInputRangeAt(size_t idx) {
return input_range_[idx];
void AssignInputRange(std::pair<int, int>&& range, size_t idx) {
if (idx < input_range_.size()) {
input_range_[idx] = range;
} else if (idx == input_range_.size()) {
input_range_.emplace_back(range);
} else {
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"Invalid idx when trying to set InputRange, "
"index is `%d`, it is greater than the size(%d) of InputRange.",
idx,
input_range_.size()));
}
}
std::pair<int, int>& MutableOutputRangeAt(size_t idx) {
return output_range_[idx];
// Installs `range` as the output range for argument `idx`, growing
// output_range_ by exactly one slot when idx equals its current size.
// Throws PreconditionNotMet when idx would leave a gap (idx > size).
void AssignOutputRange(std::pair<int, int>&& range, size_t idx) {
  if (idx < output_range_.size()) {
    output_range_[idx] = range;
  } else if (idx == output_range_.size()) {
    output_range_.emplace_back(range);
  } else {
    // Fix: the original message was copy-pasted from AssignInputRange and
    // said "InputRange"; this is the output-range setter.
    PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
        "Invalid idx when trying to set OutputRange, "
        "index is `%d`, it is greater than the size(%d) of OutputRange.",
        idx,
        output_range_.size()));
  }
}
template <typename TensorType>
......@@ -165,8 +189,10 @@ class KernelContext {
// Only deal with DenseTensor now
void ClearData() {
for (auto& in : inputs_) {
CompatibleDenseTensorUtils::ClearStorage(
static_cast<DenseTensor*>(in.get()));
if (in) {
CompatibleDenseTensorUtils::ClearStorage(
static_cast<DenseTensor*>(in.get()));
}
}
for (auto& out : outputs_) {
CompatibleDenseTensorUtils::ClearStorage(
......
......@@ -88,26 +88,26 @@ using XPUContext = paddle::platform::XPUDeviceContext;
} \
}
#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \
template <typename... Tail> \
struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> { \
template <int dev_ctx_idx, \
int in_idx, \
int attr_idx, \
int out_idx, \
typename... PreviousArgs> \
static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \
static_assert(attr_idx == 0, \
"Kernel's Input should appear before Attributes."); \
static_assert(out_idx == 0, \
"Kernel's Input should appear before Outputs."); \
const std::pair<int, int> range = ctx->InputRangeAt(in_idx); \
std::vector<tensor_type> arg = std::move( \
ctx->InputBetween<tensor_type>(range.first, range.second)); \
KernelCallHelper<Tail...>:: \
template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>( \
ctx, pargs..., arg); \
} \
#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \
template <typename... Tail> \
struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> { \
template <int dev_ctx_idx, \
int in_idx, \
int attr_idx, \
int out_idx, \
typename... PreviousArgs> \
static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \
static_assert(attr_idx == 0, \
"Kernel's Input should appear before Attributes."); \
static_assert(out_idx == 0, \
"Kernel's Input should appear before Outputs."); \
const std::pair<int, int> range = ctx->InputRangeAt(in_idx); \
std::vector<tensor_type> arg = std::move( \
ctx->MoveInputsBetween<tensor_type>(range.first, range.second)); \
KernelCallHelper<Tail...>:: \
template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>( \
ctx, pargs..., arg); \
} \
}
#define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \
......
......@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CPUContext& dev_ctx,
const std::vector<int64_t>& shape,
DenseTensor* out) {
auto out_meta = InferMetaFromVecValue(x.meta(), shape);
if (&x == out) {
if (x.data() == out->data() && x.numel() == out->numel()) {
out->Resize(out_meta.dims);
return;
}
......@@ -185,3 +185,34 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
CPU,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
// CPU registrations for the reshape variants whose target shape arrives as
// tensor input(s) rather than an attribute:
//   "reshape2.host"    — shape from a single DenseTensor (ReshapeFromDT)
//   "reshape2.mulhost" — shape from a vector<DenseTensor> (ReshapeFromVectorDT)
//   "*.mid" variants   — same kernels but also producing XShape.
// Input 1 (the shape tensor) is pinned to CPU/INT32 — presumably so its
// values can be read on the host when inferring output dims. TODO(review):
// confirm against the kernel implementations.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
CPU,
ANY,
pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
CPU,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
CPU,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
CPU,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
......@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CUDAContext& dev_ctx,
const std::vector<int64_t>& shape,
DenseTensor* out) {
auto out_meta = InferMetaFromVecValue(x.meta(), shape);
if (&x == out) {
if (x.data() == out->data() && x.numel() == out->numel()) {
out->Resize(out_meta.dims);
return;
}
......@@ -193,3 +193,35 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
CUDA,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
// CUDA registrations mirroring the CPU ones for the tensor-shaped reshape
// variants ("reshape2.host", "reshape2.mulhost", and their "*.mid" XShape
// forms). Even on the CUDA backend, input 1 (the shape tensor) is pinned to
// CPU/INT32 — presumably because the shape values must be readable on the
// host to infer output dims. TODO(review): confirm.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
CUDA,
ANY,
pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
CUDA,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
CUDA,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
CUDA,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册