diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 6ef44fb127afbe56207136116fbc09e933b1c7f0..d60fdd90e2a2a438a461a2d6fd7462ccfe5576d8 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1181,9 +1181,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     }
     BuildPtenKernelContext(*runtime_ctx, dev_ctx);
     (*pt_kernel_)(pt_kernel_context_.get());
-
     WriteBackToOutputs(runtime_ctx);
-
     pt_kernel_context_->ClearData();
   } else {
     (*kernel_func_)(
@@ -1814,45 +1812,31 @@ void OperatorWithKernel::BuildPtenKernelContext(
     size_t start_idx =
         (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second);
     size_t end_idx = start_idx + ins_vector.size();
-
-    // The current size of input/output in pt_kernel_context_ is at least equal
-    // the start_idx. For the reason of reusing the allocted of inputs or
-    // outputs in pt_kernel_context_, the current size of input/output can be
-    // greater then the index of which the tensort wanted to set to, so it will
-    // use ReMakePtenDenseTensorFromVar to make pten tensor.
-    if (pt_kernel_context_->InputsSize() == start_idx) {
-      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
-      for (auto* var : ins_vector) {
-        tmp_inputs.emplace_back(
-            experimental::MakePtenTensorBaseFromVar(*var, in_def));
-      }
-      pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs));
-    } else if (pt_kernel_context_->InputsSize() > start_idx) {
-      size_t input_size = pt_kernel_context_->InputsSize();
-      for (size_t j = 0; j < ins_vector.size(); ++j) {
-        if (input_size > start_idx + j) {
+    auto current_vector_size = pt_kernel_context_->InputsSize();
+
+    // If the memory needed is less than the current memory allocated, we will
+    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
+    // Otherwise, we will create new storage.
+    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
+      if (current_vector_size > start_idx + offset) {
+        auto& input_ptr =
+            pt_kernel_context_->MutableInputPtrAt(start_idx + offset);
+        if (input_ptr == nullptr) {
+          input_ptr = experimental::MakePtenTensorBaseFromVar(
+              *ins_vector[offset], in_def);
+        } else {
           experimental::ReMakePtenDenseTensorFromVar(
-              *ins_vector[j], in_def,
+              *ins_vector[offset], in_def,
               pt_kernel_context_->MutableInputAt<pten::DenseTensor>(start_idx +
-                                                                    j));
-          // TODO(chentianyu03): When multi input kernel, open this code
-          /*
-        } else {
-          pt_kernel_context_->EmplaceBackInputWithoutSetRange(
-              experimental::MakePtenTensorBaseFromVar(*ins_vector[j],
-                                                      in_def));
-          */
+                                                                    offset));
         }
+      } else {
+        pt_kernel_context_->EmplaceBackInputWithoutSetRange(
+            experimental::MakePtenTensorBaseFromVar(*ins_vector[offset],
+                                                    in_def));
       }
-      pt_kernel_context_->MutableInputRangeAt(i) =
-          std::make_pair(start_idx, end_idx);
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Error start index when trying to set new tensor to inputs, start "
-          "index is `%d`, but current pt_kernel_context_.inputs.size() is "
-          "`%d`.",
-          start_idx, pt_kernel_context_->InputsSize()));
     }
+    pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
@@ -1862,46 +1846,25 @@ void OperatorWithKernel::BuildPtenKernelContext(
     size_t start_idx =
         (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second);
     size_t end_idx = start_idx + outs_vector.size();
-
-    // The current size of input/output in pt_kernel_context_ is at least equal
-    // the start_idx. For the reason of reusing the allocted of inputs or
-    // outputs in pt_kernel_context_, the current size of input/output can be
-    // greater then the index of which the tensort wanted to set to, so it will
-    // use ReMakePtenDenseTensorFromVar to make pten tensor.
-    if (pt_kernel_context_->OutputsSize() == start_idx) {
-      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
-      for (auto* var : outs_vector) {
-        tmp_outputs.emplace_back(
-            experimental::MakePtenTensorBaseFromVar(var, out_def));
-      }
-      pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs));
-    } else if (pt_kernel_context_->OutputsSize() > start_idx) {
-      size_t output_size = pt_kernel_context_->OutputsSize();
-      for (size_t j = 0; j < outs_vector.size(); ++j) {
-        if (output_size > start_idx + j) {
-          experimental::ReMakePtenDenseTensorFromVar(
-              outs_vector[j], out_def,
-              pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
-                                                                     j));
-
-          // TODO(chentianyu03): When multi output kernel, open this code
-          /*
-        } else {
-          pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
-              experimental::MakePtenTensorBaseFromVar(outs_vector[j],
-                                                      out_def));
-          */
-        }
+    auto current_vector_size = pt_kernel_context_->OutputsSize();
+
+    // If the memory needed is less than the current memory allocated, we will
+    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
+    // Otherwise, we will create new storage.
+    for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
+      if (current_vector_size > start_idx + offset) {
+        experimental::ReMakePtenDenseTensorFromVar(
+            outs_vector[offset], out_def,
+            pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
+                                                                   offset));
+      } else {
+        pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
+            experimental::MakePtenTensorBaseFromVar(outs_vector[offset],
+                                                    out_def));
       }
-      pt_kernel_context_->MutableOutputRangeAt(i) =
-          std::make_pair(start_idx, end_idx);
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Error start index when trying to set new tensor to inputs, start "
-          "index is `%d`, but current pt_kernel_context_.outputs.size() is "
-          "`%d`.",
-          start_idx, pt_kernel_context_->OutputsSize()));
     }
+    pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx),
+                                          i);
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 604f9d2be9e487237789243e0a44486af7a75918..8e61b7d2eed880d58c6bdd832f9e95ff4bff6d2c 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -299,44 +299,28 @@ static void BuildDygraphPtenKernelContext(
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
     size_t end_idx = start_idx + ins_vector.size();
-
-    // The current size of input/output in pt_kernel_context_ is at least equal
-    // the start_idx. For the reason of reusing the allocted of inputs or
-    // outputs in pt_kernel_context_, the current size of input/output can be
-    // greater then the index of which the tensort wanted to set to, so it will
-    // use ReMakePtenDenseTensorFromVar to make pten tensor.
-    if (kernel_ctx->InputsSize() == start_idx) {
-      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
-      for (const auto& var : ins_vector) {
-        const auto& variable = var->Var();
-        tmp_inputs.emplace_back(
-            experimental::MakePtenTensorBaseFromVar(variable, in_def));
-      }
-      kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs));
-    } else if (kernel_ctx->InputsSize() > start_idx) {
-      size_t input_size = kernel_ctx->InputsSize();
-      for (size_t j = 0; j < ins_vector.size(); ++j) {
-        if (input_size > start_idx + j) {
+    auto current_vector_size = kernel_ctx->InputsSize();
+
+    // If the memory needed is less than the current memory allocated, we will
+    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
+    // Otherwise, we will create new storage.
+    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
+      const auto& variable = ins_vector[offset]->Var();
+      if (current_vector_size > start_idx + offset) {
+        auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset);
+        if (input_ptr == nullptr) {
+          input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def);
+        } else {
           experimental::ReMakePtenDenseTensorFromVar(
-              ins_vector[j]->Var(), in_def,
-              kernel_ctx->MutableInputAt<pten::DenseTensor>(start_idx + j));
-          // TODO(chentianyu03): When multi input kernel, open this code
-          /*
-        } else {
-          kernel_ctx->EmplaceBackInputWithoutSetRange(
-              experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(),
-                                                      in_def));
-          */
+              variable, in_def, kernel_ctx->MutableInputAt<pten::DenseTensor>(
+                                    start_idx + offset));
         }
+      } else {
+        kernel_ctx->EmplaceBackInputWithoutSetRange(
+            experimental::MakePtenTensorBaseFromVar(variable, in_def));
       }
-      kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx);
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Error start index when trying to set new tensor to inputs, start "
-          "index is `%d`, but current pt_kernel_context_.inputs.size() is "
-          "`%d`.",
-          start_idx, kernel_ctx->InputsSize()));
     }
+    kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
@@ -345,44 +329,22 @@ static void BuildDygraphPtenKernelContext(
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
     size_t end_idx = start_idx + outs_vector.size();
-
-    // The current size of input/output in pt_kernel_context_ is at least equal
-    // the start_idx. For the reason of reusing the allocted of inputs or
-    // outputs in pt_kernel_context_, the current size of input/output can be
-    // greater then the index of which the tensort wanted to set to, so it will
-    // use ReMakePtenDenseTensorFromVar to make pten tensor.
-    if (kernel_ctx->OutputsSize() == start_idx) {
-      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
-      for (auto& var : outs_vector) {
-        auto* variable = var->MutableVar();
-        tmp_outputs.emplace_back(
-            experimental::MakePtenTensorBaseFromVar(variable, out_def));
-      }
-      kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs));
-    } else if (kernel_ctx->OutputsSize() > start_idx) {
-      size_t output_size = kernel_ctx->OutputsSize();
-      for (size_t j = 0; j < outs_vector.size(); ++j) {
-        if (output_size > i + j) {
-          experimental::ReMakePtenDenseTensorFromVar(
-              outs_vector[j]->MutableVar(), out_def,
-              kernel_ctx->MutableOutputAt<pten::DenseTensor>(i + j));
-          // TODO(chentianyu03): When multi output kernel, open this code
-          /*
-        } else {
-          kernel_ctx->EmplaceBackOutputWithoutSetRange(
-              experimental::MakePtenTensorBaseFromVar(
-                  outs_vector[j]->MutableVar(), out_def));
-          */
-        }
+    auto current_vector_size = kernel_ctx->OutputsSize();
+    // If the memory needed is less than the current memory allocated, we will
+    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
+    // Otherwise, we will create new storage.
+    for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
+      if (current_vector_size > start_idx + offset) {
+        experimental::ReMakePtenDenseTensorFromVar(
+            outs_vector[offset]->MutableVar(), out_def,
+            kernel_ctx->MutableOutputAt<pten::DenseTensor>(start_idx + offset));
+      } else {
+        kernel_ctx->EmplaceBackOutputWithoutSetRange(
+            experimental::MakePtenTensorBaseFromVar(
+                outs_vector[offset]->MutableVar(), out_def));
       }
-      kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx);
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Error start index when trying to set new tensor to inputs, start "
-          "index is `%d`, but current pt_kernel_context_.outputs.size() is "
-          "`%d`.",
-          start_idx, kernel_ctx->OutputsSize()));
     }
+    kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index 4f4d673dfe6c4bc0a5c21d3282a18e9435ee914a..8a87a5b735e99eb13114e9bd60777aff7e18ac7f 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -104,14 +104,18 @@ class KernelContext {
     return static_cast<const TensorType&>(*(inputs_.at(idx)));
   }
 
+  std::shared_ptr<TensorBase>& MutableInputPtrAt(size_t idx) {
+    return inputs_.at(idx);
+  }
+
   template <typename T>
-  std::vector<T> InputBetween(size_t start, size_t end) const {
+  std::vector<T> MoveInputsBetween(size_t start, size_t end) {
     std::vector<T> v;
     for (size_t i = start; i < end; ++i) {
       auto t = std::dynamic_pointer_cast<T>(inputs_.at(i));
       v.emplace_back(std::move(*t.get()));
+      inputs_.at(i) = nullptr;
     }
-
     return v;
   }
 
@@ -123,12 +127,32 @@ class KernelContext {
     return output_range_.at(idx);
   }
 
-  std::pair<int, int>& MutableInputRangeAt(size_t idx) {
-    return input_range_[idx];
+  void AssignInputRange(std::pair<int, int>&& range, size_t idx) {
+    if (idx < input_range_.size()) {
+      input_range_[idx] = range;
+    } else if (idx == input_range_.size()) {
+      input_range_.emplace_back(range);
+    } else {
+      PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+          "Invalid idx when trying to set InputRange, "
+          "index is `%d`, it is greater than the size(%d) of InputRange.",
+          idx,
+          input_range_.size()));
+    }
   }
 
-  std::pair<int, int>& MutableOutputRangeAt(size_t idx) {
-    return output_range_[idx];
+  void AssignOutputRange(std::pair<int, int>&& range, size_t idx) {
+    if (idx < output_range_.size()) {
+      output_range_[idx] = range;
+    } else if (idx == output_range_.size()) {
+      output_range_.emplace_back(range);
+    } else {
+      PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+          "Invalid idx when trying to set OutputRange, "
+          "index is `%d`, it is greater than the size(%d) of OutputRange.",
+          idx,
+          output_range_.size()));
+    }
   }
 
   template <typename TensorType>
@@ -165,8 +189,10 @@ class KernelContext {
   // Only deal with DenseTensor now
   void ClearData() {
     for (auto& in : inputs_) {
-      CompatibleDenseTensorUtils::ClearStorage(
-          static_cast<DenseTensor*>(in.get()));
+      if (in) {
+        CompatibleDenseTensorUtils::ClearStorage(
+            static_cast<DenseTensor*>(in.get()));
+      }
     }
     for (auto& out : outputs_) {
       CompatibleDenseTensorUtils::ClearStorage(
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
index 7e6be1c391400685b5e4d28f4e2a64f75f4a4fae..dcfc8c55644d99c9c2094f259e1e003167b08557 100644
--- a/paddle/pten/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -88,26 +88,26 @@ using XPUContext = paddle::platform::XPUDeviceContext;
     }                                                                      \
   }
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type)        \
-  template <typename... Tail>                                              \
-  struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> {      \
-    template <int dev_ctx_idx,                                             \
-              int in_idx,                                                  \
-              int attr_idx,                                                \
-              int out_idx,                                                 \
-              typename... PreviousArgs>                                    \
-    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {      \
-      static_assert(attr_idx == 0,                                         \
-                    "Kernel's Input should appear before Attributes.");    \
-      static_assert(out_idx == 0,                                          \
-                    "Kernel's Input should appear before Outputs.");       \
-      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);         \
-      std::vector<tensor_type> arg = std::move(                            \
-          ctx->InputBetween<tensor_type>(range.first, range.second));      \
-      KernelCallHelper<Tail...>::                                          \
-          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(    \
-              ctx, pargs..., arg);                                         \
-    }                                                                      \
+#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type)         \
+  template <typename... Tail>                                               \
+  struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> {       \
+    template <int dev_ctx_idx,                                              \
+              int in_idx,                                                   \
+              int attr_idx,                                                 \
+              int out_idx,                                                  \
+              typename... PreviousArgs>                                     \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {       \
+      static_assert(attr_idx == 0,                                          \
+                    "Kernel's Input should appear before Attributes.");     \
+      static_assert(out_idx == 0,                                           \
+                    "Kernel's Input should appear before Outputs.");        \
+      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);          \
+      std::vector<tensor_type> arg = std::move(                             \
+          ctx->MoveInputsBetween<tensor_type>(range.first, range.second));  \
+      KernelCallHelper<Tail...>::                                           \
+          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(     \
+              ctx, pargs..., arg);                                          \
+    }                                                                       \
   }
 
 #define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type)            \
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index 8f559b01b3bcb37275a4c0f24146b0fa323c35f1..7693e204eaa0912376e36d2a22d1e4d39fded5d9 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CPUContext& dev_ctx,
                           const std::vector<int64_t>& shape,
                           DenseTensor* out) {
   auto out_meta = InferMetaFromVecValue(x.meta(), shape);
-  if (&x == out) {
+  if (x.data() == out->data() && x.numel() == out->numel()) {
     out->Resize(out_meta.dims);
     return;
   }
@@ -185,3 +185,34 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
                                 CPU,
                                 ANY,
                                 pten::ReshapeFromVectorValWithXShape) {}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
+                                CPU,
+                                ANY,
+                                pten::ReshapeFromDT) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
+                                CPU,
+                                ANY,
+                                pten::ReshapeFromDTWithXShape) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
+                                CPU,
+                                ANY,
+                                pten::ReshapeFromVectorDT) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
+                                CPU,
+                                ANY,
+                                pten::ReshapeFromVectorDTWithXShape) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu
index 22ada75304f24559c559b9fe10ce32477608a44b..1a1d5cef300d4b3551c0f0009d7152ed0109219a 100644
--- a/paddle/pten/kernels/cuda/manipulation.cu
+++ b/paddle/pten/kernels/cuda/manipulation.cu
@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CUDAContext& dev_ctx,
                           const std::vector<int64_t>& shape,
                           DenseTensor* out) {
   auto out_meta = InferMetaFromVecValue(x.meta(), shape);
-  if (&x == out) {
+  if (x.data() == out->data() && x.numel() == out->numel()) {
     out->Resize(out_meta.dims);
     return;
   }
@@ -193,3 +193,35 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
                                 CUDA,
                                 ANY,
                                 pten::ReshapeFromVectorValWithXShape) {}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
+                                CUDA,
+                                ANY,
+                                pten::ReshapeFromDT) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
+                                CUDA,
+                                ANY,
+                                pten::ReshapeFromDTWithXShape) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
+                                CUDA,
+                                ANY,
+                                pten::ReshapeFromVectorDT) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
+                                CUDA,
+                                ANY,
+                                pten::ReshapeFromVectorDTWithXShape) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+  kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
+}
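
The BuildPtenKernelContext and BuildDygraphPtenKernelContext hunks above all apply the same reuse-or-grow binding scheme: slots below the context's current size are reused in place (or refilled when a previous kernel run moved their contents out via MoveInputsBetween and left a nullptr), slots past the end are appended, and the argument's (start, end) range is then recorded with AssignInputRange/AssignOutputRange. The standalone C++ sketch below illustrates that scheme in isolation; Slot, MakeSlot, RemakeInPlace, Context, and BindInputs are hypothetical stand-ins for pten::TensorBase, MakePtenTensorBaseFromVar, ReMakePtenDenseTensorFromVar, pten::KernelContext, and the binding loops, not Paddle APIs.

// Minimal sketch of the reuse-or-grow pattern; all names are hypothetical.
#include <iostream>
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>

struct Slot {
  int payload = 0;
};

struct Context {
  std::vector<std::shared_ptr<Slot>> inputs;
  std::vector<std::pair<size_t, size_t>> input_range;

  // Mirrors AssignInputRange: overwrite an existing range entry or append
  // the next one; skipping ahead indicates a caller bug.
  void AssignInputRange(std::pair<size_t, size_t> range, size_t idx) {
    if (idx < input_range.size()) {
      input_range[idx] = range;
    } else if (idx == input_range.size()) {
      input_range.emplace_back(range);
    } else {
      throw std::out_of_range("range index skips ahead of the vector");
    }
  }
};

// Stand-in for MakePtenTensorBaseFromVar: allocate a fresh slot.
std::shared_ptr<Slot> MakeSlot(int payload) {
  return std::make_shared<Slot>(Slot{payload});
}

// Stand-in for ReMakePtenDenseTensorFromVar: rebuild inside existing storage.
void RemakeInPlace(Slot* slot, int payload) { slot->payload = payload; }

// The reuse-or-grow loop: slots below the current size are reused (or
// refilled if a previous run moved them out and left nullptr); the rest
// are appended, and the argument's range is recorded afterwards.
void BindInputs(Context* ctx, const std::vector<int>& vars, size_t arg_idx,
                size_t start_idx) {
  const size_t current_size = ctx->inputs.size();
  for (size_t offset = 0; offset < vars.size(); ++offset) {
    if (current_size > start_idx + offset) {
      auto& slot = ctx->inputs[start_idx + offset];
      if (slot == nullptr) {
        slot = MakeSlot(vars[offset]);  // slot was consumed earlier
      } else {
        RemakeInPlace(slot.get(), vars[offset]);  // reuse the allocation
      }
    } else {
      ctx->inputs.emplace_back(MakeSlot(vars[offset]));  // grow
    }
  }
  ctx->AssignInputRange({start_idx, start_idx + vars.size()}, arg_idx);
}

int main() {
  Context ctx;
  BindInputs(&ctx, {1, 2, 3}, /*arg_idx=*/0, /*start_idx=*/0);  // grows
  ctx.inputs[1] = nullptr;  // simulate MoveInputsBetween consuming a slot
  BindInputs(&ctx, {4, 5, 6}, /*arg_idx=*/0, /*start_idx=*/0);  // reuses
  for (const auto& s : ctx.inputs) std::cout << s->payload << ' ';
  std::cout << '\n';  // prints: 4 5 6
}

Under these assumptions, the second BindInputs call never reallocates the shared_ptr vector, which is the point of the change: the kernel context survives across runs, and ClearData's added nullptr check covers exactly the slots that MoveInputsBetween emptied.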