未验证 提交 c1fd1b1c 编写于 作者: Y YuanRisheng 提交者: GitHub

[PTen]Make inplace_op and vector<DenseTensor> input compatible with old architecture (#37674)

* add inplace op adaptation

* optimize inplace logic and fix bugs when run kernel that has args of vector<DenseTensor>

* refactor logic that transform variable to densetensor

* update func name
上级 f306965d
......@@ -1181,9 +1181,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
BuildPtenKernelContext(*runtime_ctx, dev_ctx);
(*pt_kernel_)(pt_kernel_context_.get());
WriteBackToOutputs(runtime_ctx);
pt_kernel_context_->ClearData();
} else {
(*kernel_func_)(
......@@ -1814,45 +1812,31 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t start_idx =
(i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second);
size_t end_idx = start_idx + ins_vector.size();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or
// outputs in pt_kernel_context_, the current size of input/output can be
// greater then the index of which the tensort wanted to set to, so it will
// use ReMakePtenDenseTensorFromVar to make pten tensor.
if (pt_kernel_context_->InputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
for (auto* var : ins_vector) {
tmp_inputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(*var, in_def));
}
pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs));
} else if (pt_kernel_context_->InputsSize() > start_idx) {
size_t input_size = pt_kernel_context_->InputsSize();
for (size_t j = 0; j < ins_vector.size(); ++j) {
if (input_size > start_idx + j) {
auto current_vector_size = pt_kernel_context_->InputsSize();
// If the memory needed is less than the current memory allocated, we will
// reuse the current memory by using ReMakePtenDenseTensorFromVar.
// Otherwise, we will create new storage.
for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
if (current_vector_size > start_idx + offset) {
auto& input_ptr =
pt_kernel_context_->MutableInputPtrAt(start_idx + offset);
if (input_ptr == nullptr) {
input_ptr = experimental::MakePtenTensorBaseFromVar(
*ins_vector[offset], in_def);
} else {
experimental::ReMakePtenDenseTensorFromVar(
*ins_vector[j], in_def,
*ins_vector[offset], in_def,
pt_kernel_context_->MutableInputAt<pten::DenseTensor>(start_idx +
j));
// TODO(chentianyu03): When multi input kernel, open this code
/*
} else {
pt_kernel_context_->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(*ins_vector[j],
in_def));
*/
offset));
}
} else {
pt_kernel_context_->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(*ins_vector[offset],
in_def));
}
pt_kernel_context_->MutableInputRangeAt(i) =
std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.inputs.size() is "
"`%d`.",
start_idx, pt_kernel_context_->InputsSize()));
}
pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
for (size_t i = 0; i < output_names.size(); ++i) {
......@@ -1862,46 +1846,25 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t start_idx =
(i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second);
size_t end_idx = start_idx + outs_vector.size();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or
// outputs in pt_kernel_context_, the current size of input/output can be
// greater then the index of which the tensort wanted to set to, so it will
// use ReMakePtenDenseTensorFromVar to make pten tensor.
if (pt_kernel_context_->OutputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
for (auto* var : outs_vector) {
tmp_outputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(var, out_def));
}
pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs));
} else if (pt_kernel_context_->OutputsSize() > start_idx) {
size_t output_size = pt_kernel_context_->OutputsSize();
for (size_t j = 0; j < outs_vector.size(); ++j) {
if (output_size > start_idx + j) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[j], out_def,
pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
j));
// TODO(chentianyu03): When multi output kernel, open this code
/*
} else {
pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(outs_vector[j],
out_def));
*/
}
auto current_vector_size = pt_kernel_context_->OutputsSize();
// If the memory needed is less than the current memory allocated, we will
// reuse the current memory by using ReMakePtenDenseTensorFromVar.
// Otherwise, we will create new storage.
for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
if (current_vector_size > start_idx + offset) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[offset], out_def,
pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
offset));
} else {
pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(outs_vector[offset],
out_def));
}
pt_kernel_context_->MutableOutputRangeAt(i) =
std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.outputs.size() is "
"`%d`.",
start_idx, pt_kernel_context_->OutputsSize()));
}
pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx),
i);
}
for (size_t i = 0; i < attr_names.size(); ++i) {
......
......@@ -299,44 +299,28 @@ static void BuildDygraphPtenKernelContext(
size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
size_t end_idx = start_idx + ins_vector.size();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or
// outputs in pt_kernel_context_, the current size of input/output can be
// greater then the index of which the tensort wanted to set to, so it will
// use ReMakePtenDenseTensorFromVar to make pten tensor.
if (kernel_ctx->InputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
for (const auto& var : ins_vector) {
const auto& variable = var->Var();
tmp_inputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(variable, in_def));
}
kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs));
} else if (kernel_ctx->InputsSize() > start_idx) {
size_t input_size = kernel_ctx->InputsSize();
for (size_t j = 0; j < ins_vector.size(); ++j) {
if (input_size > start_idx + j) {
auto current_vector_size = kernel_ctx->InputsSize();
// If the memory needed is less than the current memory allocated, we will
// reuse the current memory by using ReMakePtenDenseTensorFromVar.
// Otherwise, we will create new storage.
for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
const auto& variable = ins_vector[offset]->Var();
if (current_vector_size > start_idx + offset) {
auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset);
if (input_ptr == nullptr) {
input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def);
} else {
experimental::ReMakePtenDenseTensorFromVar(
ins_vector[j]->Var(), in_def,
kernel_ctx->MutableInputAt<pten::DenseTensor>(start_idx + j));
// TODO(chentianyu03): When multi input kernel, open this code
/*
} else {
kernel_ctx->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(),
in_def));
*/
variable, in_def, kernel_ctx->MutableInputAt<pten::DenseTensor>(
start_idx + offset));
}
} else {
kernel_ctx->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(variable, in_def));
}
kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.inputs.size() is "
"`%d`.",
start_idx, kernel_ctx->InputsSize()));
}
kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
for (size_t i = 0; i < output_names.size(); ++i) {
......@@ -345,44 +329,22 @@ static void BuildDygraphPtenKernelContext(
size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
size_t end_idx = start_idx + outs_vector.size();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or
// outputs in pt_kernel_context_, the current size of input/output can be
// greater then the index of which the tensort wanted to set to, so it will
// use ReMakePtenDenseTensorFromVar to make pten tensor.
if (kernel_ctx->OutputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
for (auto& var : outs_vector) {
auto* variable = var->MutableVar();
tmp_outputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(variable, out_def));
}
kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs));
} else if (kernel_ctx->OutputsSize() > start_idx) {
size_t output_size = kernel_ctx->OutputsSize();
for (size_t j = 0; j < outs_vector.size(); ++j) {
if (output_size > i + j) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[j]->MutableVar(), out_def,
kernel_ctx->MutableOutputAt<pten::DenseTensor>(i + j));
// TODO(chentianyu03): When multi output kernel, open this code
/*
} else {
kernel_ctx->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(
outs_vector[j]->MutableVar(), out_def));
*/
}
auto current_vector_size = kernel_ctx->OutputsSize();
// If the memory needed is less than the current memory allocated, we will
// reuse the current memory by using ReMakePtenDenseTensorFromVar.
// Otherwise, we will create new storage.
for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
if (current_vector_size > start_idx + offset) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[offset]->MutableVar(), out_def,
kernel_ctx->MutableOutputAt<pten::DenseTensor>(start_idx + offset));
} else {
kernel_ctx->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(
outs_vector[offset]->MutableVar(), out_def));
}
kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.outputs.size() is "
"`%d`.",
start_idx, kernel_ctx->OutputsSize()));
}
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
}
for (size_t i = 0; i < attr_names.size(); ++i) {
......
......@@ -104,14 +104,18 @@ class KernelContext {
return static_cast<const TensorType&>(*(inputs_.at(idx)));
}
// Returns a mutable reference to the shared_ptr slot holding input `idx`,
// letting callers replace or null-out the tensor in place. Out-of-range
// indices throw std::out_of_range via vector::at.
std::shared_ptr<TensorBase>& MutableInputPtrAt(size_t idx) {
return inputs_.at(idx);
}
template <typename TensorType>
std::vector<TensorType> InputBetween(size_t start, size_t end) const {
std::vector<TensorType> MoveInputsBetween(size_t start, size_t end) {
// Moves (not copies) the inputs in the half-open range [start, end) out of
// the context and returns them by value; each consumed slot is reset to
// nullptr so later passes (e.g. ClearData) can skip already-moved entries.
std::vector<TensorType> v;
for (size_t i = start; i < end; ++i) {
// NOTE(review): dynamic_pointer_cast yields nullptr when inputs_.at(i)
// is not actually a TensorType; the unchecked dereference below would
// then be UB — presumably callers guarantee the stored type matches the
// kernel signature. TODO: confirm.
auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i));
v.emplace_back(std::move(*t.get()));
inputs_.at(i) = nullptr;
}
return v;
}
......@@ -123,12 +127,32 @@ class KernelContext {
return output_range_.at(idx);
}
std::pair<int, int>& MutableInputRangeAt(size_t idx) {
return input_range_[idx];
void AssignInputRange(std::pair<int, int>&& range, size_t idx) {
if (idx < input_range_.size()) {
input_range_[idx] = range;
} else if (idx == input_range_.size()) {
input_range_.emplace_back(range);
} else {
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"Invalid idx when trying to set InputRange, "
"index is `%d`, it is greater than the size(%d) of InputRange.",
idx,
input_range_.size()));
}
}
std::pair<int, int>& MutableOutputRangeAt(size_t idx) {
return output_range_[idx];
// Installs `range` as the output range for argument `idx`, growing
// output_range_ by exactly one slot when idx equals its current size.
// Throws PreconditionNotMet when idx would leave a gap (idx > size).
void AssignOutputRange(std::pair<int, int>&& range, size_t idx) {
  if (idx < output_range_.size()) {
    output_range_[idx] = range;
  } else if (idx == output_range_.size()) {
    output_range_.emplace_back(range);
  } else {
    // Fix: the original message was copy-pasted from AssignInputRange and
    // said "InputRange"; this is the output-range setter.
    PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
        "Invalid idx when trying to set OutputRange, "
        "index is `%d`, it is greater than the size(%d) of OutputRange.",
        idx,
        output_range_.size()));
  }
}
template <typename TensorType>
......@@ -165,8 +189,10 @@ class KernelContext {
// Only deal with DenseTensor now
void ClearData() {
for (auto& in : inputs_) {
CompatibleDenseTensorUtils::ClearStorage(
static_cast<DenseTensor*>(in.get()));
if (in) {
CompatibleDenseTensorUtils::ClearStorage(
static_cast<DenseTensor*>(in.get()));
}
}
for (auto& out : outputs_) {
CompatibleDenseTensorUtils::ClearStorage(
......
......@@ -88,26 +88,26 @@ using XPUContext = paddle::platform::XPUDeviceContext;
} \
}
#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \
template <typename... Tail> \
struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> { \
template <int dev_ctx_idx, \
int in_idx, \
int attr_idx, \
int out_idx, \
typename... PreviousArgs> \
static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \
static_assert(attr_idx == 0, \
"Kernel's Input should appear before Attributes."); \
static_assert(out_idx == 0, \
"Kernel's Input should appear before Outputs."); \
const std::pair<int, int> range = ctx->InputRangeAt(in_idx); \
std::vector<tensor_type> arg = std::move( \
ctx->InputBetween<tensor_type>(range.first, range.second)); \
KernelCallHelper<Tail...>:: \
template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>( \
ctx, pargs..., arg); \
} \
#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \
template <typename... Tail> \
struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> { \
template <int dev_ctx_idx, \
int in_idx, \
int attr_idx, \
int out_idx, \
typename... PreviousArgs> \
static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \
static_assert(attr_idx == 0, \
"Kernel's Input should appear before Attributes."); \
static_assert(out_idx == 0, \
"Kernel's Input should appear before Outputs."); \
const std::pair<int, int> range = ctx->InputRangeAt(in_idx); \
std::vector<tensor_type> arg = std::move( \
ctx->MoveInputsBetween<tensor_type>(range.first, range.second)); \
KernelCallHelper<Tail...>:: \
template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>( \
ctx, pargs..., arg); \
} \
}
#define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \
......
......@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CPUContext& dev_ctx,
const std::vector<int64_t>& shape,
DenseTensor* out) {
auto out_meta = InferMetaFromVecValue(x.meta(), shape);
if (&x == out) {
if (x.data() == out->data() && x.numel() == out->numel()) {
out->Resize(out_meta.dims);
return;
}
......@@ -185,3 +185,34 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
CPU,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
// CPU registrations for the reshape variants whose target shape arrives as
// tensor input(s) rather than an attribute:
//   "reshape2.host"    — shape from a single DenseTensor (ReshapeFromDT)
//   "reshape2.mulhost" — shape from a vector<DenseTensor> (ReshapeFromVectorDT)
//   "*.mid" variants   — same kernels but also producing XShape.
// Input 1 (the shape tensor) is pinned to CPU/INT32 — presumably so its
// values can be read on the host when inferring output dims. TODO(review):
// confirm against the kernel implementations.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
CPU,
ANY,
pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
CPU,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
CPU,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
CPU,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
......@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CUDAContext& dev_ctx,
const std::vector<int64_t>& shape,
DenseTensor* out) {
auto out_meta = InferMetaFromVecValue(x.meta(), shape);
if (&x == out) {
if (x.data() == out->data() && x.numel() == out->numel()) {
out->Resize(out_meta.dims);
return;
}
......@@ -193,3 +193,35 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
CUDA,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
// CUDA registrations mirroring the CPU ones for the tensor-shaped reshape
// variants ("reshape2.host", "reshape2.mulhost", and their "*.mid" XShape
// forms). Even on the CUDA backend, input 1 (the shape tensor) is pinned to
// CPU/INT32 — presumably because the shape values must be readable on the
// host to infer output dims. TODO(review): confirm.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
CUDA,
ANY,
pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
CUDA,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
CUDA,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
CUDA,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册