From d71b9ba75cdf65de2f3cbc753f178df404f9d2c9 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Thu, 29 Jul 2021 14:41:34 +0800
Subject: [PATCH] [NPU] Avoid cpu tensor freed before copying to npu completed
 (#34475)

---
 paddle/fluid/framework/tensor_util.cc         | 33 +++++++++++++++----
 .../fluid/operators/lookup_table_v2_op_npu.cc |  4 ---
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index d2616da7a12..15021b6267b 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/profiler.h"
 #ifdef PADDLE_WITH_MKLDNN
-#include "dnnl_debug.h"
+#include "dnnl_debug.h"  // NOLINT
 #endif
 
 namespace paddle {
@@ -112,11 +112,32 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
   }
   else if (platform::is_cpu_place(src_place) &&  // NOLINT
            platform::is_npu_place(dst_place)) {
-    auto stream =
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
-                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
-                 stream);
+    // 1. cpu tensor -> npu pinned tensor
+    platform::NPUPinnedPlace npu_pinned_place;
+    Tensor npu_pinned_tensor;
+    npu_pinned_tensor.Resize(src.dims());
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data(npu_pinned_place, src.type());
+    memory::Copy(npu_pinned_place, npu_pinned_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+
+    // 2. async copy npu pinned tensor -> npu tensor
+    memory::Copy(
+        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+        npu_pinned_place, npu_pinned_ptr, size,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+
+    // 3. record event
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation* allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(
+        allocation,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
   }
   else if (platform::is_npu_place(src_place) &&  // NOLINT
            platform::is_npu_place(dst_place)) {
diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index 020dbad5307..2a8f4746234 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -40,10 +40,6 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
         platform::errors::InvalidArgument("npu only accept LoDTensor"));
     output_t->mutable_data<T>(ctx.GetPlace());
 
-    // add copy ids to ensure ids_t is prepared.
-    std::vector<int> ids;
-    TensorToVector(*ids_t, ctx.device_context(), &ids);
-
     NpuOpRunner runner;
     runner.SetType("GatherV2")
         .AddInput(*table_t)
-- 
GitLab
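
Note on the pattern above: the fix works because the async host-to-device copy no longer reads the caller's CPU tensor directly. The data is first staged into an NPU pinned buffer owned by TensorCopy, and RecordEvent ties that buffer's lifetime to the copy stream, so the pinned allocation cannot be recycled until the device has finished reading it, even if the caller frees the source tensor as soon as TensorCopy returns. Below is a minimal stand-alone C++ sketch of this record-event idea; Event, PinnedBlock, and AsyncDeviceCopy are hypothetical stand-ins modeled with std::async, not Paddle's real API.

// Hypothetical stand-alone model of the lifetime hazard this patch fixes
// and the record-event pattern it applies. All names are illustrative.
#include <chrono>
#include <future>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <vector>

// Toy "stream event": becomes ready when the async copy finishes.
struct Event {
  std::shared_future<void> done;
};

// Toy pinned staging buffer carrying the event recorded on it.
struct PinnedBlock {
  std::vector<char> data;
  Event event;
};

// Simulated async device copy: like a device DMA, it keeps reading the
// pinned source well after the launching call has returned.
Event AsyncDeviceCopy(std::vector<char>* dst, const PinnedBlock* src) {
  auto fut = std::async(std::launch::async, [dst, src] {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    *dst = src->data;  // the pinned block must still be alive here
  });
  return Event{fut.share()};
}

int main() {
  std::vector<char> device;

  // 1. stage the host data in a "pinned" block
  auto pinned = std::make_shared<PinnedBlock>();
  pinned->data = {'n', 'p', 'u'};

  // 2. launch the async copy and 3. record its completion event on the
  // block, mirroring steps 1-3 in the TensorCopy hunk above
  pinned->event = AsyncDeviceCopy(&device, pinned.get());

  // The allocator's job: defer reclaiming the block until the event is
  // done. Releasing `pinned` before this wait would let the in-flight
  // copy read freed memory, which is exactly the bug the patch avoids.
  pinned->event.done.wait();
  pinned.reset();  // only now is the staging memory safe to release

  std::cout << std::string(device.begin(), device.end()) << std::endl;
  return 0;
}

In the real patch the NPUPinnedAllocator plays the wait-then-release role: RecordEvent marks the allocation, and the allocator reclaims it only after the recorded stream event completes. That guarantee is also why the synchronous TensorToVector workaround in lookup_table_v2_op_npu.cc can be deleted.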