Unverified commit 907433a7 authored by Huang Jiyi, committed by GitHub

[phi decoupling] remove fluid gpu_info usage in phi (#51699)

* remove fluid thread_data_registry

* update

* fix bug
Parent 3f3372b6
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <thread>
#include <type_traits>
#include <unordered_map>
namespace paddle {
namespace framework {
template <typename T>
class ThreadDataRegistry {
public:
// Singleton
static ThreadDataRegistry& GetInstance() {
static ThreadDataRegistry instance;
return instance;
}
T* GetMutableCurrentThreadData() { return &CurrentThreadData(); }
const T& GetCurrentThreadData() { return CurrentThreadData(); }
template <typename Alias = T,
typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
void SetCurrentThreadData(const T& val) {
CurrentThreadData() = val;
}
// Returns a current snapshot of all threads' data. Make sure no thread is
// created or destroyed while using it.
template <
typename Alias = T,
typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
return impl_->GetAllThreadDataByValue();
}
// Returns a current snapshot of all threads' data. Make sure no thread is
// created or destroyed while using it.
std::unordered_map<uint64_t, std::reference_wrapper<T>>
GetAllThreadDataByRef() {
return impl_->GetAllThreadDataByRef();
}
private:
// types
// Lock types
#if defined(__clang__) || defined(__GNUC__) // CLANG or GCC
#ifndef __APPLE__
#if __cplusplus >= 201703L
using LockType = std::shared_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
#elif __cplusplus >= 201402L
using LockType = std::shared_timed_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
// Special case : mac. https://github.com/facebook/react-native/issues/31250
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
#elif defined(_MSC_VER) // MSVC
#if _MSVC_LANG >= 201703L
using LockType = std::shared_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
#elif _MSVC_LANG >= 201402L
using LockType = std::shared_timed_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
#else // other compilers
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
class ThreadDataHolder;
class ThreadDataRegistryImpl {
public:
void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) {
std::lock_guard<LockType> guard(lock_);
tid_map_[tid] = tls_obj;
}
void UnregisterData(uint64_t tid) {
std::lock_guard<LockType> guard(lock_);
tid_map_.erase(tid);
}
template <
typename Alias = T,
typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
std::unordered_map<uint64_t, T> data_copy;
SharedLockGuardType guard(lock_);
data_copy.reserve(tid_map_.size());
for (auto& kv : tid_map_) {
data_copy.emplace(kv.first, kv.second->GetData());
}
return data_copy;
}
std::unordered_map<uint64_t, std::reference_wrapper<T>>
GetAllThreadDataByRef() {
std::unordered_map<uint64_t, std::reference_wrapper<T>> data_ref;
SharedLockGuardType guard(lock_);
data_ref.reserve(tid_map_.size());
for (auto& kv : tid_map_) {
data_ref.emplace(kv.first, std::ref(kv.second->GetData()));
}
return data_ref;
}
private:
LockType lock_;
std::unordered_map<uint64_t, ThreadDataHolder*> tid_map_; // not owned
};
class ThreadDataHolder {
public:
explicit ThreadDataHolder(
std::shared_ptr<ThreadDataRegistryImpl> registry) {
registry_ = std::move(registry);
tid_ = std::hash<std::thread::id>()(std::this_thread::get_id());
registry_->RegisterData(tid_, this);
}
~ThreadDataHolder() { registry_->UnregisterData(tid_); }
T& GetData() { return data_; }
private:
std::shared_ptr<ThreadDataRegistryImpl> registry_;
uint64_t tid_;
T data_;
};
// methods
ThreadDataRegistry() { impl_ = std::make_shared<ThreadDataRegistryImpl>(); }
ThreadDataRegistry(const ThreadDataRegistry&) = delete;
ThreadDataRegistry& operator=(const ThreadDataRegistry&) = delete;
T& CurrentThreadData() {
static thread_local ThreadDataHolder thread_data(impl_);
return thread_data.GetData();
}
// data
std::shared_ptr<ThreadDataRegistryImpl> impl_;
};
} // namespace framework
} // namespace paddle
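For context only (not part of this commit), here is a minimal usage sketch of the ThreadDataRegistry shown above. The element type int64_t, the single-threaded setup, and the include path are illustrative assumptions; after this commit the same template is also reachable as phi::ThreadDataRegistry, as the include changes below show.

// Usage sketch (illustrative, not Paddle code): one thread registers its
// thread-local slot and then reads back a snapshot of all registered threads.
#include <cstdint>
#include <iostream>
#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"

int main() {
  // First access from this thread lazily creates its ThreadDataHolder and
  // registers it with the shared impl.
  auto& registry =
      paddle::framework::ThreadDataRegistry<int64_t>::GetInstance();
  registry.SetCurrentThreadData(42);
  *registry.GetMutableCurrentThreadData() += 1;
  // Snapshot by value; the header warns that no thread may be created or
  // destroyed while the snapshot is taken.
  for (const auto& kv : registry.GetAllThreadDataByValue()) {
    std::cout << "tid=" << kv.first << " value=" << kv.second << std::endl;
  }
  return 0;
}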
......@@ -18,15 +18,15 @@ limitations under the License. */
#include <map>
#include <string>
#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/common/thread_data_registry.h"
namespace paddle {
namespace memory {
using framework::ThreadDataRegistry;
using phi::ThreadDataRegistry;
struct ThreadLocalStatBase {
int64_t current{0};
......
......@@ -61,8 +61,6 @@ PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
"Whether to print the message of gpu memory usage "
"MB as a unit of measurement.");
constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_GPU_MEM_STAT;
namespace paddle {
namespace platform {
......@@ -77,20 +75,7 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
}
size_t GpuAvailableMemToAlloc() {
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
size_t reserving =
static_cast<size_t>(fraction_reserve_gpu_memory * available);
// If available size is less than minimum chunk size, no usable memory exists
size_t available_to_alloc = available - reserving;
size_t min_chunk_size = GpuMinChunkSize();
if (available_to_alloc < min_chunk_size) {
available_to_alloc = 0;
}
VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
<< "M, " << (available_to_alloc >> 20) << "M available to allocate";
return available_to_alloc;
return phi::backends::gpu::GpuAvailableMemToAlloc();
}
size_t GpuMaxAllocSize() {
......@@ -124,6 +109,8 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
size_t GpuMinChunkSize() { return phi::backends::gpu::GpuMinChunkSize(); }
size_t GpuMaxChunkSize() {
size_t max_chunk_size = GpuMaxAllocSize();
VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
......
......@@ -82,7 +82,8 @@ size_t GpuInitAllocSize();
//! Get the re-allocation size of current GPU device.
size_t GpuReallocSize();
using phi::backends::gpu::GpuMinChunkSize;
//! Get the minimum chunk size for GPU buddy allocator.
size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize();
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cupti.h"
......@@ -468,6 +469,9 @@ void InitMemoryMethod() {
memory_method->copy = paddle::memory::Copy<phi::Place, phi::Place>;
memory_method->device_memory_stat_current_value =
paddle::memory::DeviceMemoryStatCurrentValue;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
#endif
memory_utils.Init(std::move(memory_method));
});
}
......
......@@ -18,7 +18,6 @@
#include <type_traits>
#include <vector>
#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/phi/api/profiler/host_event_recorder.h"
......
......@@ -18,6 +18,9 @@ limitations under the License. */
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/phi/common/memory_utils.h"
DECLARE_string(selected_gpus);
......@@ -56,6 +59,30 @@ std::vector<int> GetSelectedDevices() {
return devices;
}
constexpr static float fraction_reserve_gpu_memory = 0.05f;
size_t GpuAvailableMemToAlloc() {
size_t total = 0;
size_t available = 0;
memory_utils::GpuMemoryUsage(&available, &total);
size_t reserving =
static_cast<size_t>(fraction_reserve_gpu_memory * available);
// If available size is less than minimum chunk size, no usable memory exists
size_t available_to_alloc = available - reserving;
size_t min_chunk_size = GpuMinChunkSize();
if (available_to_alloc < min_chunk_size) {
available_to_alloc = 0;
}
VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
<< "M, " << (available_to_alloc >> 20) << "M available to allocate";
return available_to_alloc;
}
size_t GpuMinChunkSize() {
  // The minimum chunk size allowed to be allocated is 256 bytes.
return 1 << 8;
}
} // namespace gpu
} // namespace backends
} // namespace phi
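As a worked example with illustrative numbers: if memory_utils::GpuMemoryUsage reports 8192 MB available, the 5% reserve is about 410 MB, so GpuAvailableMemToAlloc returns roughly 7782 MB; only if the remainder fell below GpuMinChunkSize (256 bytes) would it return 0.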
......@@ -70,11 +70,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
//! Set the GPU device id for next execution.
void SetDeviceId(int device_id);
//! Get the memory available to allocate, which is the available GPU memory
//! minus the reserved amount.
size_t GpuAvailableMemToAlloc();
//! Get the minimum chunk size for GPU buddy allocator.
inline size_t GpuMinChunkSize() {
// The minimum chunk size allowed to be allocated is 256 bytes.
return 1 << 8;
}
size_t GpuMinChunkSize();
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst,
......
......@@ -68,6 +68,13 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
dev_id);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void GpuMemoryUsage(size_t* available, size_t* total) {
return MemoryUtils::Instance().GpuMemoryUsage(available, total);
}
#endif
} // namespace memory_utils
} // namespace phi
......@@ -113,6 +113,16 @@ struct MemoryInterface {
*/
int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
int dev_id);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* @brief Get the memory usage of the current GPU device.
*
* @param[size_t] available device memory currently available to allocate
* @param[size_t] total total memory of the device
*/
void (*gpu_memory_usage)(size_t* available, size_t* total);
#endif
};
class MemoryUtils {
......@@ -234,6 +244,18 @@ class MemoryUtils {
return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void GpuMemoryUsage(size_t* available, size_t* total) {
CheckMemoryMethod();
PADDLE_ENFORCE_NOT_NULL(
memory_method_->gpu_memory_usage,
phi::errors::Unavailable(
"gpu_memory_usage method in memory_method_ is not initiazed "
"yet. You need init it first."));
return memory_method_->gpu_memory_usage(available, total);
}
#endif
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
memory_method_.get(),
......@@ -288,7 +310,13 @@ void Copy(const Place& dst_place,
const Place& src_place,
const void* src,
size_t num);
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void GpuMemoryUsage(size_t* available, size_t* total);
#endif
} // namespace memory_utils
} // namespace phi
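The hunks above show the decoupling mechanism this commit relies on: phi declares a function pointer in MemoryInterface, fluid installs the real implementation in InitMemoryMethod, and phi-side code only calls the thin memory_utils wrapper, so phi never links against fluid symbols directly. Below is a stripped-down sketch of that pattern; the names MemoryInterfaceSketch, MemoryUtilsSketch, and FluidGpuMemoryUsage are illustrative, not actual Paddle symbols, and the exception stands in for PADDLE_ENFORCE_NOT_NULL.

#include <cstddef>
#include <stdexcept>

// Lower layer (phi side): only sees a table of function pointers.
struct MemoryInterfaceSketch {
  void (*gpu_memory_usage)(std::size_t* available, std::size_t* total) = nullptr;
};

class MemoryUtilsSketch {
 public:
  static MemoryUtilsSketch& Instance() {
    static MemoryUtilsSketch inst;
    return inst;
  }
  void Init(MemoryInterfaceSketch iface) { iface_ = iface; }
  void GpuMemoryUsage(std::size_t* available, std::size_t* total) {
    // Mirrors the null check the wrapper above performs with PADDLE_ENFORCE_NOT_NULL.
    if (iface_.gpu_memory_usage == nullptr) {
      throw std::runtime_error("gpu_memory_usage is not initialized yet.");
    }
    iface_.gpu_memory_usage(available, total);
  }

 private:
  MemoryInterfaceSketch iface_;
};

// Upper layer (fluid side): supplies the real implementation (stubbed here).
void FluidGpuMemoryUsage(std::size_t* available, std::size_t* total) {
  *available = 6ull << 30;  // pretend 6 GB free
  *total = 8ull << 30;      // pretend 8 GB total
}

int main() {
  MemoryInterfaceSketch iface;
  iface.gpu_memory_usage = FluidGpuMemoryUsage;  // registration, as in InitMemoryMethod
  MemoryUtilsSketch::Instance().Init(iface);

  std::size_t available = 0, total = 0;
  MemoryUtilsSketch::Instance().GpuMemoryUsage(&available, &total);  // phi-side call
  return available <= total ? 0 : 1;
}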
......@@ -20,7 +20,6 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h"
......@@ -53,7 +52,7 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved =
memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
int64_t availble = phi::backends::gpu::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved)
<< " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
......
......@@ -627,7 +627,6 @@ void ConvCudnnGradKernel(const Context& ctx,
compute_format == phi::backends::gpu::DataLayout::kNHWC
? phi::backends::gpu::DataLayout::kNHWC
: phi::backends::gpu::DataLayout::kNCHW;
// TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
if (transformed_input.dims().size() == 5) {
layout = compute_format == phi::backends::gpu::DataLayout::kNHWC
? phi::backends::gpu::DataLayout::kNDHWC
......