Unverified commit 907433a7 authored by Huang Jiyi, committed by GitHub

[phi decoupling] remove fluid gpu_info usage in phi (#51699)

* remove fluid thread_data_registry

* update

* fix bug
Parent 3f3372b6
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <thread>
#include <type_traits>
#include <unordered_map>
namespace paddle {
namespace framework {
template <typename T>
class ThreadDataRegistry {
public:
// Singleton
static ThreadDataRegistry& GetInstance() {
static ThreadDataRegistry instance;
return instance;
}
T* GetMutableCurrentThreadData() { return &CurrentThreadData(); }
const T& GetCurrentThreadData() { return CurrentThreadData(); }
template <typename Alias = T,
typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
void SetCurrentThreadData(const T& val) {
CurrentThreadData() = val;
}
// Returns a current snapshot of all threads' data. Make sure no thread is
// created or destroyed while using it.
template <
typename Alias = T,
typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
return impl_->GetAllThreadDataByValue();
}
// Returns a current snapshot of all threads' data. Make sure no thread is
// created or destroyed while using it.
std::unordered_map<uint64_t, std::reference_wrapper<T>>
GetAllThreadDataByRef() {
return impl_->GetAllThreadDataByRef();
}
private:
// types
// Lock types
#if defined(__clang__) || defined(__GNUC__) // CLANG or GCC
#ifndef __APPLE__
#if __cplusplus >= 201703L
using LockType = std::shared_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
#elif __cplusplus >= 201402L
using LockType = std::shared_timed_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
// Special case : mac. https://github.com/facebook/react-native/issues/31250
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
#elif defined(_MSC_VER) // MSVC
#if _MSVC_LANG >= 201703L
using LockType = std::shared_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
#elif _MSVC_LANG >= 201402L
using LockType = std::shared_timed_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
#else // other compilers
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
class ThreadDataHolder;
class ThreadDataRegistryImpl {
public:
void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) {
std::lock_guard<LockType> guard(lock_);
tid_map_[tid] = tls_obj;
}
void UnregisterData(uint64_t tid) {
std::lock_guard<LockType> guard(lock_);
tid_map_.erase(tid);
}
template <
typename Alias = T,
typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
std::unordered_map<uint64_t, T> data_copy;
SharedLockGuardType guard(lock_);
data_copy.reserve(tid_map_.size());
for (auto& kv : tid_map_) {
data_copy.emplace(kv.first, kv.second->GetData());
}
return data_copy;
}
std::unordered_map<uint64_t, std::reference_wrapper<T>>
GetAllThreadDataByRef() {
std::unordered_map<uint64_t, std::reference_wrapper<T>> data_ref;
SharedLockGuardType guard(lock_);
data_ref.reserve(tid_map_.size());
for (auto& kv : tid_map_) {
data_ref.emplace(kv.first, std::ref(kv.second->GetData()));
}
return data_ref;
}
private:
LockType lock_;
std::unordered_map<uint64_t, ThreadDataHolder*> tid_map_; // not owned
};
class ThreadDataHolder {
public:
explicit ThreadDataHolder(
std::shared_ptr<ThreadDataRegistryImpl> registry) {
registry_ = std::move(registry);
tid_ = std::hash<std::thread::id>()(std::this_thread::get_id());
registry_->RegisterData(tid_, this);
}
~ThreadDataHolder() { registry_->UnregisterData(tid_); }
T& GetData() { return data_; }
private:
std::shared_ptr<ThreadDataRegistryImpl> registry_;
uint64_t tid_;
T data_;
};
// methods
ThreadDataRegistry() { impl_ = std::make_shared<ThreadDataRegistryImpl>(); }
ThreadDataRegistry(const ThreadDataRegistry&) = delete;
ThreadDataRegistry& operator=(const ThreadDataRegistry&) = delete;
T& CurrentThreadData() {
static thread_local ThreadDataHolder thread_data(impl_);
return thread_data.GetData();
}
// data
std::shared_ptr<ThreadDataRegistryImpl> impl_;
};
} // namespace framework
} // namespace paddle
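For context only (not part of this commit), here is a minimal usage sketch of the ThreadDataRegistry shown above. The element type int64_t, the single-threaded setup, and the include path are illustrative assumptions; after this commit the same template is also reachable as phi::ThreadDataRegistry, as the include changes below show.

// Usage sketch (illustrative, not Paddle code): one thread registers its
// thread-local slot and then reads back a snapshot of all registered threads.
#include <cstdint>
#include <iostream>
#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"

int main() {
  // First access from this thread lazily creates its ThreadDataHolder and
  // registers it with the shared impl.
  auto& registry =
      paddle::framework::ThreadDataRegistry<int64_t>::GetInstance();
  registry.SetCurrentThreadData(42);
  *registry.GetMutableCurrentThreadData() += 1;
  // Snapshot by value; the header warns that no thread may be created or
  // destroyed while the snapshot is taken.
  for (const auto& kv : registry.GetAllThreadDataByValue()) {
    std::cout << "tid=" << kv.first << " value=" << kv.second << std::endl;
  }
  return 0;
}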
......@@ -18,15 +18,15 @@ limitations under the License. */
#include <map>
#include <string>
#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/common/thread_data_registry.h"
namespace paddle {
namespace memory {
using framework::ThreadDataRegistry;
using phi::ThreadDataRegistry;
struct ThreadLocalStatBase {
int64_t current{0};
......
......@@ -61,8 +61,6 @@ PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
"Whether to print the message of gpu memory usage "
"MB as a unit of measurement.");
constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_GPU_MEM_STAT;
namespace paddle {
namespace platform {
......@@ -77,20 +75,7 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
}
size_t GpuAvailableMemToAlloc() {
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
size_t reserving =
static_cast<size_t>(fraction_reserve_gpu_memory * available);
// If available size is less than minimum chunk size, no usable memory exists
size_t available_to_alloc = available - reserving;
size_t min_chunk_size = GpuMinChunkSize();
if (available_to_alloc < min_chunk_size) {
available_to_alloc = 0;
}
VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
<< "M, " << (available_to_alloc >> 20) << "M available to allocate";
return available_to_alloc;
return phi::backends::gpu::GpuAvailableMemToAlloc();
}
size_t GpuMaxAllocSize() {
......@@ -124,6 +109,8 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
size_t GpuMinChunkSize() { return phi::backends::gpu::GpuMinChunkSize(); }
size_t GpuMaxChunkSize() {
size_t max_chunk_size = GpuMaxAllocSize();
VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
......
......@@ -82,7 +82,8 @@ size_t GpuInitAllocSize();
//! Get the re-allocation size of current GPU device.
size_t GpuReallocSize();
using phi::backends::gpu::GpuMinChunkSize;
//! Get the minimum chunk size for GPU buddy allocator.
size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize();
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cupti.h"
......@@ -468,6 +469,9 @@ void InitMemoryMethod() {
memory_method->copy = paddle::memory::Copy<phi::Place, phi::Place>;
memory_method->device_memory_stat_current_value =
paddle::memory::DeviceMemoryStatCurrentValue;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
#endif
memory_utils.Init(std::move(memory_method));
});
}
......
......@@ -18,7 +18,6 @@
#include <type_traits>
#include <vector>
#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/phi/api/profiler/host_event_recorder.h"
......
......@@ -18,6 +18,9 @@ limitations under the License. */
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/phi/common/memory_utils.h"
DECLARE_string(selected_gpus);
......@@ -56,6 +59,30 @@ std::vector<int> GetSelectedDevices() {
return devices;
}
constexpr static float fraction_reserve_gpu_memory = 0.05f;
size_t GpuAvailableMemToAlloc() {
size_t total = 0;
size_t available = 0;
memory_utils::GpuMemoryUsage(&available, &total);
size_t reserving =
static_cast<size_t>(fraction_reserve_gpu_memory * available);
// If available size is less than minimum chunk size, no usable memory exists
size_t available_to_alloc = available - reserving;
size_t min_chunk_size = GpuMinChunkSize();
if (available_to_alloc < min_chunk_size) {
available_to_alloc = 0;
}
VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
<< "M, " << (available_to_alloc >> 20) << "M available to allocate";
return available_to_alloc;
}
size_t GpuMinChunkSize() {
  // The minimum chunk size allowed to be allocated is 256 bytes.
return 1 << 8;
}
} // namespace gpu
} // namespace backends
} // namespace phi
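As a worked example with illustrative numbers: if memory_utils::GpuMemoryUsage reports 8192 MB available, the 5% reserve is about 410 MB, so GpuAvailableMemToAlloc returns roughly 7782 MB; only if the remainder fell below GpuMinChunkSize (256 bytes) would it return 0.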
......@@ -70,11 +70,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
//! Set the GPU device id for next execution.
void SetDeviceId(int device_id);
//! Get the memory available to allocate, which is the available GPU memory
//! minus the reserved amount.
size_t GpuAvailableMemToAlloc();
//! Get the minimum chunk size for GPU buddy allocator.
inline size_t GpuMinChunkSize() {
// The minimum chunk size allowed to be allocated is 256 bytes.
return 1 << 8;
}
size_t GpuMinChunkSize();
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst,
......
......@@ -68,6 +68,13 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
dev_id);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void GpuMemoryUsage(size_t* available, size_t* total) {
return MemoryUtils::Instance().GpuMemoryUsage(available, total);
}
#endif
} // namespace memory_utils
} // namespace phi
......@@ -113,6 +113,16 @@ struct MemoryInterface {
*/
int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
int dev_id);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* @brief Get the memory usage of the current GPU device.
*
* @param[size_t] available device memory currently available to allocate
* @param[size_t] total total memory of the device
*/
void (*gpu_memory_usage)(size_t* available, size_t* total);
#endif
};
class MemoryUtils {
......@@ -234,6 +244,18 @@ class MemoryUtils {
return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void GpuMemoryUsage(size_t* available, size_t* total) {
CheckMemoryMethod();
PADDLE_ENFORCE_NOT_NULL(
memory_method_->gpu_memory_usage,
phi::errors::Unavailable(
"gpu_memory_usage method in memory_method_ is not initiazed "
"yet. You need init it first."));
return memory_method_->gpu_memory_usage(available, total);
}
#endif
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
memory_method_.get(),
......@@ -288,7 +310,13 @@ void Copy(const Place& dst_place,
const Place& src_place,
const void* src,
size_t num);
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void GpuMemoryUsage(size_t* available, size_t* total);
#endif
} // namespace memory_utils
} // namespace phi
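The hunks above show the decoupling mechanism this commit relies on: phi declares a function pointer in MemoryInterface, fluid installs the real implementation in InitMemoryMethod, and phi-side code only calls the thin memory_utils wrapper, so phi never links against fluid symbols directly. Below is a stripped-down sketch of that pattern; the names MemoryInterfaceSketch, MemoryUtilsSketch, and FluidGpuMemoryUsage are illustrative, not actual Paddle symbols, and the exception stands in for PADDLE_ENFORCE_NOT_NULL.

#include <cstddef>
#include <stdexcept>

// Lower layer (phi side): only sees a table of function pointers.
struct MemoryInterfaceSketch {
  void (*gpu_memory_usage)(std::size_t* available, std::size_t* total) = nullptr;
};

class MemoryUtilsSketch {
 public:
  static MemoryUtilsSketch& Instance() {
    static MemoryUtilsSketch inst;
    return inst;
  }
  void Init(MemoryInterfaceSketch iface) { iface_ = iface; }
  void GpuMemoryUsage(std::size_t* available, std::size_t* total) {
    // Mirrors the null check the wrapper above performs with PADDLE_ENFORCE_NOT_NULL.
    if (iface_.gpu_memory_usage == nullptr) {
      throw std::runtime_error("gpu_memory_usage is not initialized yet.");
    }
    iface_.gpu_memory_usage(available, total);
  }

 private:
  MemoryInterfaceSketch iface_;
};

// Upper layer (fluid side): supplies the real implementation (stubbed here).
void FluidGpuMemoryUsage(std::size_t* available, std::size_t* total) {
  *available = 6ull << 30;  // pretend 6 GB free
  *total = 8ull << 30;      // pretend 8 GB total
}

int main() {
  MemoryInterfaceSketch iface;
  iface.gpu_memory_usage = FluidGpuMemoryUsage;  // registration, as in InitMemoryMethod
  MemoryUtilsSketch::Instance().Init(iface);

  std::size_t available = 0, total = 0;
  MemoryUtilsSketch::Instance().GpuMemoryUsage(&available, &total);  // phi-side call
  return available <= total ? 0 : 1;
}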
......@@ -20,7 +20,6 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h"
......@@ -53,7 +52,7 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved =
memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
int64_t availble = phi::backends::gpu::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved)
<< " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
......
......@@ -627,7 +627,6 @@ void ConvCudnnGradKernel(const Context& ctx,
compute_format == phi::backends::gpu::DataLayout::kNHWC
? phi::backends::gpu::DataLayout::kNHWC
: phi::backends::gpu::DataLayout::kNCHW;
// TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
if (transformed_input.dims().size() == 5) {
layout = compute_format == phi::backends::gpu::DataLayout::kNHWC
? phi::backends::gpu::DataLayout::kNDHWC
......