提交 02270a62 编写于 作者: X xiaojun.lin

MS-568 Fix GpuResource free error


Former-commit-id: 52acd35e93a5293c70c45bb681fc54046b43a2cb
上级 aec55278
......@@ -5,6 +5,7 @@ Please mark all change in change log and use the ticket from JIRA.
# Milvus 0.5.0 (TODO)
## Bug
- MS-568 - Fix GpuResource free error
## Improvement
- MS-552 - Add and change the easylogging library
......
......@@ -48,6 +48,7 @@ set(index_srcs
knowhere/index/vector_index/nsg/nsg_io.cpp
knowhere/index/vector_index/nsg/utils.cpp
knowhere/index/vector_index/cloner.cpp
knowhere/index/vector_index/FaissGpuResourceMgr.cpp
)
set(depend_libs
......
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "FaissGpuResourceMgr.h"
namespace zilliz {
namespace knowhere {
/// Returns the single shared manager object.
/// The object is a function-local static, so it is constructed on the first call.
FaissGpuResourceMgr &FaissGpuResourceMgr::GetInstance() {
    static FaissGpuResourceMgr manager;
    return manager;
}
/// Sets the temporary-memory size on the faiss resource wrapped by `resource`.
///
/// @param resource  wrapper whose `faiss_res` is configured; must not be null
///                  (it is dereferenced without a check).
/// @param device_id device whose configured default is used when `size` is 0.
/// @param size      explicit size to apply; 0 means "use the device default".
void FaissGpuResourceMgr::AllocateTempMem(ResPtr &resource,
                                          const int64_t &device_id,
                                          const int64_t &size) {
    // An explicit, non-zero request always takes precedence.
    if (size != 0) {
        resource->faiss_res->setTempMemory(size);
        return;
    }

    // Otherwise fall back to the value registered through InitDevice().
    const auto configured = devices_params_.find(device_id);
    if (configured == devices_params_.end()) {
        // Unknown device: leave the resource untouched
        // (original author's note: "allocate when use").
        return;
    }
    resource->faiss_res->setTempMemory(configured->second.temp_mem_size);
}
void FaissGpuResourceMgr::InitDevice(int64_t device_id,
int64_t pin_mem_size,
int64_t temp_mem_size,
int64_t res_num) {
DeviceParams params;
params.pinned_mem_size = pin_mem_size;
params.temp_mem_size = temp_mem_size;
params.resource_num = res_num;
devices_params_.emplace(device_id, params);
}
void FaissGpuResourceMgr::InitResource() {
if(is_init) return ;
is_init = true;
//std::cout << "InitResource" << std::endl;
for(auto& device : devices_params_) {
auto& device_id = device.first;
mutex_cache_.emplace(device_id, std::make_unique<std::mutex>());
//std::cout << "Device Id: " << device_id << std::endl;
auto& device_param = device.second;
auto& bq = idle_map_[device_id];
for (int64_t i = 0; i < device_param.resource_num; ++i) {
//std::cout << "Resource Id: " << i << std::endl;
auto raw_resource = std::make_shared<faiss::gpu::StandardGpuResources>();
// TODO(linxj): enable set pinned memory
auto res_wrapper = std::make_shared<Resource>(raw_resource);
AllocateTempMem(res_wrapper, device_id, 0);
bq.Put(res_wrapper);
}
}
//std::cout << "End initResource" << std::endl;
}
/// Takes one resource out of the idle pool of `device_id`.
///
/// Triggers InitResource() first, then applies `alloc_size` (or the device
/// default when it is 0) through AllocateTempMem().
///
/// @return the resource taken from the pool, or nullptr when no pool exists
///         for `device_id`.
/// NOTE(review): `Take()` presumably blocks while the pool is empty —
/// BlockingQueue is not visible here, confirm.
ResPtr FaissGpuResourceMgr::GetRes(const int64_t &device_id,
                                   const int64_t &alloc_size) {
    InitResource();

    const auto pool_it = idle_map_.find(device_id);
    if (pool_it == idle_map_.end()) {
        return nullptr;
    }

    ResPtr taken = pool_it->second.Take();
    AllocateTempMem(taken, device_id, alloc_size);
    return taken;
}
/// Puts `res` back into the idle pool of `device_id`.
/// Silently does nothing when no pool exists for that device.
void FaissGpuResourceMgr::MoveToIdle(const int64_t &device_id, const ResPtr &res) {
    const auto pool_it = idle_map_.find(device_id);
    if (pool_it == idle_map_.end()) {
        return;
    }
    pool_it->second.Put(res);
}
void FaissGpuResourceMgr::Free() {
for (auto &item : idle_map_) {
auto& bq = item.second;
while (!bq.Empty()) {
bq.Take();
}
}
is_init = false;
}
/// Prints, for every device pool, the device id and the number of resources
/// currently idle in it to stdout, one device per line.
void
FaissGpuResourceMgr::Dump() {
    for (auto &item : idle_map_) {
        auto& bq = item.second;
        // Terminate each record; without it the entries of all devices run
        // together on a single unreadable line.
        std::cout << "device_id: " << item.first
                  << ", resource count:" << bq.Size() << std::endl;
    }
}
} // knowhere
} // zilliz
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <mutex>
#include <map>
#include <faiss/gpu/StandardGpuResources.h>
#include "src/utils/BlockingQueue.h"
namespace zilliz {
namespace knowhere {
struct Resource {
explicit Resource(std::shared_ptr<faiss::gpu::StandardGpuResources> &r) : faiss_res(r) {
static int64_t global_id = 0;
id = global_id++;
}
std::shared_ptr<faiss::gpu::StandardGpuResources> faiss_res;
int64_t id;
std::mutex mutex;
};
using ResPtr = std::shared_ptr<Resource>;
using ResWPtr = std::weak_ptr<Resource>;
/// Manager holding, per device id, a configuration record, a mutex and a
/// queue of idle GPU resources. Normally used through GetInstance().
class FaissGpuResourceMgr {
    // ResScope locks the per-device mutexes in mutex_cache_ directly.
    friend class ResScope;

 public:
    using ResBQ = zilliz::milvus::server::BlockingQueue<ResPtr>;

    /// Configuration recorded for one device by InitDevice().
    struct DeviceParams {
        int64_t temp_mem_size = 0;
        int64_t pinned_mem_size = 0;
        int64_t resource_num = 2;
    };

    /// Access to the shared manager object.
    static FaissGpuResourceMgr &
    GetInstance();

    /// Releases the idle GPU resources. Per the original author's note this
    /// avoids a cudaGetDevice error on deallocation and should be invoked
    /// before main() returns.
    void
    Free();

    /// Applies `size` (or, when it is 0, the configured size of `device_id`)
    /// as the temporary memory of `resource`.
    void
    AllocateTempMem(ResPtr &resource, const int64_t& device_id, const int64_t& size);

    /// Records the parameters of a device for a later InitResource().
    void
    InitDevice(int64_t device_id,
               int64_t pin_mem_size = 0,
               int64_t temp_mem_size = 0,
               int64_t res_num = 2);

    /// Creates the resources of all recorded devices (no-op once initialised).
    void
    InitResource();

    /// Hands out an idle resource of `device_id`; used by build and
    /// copy-to-GPU paths according to the original comment.
    ResPtr
    GetRes(const int64_t &device_id, const int64_t& alloc_size = 0);

    /// Returns `res` to the idle queue of `device_id`.
    void
    MoveToIdle(const int64_t &device_id, const ResPtr& res);

    /// Prints the idle-resource count of every device.
    void
    Dump();

 protected:
    bool is_init = false;                                          // set by InitResource(), cleared by Free()
    std::map<int64_t ,std::unique_ptr<std::mutex>> mutex_cache_;   // device id -> device mutex
    std::map<int64_t, DeviceParams> devices_params_;               // device id -> configuration
    std::map<int64_t, ResBQ> idle_map_;                            // device id -> idle resources
};
/// Scope guard for a GPU resource.
///
/// On construction it locks the resource's mutex and, when it "owns" the
/// device, the per-device mutex kept by FaissGpuResourceMgr. The destructor
/// releases exactly what was locked and, for the ResPtr constructor, hands the
/// resource back to the manager's idle pool.
class ResScope {
 public:
    /// Guard over a resource obtained from FaissGpuResourceMgr::GetRes().
    /// The resource is returned to the idle pool when the scope ends.
    ///
    /// @param res       resource to lock (a null pointer is tolerated and ignored).
    /// @param device_id device the resource belongs to.
    /// @param isown     when true the per-device mutex is locked as well.
    ResScope(ResPtr &res, const int64_t& device_id, const bool& isown)
        : resource(res), device_id(device_id), move(true), own(isown) {
        Lock();
    }

    /// Guard over a weakly referenced resource (per the original note: used
    /// for search, taking ownership of both the resource and the device).
    /// The resource is NOT moved to the idle pool on destruction.
    /// If `res` has expired, only the device mutex is taken.
    ResScope(ResWPtr &res, const int64_t &device_id)
        : device_id(device_id), move(false), own(true) {
        resource = res.lock();
        Lock();
    }

    // A copied guard would unlock the same mutexes twice.
    ResScope(const ResScope &) = delete;
    ResScope &operator=(const ResScope &) = delete;

    /// Acquires the device mutex (if owning) and then the resource mutex.
    /// Called by the constructors; remembers what was locked for the destructor.
    void Lock() {
        if (own) {
            // Use find() rather than operator[]: the latter would insert a null
            // mutex for an unknown device and then dereference it.
            auto &cache = FaissGpuResourceMgr::GetInstance().mutex_cache_;
            auto found = cache.find(device_id);
            if (found != cache.end() && found->second) {
                device_mutex = found->second.get();
                device_mutex->lock();
            }
        }
        if (resource) {
            resource->mutex.lock();
            resource_locked = true;
        }
    }

    ~ResScope() {
        if (device_mutex != nullptr) device_mutex->unlock();
        // Unlock before publishing the resource so the idle pool never holds
        // a resource whose mutex is still locked by this scope.
        if (resource_locked) resource->mutex.unlock();
        if (move && resource) FaissGpuResourceMgr::GetInstance().MoveToIdle(device_id, resource);
    }

 private:
    ResPtr resource;  // hold resource until destruction
    int64_t device_id;
    bool move = true;
    bool own = false;
    std::mutex *device_mutex = nullptr;  // device mutex actually locked, if any
    bool resource_locked = false;        // true once resource->mutex is held
};
} // knowhere
} // zilliz
\ No newline at end of file
......@@ -67,9 +67,9 @@ void GPUIVF::set_index_model(IndexModelPtr model) {
auto host_index = std::static_pointer_cast<IVFIndexModel>(model);
if (auto gpures = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_)) {
ResScope rs(gpures, gpu_id_, false);
res_ = gpures;
auto device_index = faiss::gpu::index_cpu_to_gpu(res_->faiss_res.get(), gpu_id_, host_index->index_.get());
auto device_index = faiss::gpu::index_cpu_to_gpu(gpures->faiss_res.get(), gpu_id_, host_index->index_.get());
index_.reset(device_index);
res_ = gpures;
} else {
KNOWHERE_THROW_MSG("load index model error, can't get gpu_resource");
}
......@@ -114,9 +114,9 @@ void GPUIVF::LoadImpl(const BinarySet &index_binary) {
if (auto temp_res = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_)) {
ResScope rs(temp_res, gpu_id_, false);
res_ = temp_res;
auto device_index = faiss::gpu::index_cpu_to_gpu(res_->faiss_res.get(), gpu_id_, index);
auto device_index = faiss::gpu::index_cpu_to_gpu(temp_res->faiss_res.get(), gpu_id_, index);
index_.reset(device_index);
res_ = temp_res;
} else {
KNOWHERE_THROW_MSG("Load error, can't get gpu resource");
}
......@@ -176,12 +176,13 @@ VectorIndexPtr GPUIVF::CopyGpuToGpu(const int64_t &device_id, const Config &conf
auto host_index = CopyGpuToCpu(config);
return std::static_pointer_cast<IVF>(host_index)->CopyCpuToGpu(device_id, config);
}
void GPUIVF::Add(const DatasetPtr &dataset, const Config &config) {
auto temp_resource = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_);
if (temp_resource != nullptr) {
ResScope rs(temp_resource, gpu_id_, true);
if (auto spt = res_.lock()) {
ResScope rs(res_, gpu_id_);
IVF::Add(dataset, config);
} else {
}
else {
KNOWHERE_THROW_MSG("Add IVF can't get gpu resource");
}
}
......@@ -264,108 +265,6 @@ VectorIndexPtr GPUIVFSQ::CopyGpuToCpu(const Config &config) {
return std::make_shared<IVFSQ>(new_index);
}
/// Returns the single shared manager object.
/// The object is a function-local static, so it is constructed on the first call.
FaissGpuResourceMgr &FaissGpuResourceMgr::GetInstance() {
    static FaissGpuResourceMgr manager;
    return manager;
}
/// Sets the temporary-memory size on the faiss resource wrapped by `resource`.
///
/// @param resource  wrapper whose `faiss_res` is configured; must not be null
///                  (it is dereferenced without a check).
/// @param device_id device whose configured default is used when `size` is 0.
/// @param size      explicit size to apply; 0 means "use the device default".
void FaissGpuResourceMgr::AllocateTempMem(ResPtr &resource,
                                          const int64_t &device_id,
                                          const int64_t &size) {
    // An explicit, non-zero request always takes precedence.
    if (size != 0) {
        resource->faiss_res->setTempMemory(size);
        return;
    }

    // Otherwise fall back to the value registered through InitDevice().
    const auto configured = devices_params_.find(device_id);
    if (configured == devices_params_.end()) {
        // Unknown device: leave the resource untouched
        // (original author's note: "allocate when use").
        return;
    }
    resource->faiss_res->setTempMemory(configured->second.temp_mem_size);
}
void FaissGpuResourceMgr::InitDevice(int64_t device_id,
int64_t pin_mem_size,
int64_t temp_mem_size,
int64_t res_num) {
DeviceParams params;
params.pinned_mem_size = pin_mem_size;
params.temp_mem_size = temp_mem_size;
params.resource_num = res_num;
devices_params_.emplace(device_id, params);
}
void FaissGpuResourceMgr::InitResource() {
if(is_init) return ;
is_init = true;
//std::cout << "InitResource" << std::endl;
for(auto& device : devices_params_) {
auto& device_id = device.first;
mutex_cache_.emplace(device_id, std::make_unique<std::mutex>());
//std::cout << "Device Id: " << device_id << std::endl;
auto& device_param = device.second;
auto& bq = idle_map_[device_id];
for (int64_t i = 0; i < device_param.resource_num; ++i) {
//std::cout << "Resource Id: " << i << std::endl;
auto raw_resource = std::make_shared<faiss::gpu::StandardGpuResources>();
// TODO(linxj): enable set pinned memory
auto res_wrapper = std::make_shared<Resource>(raw_resource);
AllocateTempMem(res_wrapper, device_id, 0);
bq.Put(res_wrapper);
}
}
//std::cout << "End initResource" << std::endl;
}
/// Takes one resource out of the idle pool of `device_id`.
///
/// Triggers InitResource() first, then applies `alloc_size` (or the device
/// default when it is 0) through AllocateTempMem().
///
/// @return the resource taken from the pool, or nullptr when no pool exists
///         for `device_id`.
/// NOTE(review): `Take()` presumably blocks while the pool is empty —
/// BlockingQueue is not visible here, confirm.
ResPtr FaissGpuResourceMgr::GetRes(const int64_t &device_id,
                                   const int64_t &alloc_size) {
    InitResource();

    const auto pool_it = idle_map_.find(device_id);
    if (pool_it == idle_map_.end()) {
        return nullptr;
    }

    ResPtr taken = pool_it->second.Take();
    AllocateTempMem(taken, device_id, alloc_size);
    return taken;
}
/// Puts `res` back into the idle pool of `device_id`.
/// Silently does nothing when no pool exists for that device.
void FaissGpuResourceMgr::MoveToIdle(const int64_t &device_id, const ResPtr &res) {
    const auto pool_it = idle_map_.find(device_id);
    if (pool_it == idle_map_.end()) {
        return;
    }
    pool_it->second.Put(res);
}
/// Drains every idle pool and clears `is_init`, so the next InitResource()
/// call rebuilds the pools.
///
/// Only resources currently sitting in an idle pool are dropped by this call;
/// the pool entries themselves and `mutex_cache_` are left in place.
void FaissGpuResourceMgr::Free() {
    for (auto it = idle_map_.begin(); it != idle_map_.end(); ++it) {
        auto& pool = it->second;
        while (!pool.Empty()) {
            pool.Take();  // result discarded on purpose: this drops the pool's reference
        }
    }
    is_init = false;
}
/// Prints, for every device pool, the device id and the number of resources
/// currently idle in it to stdout, one device per line.
void
FaissGpuResourceMgr::Dump() {
    for (auto &item : idle_map_) {
        auto& bq = item.second;
        // Terminate each record; without it the entries of all devices run
        // together on a single unreadable line.
        std::cout << "device_id: " << item.first
                  << ", resource count:" << bq.Size() << std::endl;
    }
}
void GPUIndex::SetGpuDevice(const int &gpu_id) {
gpu_id_ = gpu_id;
}
......
......@@ -18,118 +18,18 @@
#pragma once
#include <faiss/gpu/StandardGpuResources.h>
#include "ivf.h"
#include "src/utils/BlockingQueue.h"
#include "FaissGpuResourceMgr.h"
namespace zilliz {
namespace knowhere {
struct Resource {
explicit Resource(std::shared_ptr<faiss::gpu::StandardGpuResources> &r): faiss_res(r) {
static int64_t global_id = 0;
id = global_id++;
}
std::shared_ptr<faiss::gpu::StandardGpuResources> faiss_res;
int64_t id;
std::mutex mutex;
};
using ResPtr = std::shared_ptr<Resource>;
using ResWPtr = std::weak_ptr<Resource>;
/// Manager holding, per device id, a configuration record, a mutex and a
/// queue of idle GPU resources. Normally used through GetInstance().
class FaissGpuResourceMgr {
    // ResScope locks the per-device mutexes in mutex_cache_ directly.
    friend class ResScope;

 public:
    using ResBQ = zilliz::milvus::server::BlockingQueue<ResPtr>;

    /// Configuration recorded for one device by InitDevice().
    struct DeviceParams {
        int64_t temp_mem_size = 0;
        int64_t pinned_mem_size = 0;
        int64_t resource_num = 2;
    };

    /// Access to the shared manager object.
    static FaissGpuResourceMgr &
    GetInstance();

    /// Releases the idle GPU resources. Per the original author's note this
    /// avoids a cudaGetDevice error on deallocation and should be invoked
    /// before main() returns.
    void
    Free();

    /// Applies `size` (or, when it is 0, the configured size of `device_id`)
    /// as the temporary memory of `resource`.
    void
    AllocateTempMem(ResPtr &resource, const int64_t& device_id, const int64_t& size);

    /// Records the parameters of a device for a later InitResource().
    void
    InitDevice(int64_t device_id,
               int64_t pin_mem_size = 0,
               int64_t temp_mem_size = 0,
               int64_t res_num = 2);

    /// Creates the resources of all recorded devices (no-op once initialised).
    void
    InitResource();

    /// Hands out an idle resource of `device_id`; used by build and
    /// copy-to-GPU paths according to the original comment.
    ResPtr
    GetRes(const int64_t &device_id, const int64_t& alloc_size = 0);

    // Disabled overload kept for reference (allocate GPU memory before search;
    // would return true if the device is idle and an idle resource exists):
    //bool
    //GetRes(const int64_t& device_id, ResPtr &res, const int64_t& alloc_size = 0);

    /// Returns `res` to the idle queue of `device_id`.
    void
    MoveToIdle(const int64_t &device_id, const ResPtr& res);

    /// Prints the idle-resource count of every device.
    void
    Dump();

 protected:
    bool is_init = false;                                          // set by InitResource(), cleared by Free()
    std::map<int64_t ,std::unique_ptr<std::mutex>> mutex_cache_;   // device id -> device mutex
    std::map<int64_t, DeviceParams> devices_params_;               // device id -> configuration
    std::map<int64_t, ResBQ> idle_map_;                            // device id -> idle resources
};
/// Scope guard for a GPU resource.
///
/// On construction it locks the resource's mutex and, when it "owns" the
/// device, the per-device mutex kept by FaissGpuResourceMgr. The destructor
/// releases exactly what was locked and, for the three-argument constructor,
/// hands the resource back to the manager's idle pool.
class ResScope {
 public:
    /// Guard over a resource obtained from FaissGpuResourceMgr::GetRes().
    /// The resource is returned to the idle pool when the scope ends.
    ///
    /// @param res       resource to lock (a null pointer is tolerated and ignored).
    /// @param device_id device the resource belongs to.
    /// @param isown     when true the per-device mutex is locked as well.
    ResScope(ResPtr &res, const int64_t& device_id, const bool& isown)
        : resource(res), device_id(device_id), move(true), own(isown) {
        Acquire();
    }

    /// Per the original note: used for search, taking ownership of both the
    /// resource and the device. The resource is NOT moved to the idle pool on
    /// destruction.
    ResScope(ResPtr &res, const int64_t &device_id)
        : resource(res), device_id(device_id), move(false), own(true) {
        Acquire();
    }

    // A copied guard would unlock the same mutexes twice.
    ResScope(const ResScope &) = delete;
    ResScope &operator=(const ResScope &) = delete;

    ~ResScope() {
        if (device_mutex != nullptr) device_mutex->unlock();
        // Unlock before publishing the resource so the idle pool never holds
        // a resource whose mutex is still locked by this scope.
        if (resource_locked) resource->mutex.unlock();
        if (move && resource) FaissGpuResourceMgr::GetInstance().MoveToIdle(device_id, resource);
    }

 private:
    // Locks the device mutex (if owning) and then the resource mutex,
    // remembering what was actually locked for the destructor.
    void Acquire() {
        if (own) {
            // Use find() rather than operator[]: the latter would insert a null
            // mutex for an unknown device and then dereference it.
            auto &cache = FaissGpuResourceMgr::GetInstance().mutex_cache_;
            auto found = cache.find(device_id);
            if (found != cache.end() && found->second) {
                device_mutex = found->second.get();
                device_mutex->lock();
            }
        }
        if (resource) {
            resource->mutex.lock();
            resource_locked = true;
        }
    }

    ResPtr resource;  // keeps the resource alive for the whole scope
    int64_t device_id;
    bool move = true;
    bool own = false;
    std::mutex *device_mutex = nullptr;  // device mutex actually locked, if any
    bool resource_locked = false;        // true once resource->mutex is held
};
class GPUIndex {
public:
explicit GPUIndex(const int &device_id) : gpu_id_(device_id) {}
GPUIndex(const int& device_id, ResPtr resource): gpu_id_(device_id), res_(std::move(resource)){}
GPUIndex(const int& device_id, const ResPtr& resource): gpu_id_(device_id), res_(resource){}
virtual VectorIndexPtr CopyGpuToCpu(const Config &config) = 0;
virtual VectorIndexPtr CopyGpuToGpu(const int64_t &device_id, const Config &config) = 0;
......@@ -139,7 +39,7 @@ class GPUIndex {
protected:
int64_t gpu_id_;
ResPtr res_ = nullptr;
ResWPtr res_;
};
class GPUIVF : public IVF, public GPUIndex {
......
......@@ -224,9 +224,9 @@ void GPUIDMAP::LoadImpl(const BinarySet &index_binary) {
if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_) ){
ResScope rs(res, gpu_id_, false);
res_ = res;
auto device_index = faiss::gpu::index_cpu_to_gpu(res->faiss_res.get(), gpu_id_, index);
index_.reset(device_index);
res_ = res;
} else {
KNOWHERE_THROW_MSG("Load error, can't get gpu resource");
}
......
......@@ -32,6 +32,7 @@ set(ivf_srcs
${CORE_SOURCE_DIR}/knowhere/knowhere/adapter/structure.cpp
${CORE_SOURCE_DIR}/knowhere/knowhere/common/exception.cpp
${CORE_SOURCE_DIR}/knowhere/knowhere/common/timer.cpp
${CORE_SOURCE_DIR}/knowhere/knowhere/index/vector_index/FaissGpuResourceMgr.cpp
utils.cpp
)
if(NOT TARGET test_ivf)
......@@ -48,6 +49,7 @@ set(idmap_srcs
${CORE_SOURCE_DIR}/knowhere/knowhere/adapter/structure.cpp
${CORE_SOURCE_DIR}/knowhere/knowhere/common/exception.cpp
${CORE_SOURCE_DIR}/knowhere/knowhere/common/timer.cpp
${CORE_SOURCE_DIR}/knowhere/knowhere/index/vector_index/FaissGpuResourceMgr.cpp
utils.cpp
)
if(NOT TARGET test_idmap)
......
......@@ -17,6 +17,7 @@
#include "KnowhereResource.h"
#include "knowhere/index/vector_index/FaissGpuResourceMgr.h"
#include "server/ServerConfig.h"
#include <map>
......
......@@ -19,7 +19,6 @@
#pragma once
#include "utils/Error.h"
#include "knowhere/index/vector_index/gpu_ivf.h"
namespace zilliz {
namespace milvus {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册