未验证 提交 46440076 编写于 作者: S shengjun.li 提交者: GitHub

#1928 Too many data and uid copies when loading files (#1931)

Signed-off-by: shengjun.li <shengjun.li@zilliz.com>
Co-authored-by: Jin Hai <hai.jin@zilliz.com>
上级 7ed6edc5
......@@ -33,6 +33,7 @@ Please mark all change in change log and use the issue from GitHub
- \#1885 Optimize knowhere unittest
- \#1886 Refactor log on search and insert request
- \#1897 Heap pop and push can be realized by heap_swap_top
- \#1928 Fix too many data and uid copies when loading files
- \#1930 Upgrade mishards to 0.8.0
## Task
......
......@@ -90,15 +90,12 @@ DefaultVectorsFormat::read(const storage::FSHandlerPtr& fs_ptr, segment::Vectors
for (; it != it_end; ++it) {
const auto& path = it->path();
if (path.extension().string() == raw_vector_extension_) {
std::vector<uint8_t> vector_list;
auto& vector_list = vectors_read->GetMutableData();
read_vectors_internal(fs_ptr, path.string(), 0, INT64_MAX, vector_list);
vectors_read->AddData(vector_list);
vectors_read->SetName(path.stem().string());
}
if (path.extension().string() == user_id_extension_) {
std::vector<segment::doc_id_t> uids;
} else if (path.extension().string() == user_id_extension_) {
auto& uids = vectors_read->GetMutableUids();
read_uids_internal(fs_ptr, path.string(), uids);
vectors_read->AddUids(uids);
}
}
}
......
......@@ -375,8 +375,6 @@ ExecutionEngineImpl::Serialize() {
Status
ExecutionEngineImpl::Load(bool to_cache) {
// TODO(zhiru): refactor
index_ = std::static_pointer_cast<knowhere::VecIndex>(cache::CpuCacheMgr::GetInstance()->GetIndex(location_));
bool already_in_cache = (index_ != nullptr);
if (!already_in_cache) {
......@@ -411,21 +409,19 @@ ExecutionEngineImpl::Load(bool to_cache) {
auto& vectors = segment_ptr->vectors_ptr_;
auto& deleted_docs = segment_ptr->deleted_docs_ptr_->GetDeletedDocs();
auto vectors_uids = vectors->GetUids();
auto& vectors_uids = vectors->GetMutableUids();
auto count = vectors_uids.size();
index_->SetUids(vectors_uids);
ENGINE_LOG_DEBUG << "set uids " << index_->GetUids().size() << " for index " << location_;
auto vectors_data = vectors->GetData();
auto& vectors_data = vectors->GetData();
faiss::ConcurrentBitsetPtr concurrent_bitset_ptr =
std::make_shared<faiss::ConcurrentBitset>(vectors->GetCount());
faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared<faiss::ConcurrentBitset>(count);
for (auto& offset : deleted_docs) {
if (!concurrent_bitset_ptr->test(offset)) {
concurrent_bitset_ptr->set(offset);
}
concurrent_bitset_ptr->set(offset);
}
auto dataset = knowhere::GenDataset(vectors->GetCount(), this->dim_, vectors_data.data());
auto dataset = knowhere::GenDataset(count, this->dim_, vectors_data.data());
if (index_type_ == EngineType::FAISS_IDMAP) {
auto bf_index = std::static_pointer_cast<knowhere::IDMAP>(index_);
bf_index->Train(knowhere::DatasetPtr(), conf);
......
......@@ -28,10 +28,6 @@
namespace milvus {
namespace segment {
// Constructs a Vectors object from a raw byte buffer, a list of document ids
// and a name. `data` and `uids` are taken by value and moved into the
// members data_ and uids_; `name` is copied into name_.
Vectors::Vectors(std::vector<uint8_t> data, std::vector<doc_id_t> uids, const std::string& name)
: data_(std::move(data)), uids_(std::move(uids)), name_(name) {
}
void
Vectors::AddData(const std::vector<uint8_t>& data) {
data_.reserve(data_.size() + data.size());
......@@ -120,6 +116,16 @@ Vectors::Erase(std::vector<int32_t>& offsets) {
<< diff.count() << " s";
}
// Returns a non-const reference to the internal byte buffer data_, so a
// caller can fill or modify the stored data in place instead of building a
// separate vector and copying it in.
std::vector<uint8_t>&
Vectors::GetMutableData() {
return data_;
}
// Returns a non-const reference to the internal id list uids_, so a caller
// can fill or modify the stored ids in place instead of building a separate
// vector and copying it in.
std::vector<doc_id_t>&
Vectors::GetMutableUids() {
return uids_;
}
const std::vector<uint8_t>&
Vectors::GetData() const {
return data_;
......
......@@ -28,8 +28,6 @@ using doc_id_t = int64_t;
class Vectors {
public:
Vectors(std::vector<uint8_t> data, std::vector<doc_id_t> uids, const std::string& name);
Vectors() = default;
void
......@@ -41,6 +39,12 @@ class Vectors {
void
SetName(const std::string& name);
std::vector<uint8_t>&
GetMutableData();
std::vector<doc_id_t>&
GetMutableUids();
const std::vector<uint8_t>&
GetData() const;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册