From 464400767f8723b9867755c63ab07f7cfe2ecb29 Mon Sep 17 00:00:00 2001 From: "shengjun.li" <49774184+shengjun1985@users.noreply.github.com> Date: Tue, 14 Apr 2020 23:30:37 +0800 Subject: [PATCH] #1928 Too many data and uid copies when loading files (#1931) Signed-off-by: shengjun.li Co-authored-by: Jin Hai --- CHANGELOG.md | 1 + core/src/codecs/default/DefaultVectorsFormat.cpp | 9 +++------ core/src/db/engine/ExecutionEngineImpl.cpp | 16 ++++++---------- core/src/segment/Vectors.cpp | 14 ++++++++++---- core/src/segment/Vectors.h | 8 ++++++-- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d545123..e9e09af2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ Please mark all change in change log and use the issue from GitHub - \#1885 Optimize knowhere unittest - \#1886 Refactor log on search and insert request - \#1897 Heap pop and push can be realized by heap_swap_top +- \#1928 Fix too many data and uid copies when loading files - \#1930 Upgrade mishards to 0.8.0 ## Task diff --git a/core/src/codecs/default/DefaultVectorsFormat.cpp b/core/src/codecs/default/DefaultVectorsFormat.cpp index 93d1e605..9151f9a9 100644 --- a/core/src/codecs/default/DefaultVectorsFormat.cpp +++ b/core/src/codecs/default/DefaultVectorsFormat.cpp @@ -90,15 +90,12 @@ DefaultVectorsFormat::read(const storage::FSHandlerPtr& fs_ptr, segment::Vectors for (; it != it_end; ++it) { const auto& path = it->path(); if (path.extension().string() == raw_vector_extension_) { - std::vector vector_list; + auto& vector_list = vectors_read->GetMutableData(); read_vectors_internal(fs_ptr, path.string(), 0, INT64_MAX, vector_list); - vectors_read->AddData(vector_list); vectors_read->SetName(path.stem().string()); - } - if (path.extension().string() == user_id_extension_) { - std::vector uids; + } else if (path.extension().string() == user_id_extension_) { + auto& uids = vectors_read->GetMutableUids(); read_uids_internal(fs_ptr, path.string(), uids); - vectors_read->AddUids(uids); } } } diff --git a/core/src/db/engine/ExecutionEngineImpl.cpp b/core/src/db/engine/ExecutionEngineImpl.cpp index efdedae6..d2a8dc96 100644 --- a/core/src/db/engine/ExecutionEngineImpl.cpp +++ b/core/src/db/engine/ExecutionEngineImpl.cpp @@ -375,8 +375,6 @@ ExecutionEngineImpl::Serialize() { Status ExecutionEngineImpl::Load(bool to_cache) { - // TODO(zhiru): refactor - index_ = std::static_pointer_cast(cache::CpuCacheMgr::GetInstance()->GetIndex(location_)); bool already_in_cache = (index_ != nullptr); if (!already_in_cache) { @@ -411,21 +409,19 @@ ExecutionEngineImpl::Load(bool to_cache) { auto& vectors = segment_ptr->vectors_ptr_; auto& deleted_docs = segment_ptr->deleted_docs_ptr_->GetDeletedDocs(); - auto vectors_uids = vectors->GetUids(); + auto& vectors_uids = vectors->GetMutableUids(); + auto count = vectors_uids.size(); index_->SetUids(vectors_uids); ENGINE_LOG_DEBUG << "set uids " << index_->GetUids().size() << " for index " << location_; - auto vectors_data = vectors->GetData(); + auto& vectors_data = vectors->GetData(); - faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = - std::make_shared(vectors->GetCount()); + faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared(count); for (auto& offset : deleted_docs) { - if (!concurrent_bitset_ptr->test(offset)) { - concurrent_bitset_ptr->set(offset); - } + concurrent_bitset_ptr->set(offset); } - auto dataset = knowhere::GenDataset(vectors->GetCount(), this->dim_, vectors_data.data()); + auto dataset = knowhere::GenDataset(count, this->dim_, vectors_data.data()); if (index_type_ == EngineType::FAISS_IDMAP) { auto bf_index = std::static_pointer_cast(index_); bf_index->Train(knowhere::DatasetPtr(), conf); diff --git a/core/src/segment/Vectors.cpp b/core/src/segment/Vectors.cpp index 04c68d51..0c8e1073 100644 --- a/core/src/segment/Vectors.cpp +++ b/core/src/segment/Vectors.cpp @@ -28,10 +28,6 @@ namespace milvus { namespace segment { -Vectors::Vectors(std::vector data, std::vector uids, const std::string& name) - : data_(std::move(data)), uids_(std::move(uids)), name_(name) { -} - void Vectors::AddData(const std::vector& data) { data_.reserve(data_.size() + data.size()); @@ -120,6 +116,16 @@ Vectors::Erase(std::vector& offsets) { << diff.count() << " s"; } +std::vector& +Vectors::GetMutableData() { + return data_; +} + +std::vector& +Vectors::GetMutableUids() { + return uids_; +} + const std::vector& Vectors::GetData() const { return data_; diff --git a/core/src/segment/Vectors.h b/core/src/segment/Vectors.h index 2be6e626..b5594cc9 100644 --- a/core/src/segment/Vectors.h +++ b/core/src/segment/Vectors.h @@ -28,8 +28,6 @@ using doc_id_t = int64_t; class Vectors { public: - Vectors(std::vector data, std::vector uids, const std::string& name); - Vectors() = default; void @@ -41,6 +39,12 @@ class Vectors { void SetName(const std::string& name); + std::vector& + GetMutableData(); + + std::vector& + GetMutableUids(); + const std::vector& GetData() const; -- GitLab