diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d545123602cbf4c0f2a5b11edae5863ff1b6474..e9e09af20bbc04e85df83f6a5da6e1659fd65b38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ Please mark all change in change log and use the issue from GitHub - \#1885 Optimize knowhere unittest - \#1886 Refactor log on search and insert request - \#1897 Heap pop and push can be realized by heap_swap_top +- \#1928 Fix too many data and uid copies when loading files - \#1930 Upgrade mishards to 0.8.0 ## Task diff --git a/core/src/codecs/default/DefaultVectorsFormat.cpp b/core/src/codecs/default/DefaultVectorsFormat.cpp index 93d1e6059e49c880423104f8d508f53b99735838..9151f9a9eaa97f1fca6dfc6f5d8c9c6682c898a6 100644 --- a/core/src/codecs/default/DefaultVectorsFormat.cpp +++ b/core/src/codecs/default/DefaultVectorsFormat.cpp @@ -90,15 +90,12 @@ DefaultVectorsFormat::read(const storage::FSHandlerPtr& fs_ptr, segment::Vectors for (; it != it_end; ++it) { const auto& path = it->path(); if (path.extension().string() == raw_vector_extension_) { - std::vector vector_list; + auto& vector_list = vectors_read->GetMutableData(); read_vectors_internal(fs_ptr, path.string(), 0, INT64_MAX, vector_list); - vectors_read->AddData(vector_list); vectors_read->SetName(path.stem().string()); - } - if (path.extension().string() == user_id_extension_) { - std::vector uids; + } else if (path.extension().string() == user_id_extension_) { + auto& uids = vectors_read->GetMutableUids(); read_uids_internal(fs_ptr, path.string(), uids); - vectors_read->AddUids(uids); } } } diff --git a/core/src/db/engine/ExecutionEngineImpl.cpp b/core/src/db/engine/ExecutionEngineImpl.cpp index efdedae6f300801d88b43ed15acbbad9c1496a68..d2a8dc9687ae1f65a2504fd1ff4591a9a5f7d46b 100644 --- a/core/src/db/engine/ExecutionEngineImpl.cpp +++ b/core/src/db/engine/ExecutionEngineImpl.cpp @@ -375,8 +375,6 @@ ExecutionEngineImpl::Serialize() { Status ExecutionEngineImpl::Load(bool to_cache) { - // TODO(zhiru): refactor - index_ = std::static_pointer_cast(cache::CpuCacheMgr::GetInstance()->GetIndex(location_)); bool already_in_cache = (index_ != nullptr); if (!already_in_cache) { @@ -411,21 +409,19 @@ ExecutionEngineImpl::Load(bool to_cache) { auto& vectors = segment_ptr->vectors_ptr_; auto& deleted_docs = segment_ptr->deleted_docs_ptr_->GetDeletedDocs(); - auto vectors_uids = vectors->GetUids(); + auto& vectors_uids = vectors->GetMutableUids(); + auto count = vectors_uids.size(); index_->SetUids(vectors_uids); ENGINE_LOG_DEBUG << "set uids " << index_->GetUids().size() << " for index " << location_; - auto vectors_data = vectors->GetData(); + auto& vectors_data = vectors->GetData(); - faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = - std::make_shared(vectors->GetCount()); + faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared(count); for (auto& offset : deleted_docs) { - if (!concurrent_bitset_ptr->test(offset)) { - concurrent_bitset_ptr->set(offset); - } + concurrent_bitset_ptr->set(offset); } - auto dataset = knowhere::GenDataset(vectors->GetCount(), this->dim_, vectors_data.data()); + auto dataset = knowhere::GenDataset(count, this->dim_, vectors_data.data()); if (index_type_ == EngineType::FAISS_IDMAP) { auto bf_index = std::static_pointer_cast(index_); bf_index->Train(knowhere::DatasetPtr(), conf); diff --git a/core/src/segment/Vectors.cpp b/core/src/segment/Vectors.cpp index 04c68d51866006b2cc662f2ecf695de230371110..0c8e10738fca10ae729f3b75d79abc10a239b2b1 100644 --- a/core/src/segment/Vectors.cpp +++ b/core/src/segment/Vectors.cpp @@ -28,10 +28,6 @@ namespace milvus { namespace segment { -Vectors::Vectors(std::vector data, std::vector uids, const std::string& name) - : data_(std::move(data)), uids_(std::move(uids)), name_(name) { -} - void Vectors::AddData(const std::vector& data) { data_.reserve(data_.size() + data.size()); @@ -120,6 +116,16 @@ Vectors::Erase(std::vector& offsets) { << diff.count() << " s"; } +std::vector& +Vectors::GetMutableData() { + return data_; +} + +std::vector& +Vectors::GetMutableUids() { + return uids_; +} + const std::vector& Vectors::GetData() const { return data_; diff --git a/core/src/segment/Vectors.h b/core/src/segment/Vectors.h index 2be6e62646c30472cb61671626729b5b9bed2f29..b5594cc9570abf6ec6ddf5b9ae9c10c188b47a07 100644 --- a/core/src/segment/Vectors.h +++ b/core/src/segment/Vectors.h @@ -28,8 +28,6 @@ using doc_id_t = int64_t; class Vectors { public: - Vectors(std::vector data, std::vector uids, const std::string& name); - Vectors() = default; void @@ -41,6 +39,12 @@ class Vectors { void SetName(const std::string& name); + std::vector& + GetMutableData(); + + std::vector& + GetMutableUids(); + const std::vector& GetData() const;