// Vectors.cpp
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "segment/Vectors.h"

#include <algorithm>
#include <chrono>
#include <iostream>
#include <utility>
#include <vector>

#include "utils/Log.h"

namespace milvus {
namespace segment {

// Appends a copy of the bytes in `data` to the end of the stored raw vector
// buffer.
//
// @param data  Raw bytes to append; left unmodified.
//
// The range overload of insert() sizes the destination in a single step and
// keeps the container's geometric growth policy, so no explicit reserve() is
// issued here: reserving the exact target size on every call would force a
// reallocation per call when data is appended repeatedly. Move iterators are
// not used either, because elements of a const vector cannot be moved from.
void
Vectors::AddData(const std::vector<uint8_t>& data) {
    data_.insert(data_.end(), data.begin(), data.end());
}

// Appends a copy of the ids in `uids` to the end of the stored uid list.
//
// @param uids  Ids to append; left unmodified.
//
// The range overload of insert() sizes the destination in a single step and
// keeps the container's geometric growth policy, so no explicit reserve() is
// issued here: reserving the exact target size on every call would force a
// reallocation per call when ids are appended repeatedly. Move iterators are
// not used either, because elements of a const vector cannot be moved from.
void
Vectors::AddUids(const std::vector<doc_id_t>& uids) {
    uids_.insert(uids_.end(), uids.begin(), uids.end());
}

// Removes the single vector stored at position `offset`, together with its uid.
//
// @param offset  Zero-based index of the vector to remove.
//
// Offsets that are negative or not smaller than the number of stored uids are
// ignored, since using them would build iterators outside the containers.
// Nothing is removed when the per-vector byte length reported by
// GetCodeLength() is zero.
void
Vectors::Erase(int32_t offset) {
    if (offset < 0 || static_cast<size_t>(offset) >= uids_.size()) {
        return;
    }

    const size_t code_length = GetCodeLength();
    if (code_length != 0) {
        // Byte position of the first byte belonging to the vector being removed.
        const size_t step = static_cast<size_t>(offset) * code_length;
        data_.erase(data_.begin() + step, data_.begin() + step + code_length);
        uids_.erase(uids_.begin() + offset);
    }
}

void
Vectors::Erase(std::vector<int32_t>& offsets) {
    if (offsets.empty()) {
        return;
    }

    // Sort and remove duplicates
    auto start = std::chrono::high_resolution_clock::now();

    std::sort(offsets.begin(), offsets.end());

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    ENGINE_LOG_DEBUG << "Sorting " << offsets.size() << " offsets to delete took " << diff.count() << " s";

    start = std::chrono::high_resolution_clock::now();

    offsets.erase(std::unique(offsets.begin(), offsets.end()), offsets.end());

    end = std::chrono::high_resolution_clock::now();
    diff = end - start;
    ENGINE_LOG_DEBUG << "Deduplicating " << offsets.size() << " offsets to delete took " << diff.count() << " s";

    // Reconstruct raw vectors and uids
    ENGINE_LOG_DEBUG << "Begin erasing...";

    size_t new_size = uids_.size() - offsets.size();
    std::vector<doc_id_t> new_uids(new_size);
    auto code_length = GetCodeLength();
    std::vector<uint8_t> new_data(new_size * code_length);

    auto count = 0;
    auto skip = offsets.cbegin();
    auto loop_size = uids_.size();

    for (size_t i = 0; i < loop_size;) {
        while (i == *skip && skip != offsets.cend()) {
            ++i;
            ++skip;
        }

        if (i == loop_size) {
            break;
        }

        new_uids[count] = uids_[i];

        for (size_t j = 0; j < code_length; ++j) {
            new_data[count * code_length + j] = data_[i * code_length + j];
        }

        ++count;
        ++i;
    }

    data_.clear();
    uids_.clear();
    data_.swap(new_data);
    uids_.swap(new_uids);

    end = std::chrono::high_resolution_clock::now();
    diff = end - start;
    ENGINE_LOG_DEBUG << "Erasing " << offsets.size() << " vectors out of " << loop_size << " vectors took "
                     << diff.count() << " s";
}

// Accessors returning the underlying buffers by mutable reference.
std::vector<uint8_t>&
Vectors::GetMutableData() {
    return data_;
}

std::vector<doc_id_t>&
Vectors::GetMutableUids() {
    return uids_;
}

// Read-only accessors and size queries.
const std::vector<uint8_t>&
Vectors::GetData() const {
    return data_;
}

const std::vector<doc_id_t>&
Vectors::GetUids() const {
    return uids_;
}

size_t
Vectors::GetCount() const {
    return uids_.size();
}

// Returns the number of raw bytes per stored uid (total byte count divided by
// the uid count, rounded down), or 0 when no uids are stored.
size_t
Vectors::GetCodeLength() const {
    // Guard against dividing by zero when nothing has been added yet.
    if (uids_.empty()) {
        return 0;
    }

    const size_t total_bytes = data_.size();
    const size_t uid_count = uids_.size();
    return total_bytes / uid_count;
}

size_t
150 151 152 153 154 155 156
Vectors::VectorsSize() {
    return data_.size();
}

// Returns the number of entries in the uid list.
size_t
Vectors::UidsSize() {
    return uids_.size();
}

void
Vectors::SetName(const std::string& name) {
    name_ = name;
}

const std::string&
Vectors::GetName() const {
    return name_;
}

void
Vectors::Clear() {
    data_.clear();
    data_.shrink_to_fit();
    uids_.clear();
    uids_.shrink_to_fit();
}

}  // namespace segment
}  // namespace milvus