From 8f26e69f4628d1ca0bed4826a7829b9be95f1013 Mon Sep 17 00:00:00 2001 From: TheBloodthirster <13606074505@163.com> Date: Fri, 28 Aug 2020 18:02:58 +0800 Subject: [PATCH] update kmeans (#3489) Signed-off-by: hjp <13606074505@163.com> --- .../src/index/thirdparty/faiss/Clustering.cpp | 139 ++++++++++++++++-- core/src/index/thirdparty/faiss/Clustering.h | 31 ++++ 2 files changed, 157 insertions(+), 13 deletions(-) mode change 100644 => 100755 core/src/index/thirdparty/faiss/Clustering.cpp mode change 100644 => 100755 core/src/index/thirdparty/faiss/Clustering.h diff --git a/core/src/index/thirdparty/faiss/Clustering.cpp b/core/src/index/thirdparty/faiss/Clustering.cpp old mode 100644 new mode 100755 index eba243d1..017fb199 --- a/core/src/index/thirdparty/faiss/Clustering.cpp +++ b/core/src/index/thirdparty/faiss/Clustering.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include namespace faiss { @@ -258,11 +259,115 @@ int split_clusters (size_t d, size_t k, size_t n, return nsplit; } +}; +KmeansType kmeans_type = KmeansType::KMEANS; +void Clustering::kmeans_algorithm(std::vector& centroids_index, int64_t random_seed, + size_t n_input_centroids, size_t d, size_t k, + idx_t nx, const uint8_t *x_in) +{ + // centroids with random points from the dataset + rand_perm (centroids_index.data(), nx, random_seed); +} +void Clustering::kmeans_plus_plus_algorithm(std::vector& centroids_index, int64_t random_seed, + size_t n_input_centroids, size_t d, + size_t k, idx_t nx, const uint8_t *x_in) +{ + FAISS_THROW_IF_NOT_MSG ( + n_input_centroids == 0, + "Kmeans plus plus only support the provided input centroids number of zero" + ); -}; + size_t thread_max_num = omp_get_max_threads(); + auto x = reinterpret_cast(x_in); + + // The square of distance to current centroid + std::vector dx_distance(nx, 1.0 / 0.0); + std::vector pre_sum(nx); + + // task of each thread when calculate P(x) + std::vector task(thread_max_num, nx); + size_t step = (nx + thread_max_num - 1) / 
thread_max_num; + for (size_t i = 0; i + 1 < thread_max_num; i++) { + task[i] = (i + 1) * step; + } + + // Record the centroids that has been calculated + // Input : + // nx : int -> nb of points + // d : size_t -> nb of dimensions + // k : size_t -> nb of centroids + // x : unsigned char -> data : the x[i*d] means the i-th point's d-th value + // Output: + // centroids : array -> the cluster centers + + // 1. get the pre-n-input-centroids: if equal to 0, + // then should get the first random start point + RandomGenerator rng (random_seed); + //if (n_input_centroids == 0) {} + size_t first_center; + first_center = static_cast<size_t>(rng.rand_int64() % nx); + centroids_index[0] = first_center; + + // 2. use the first few centroids to calculate the next centroid,and already has first random start point + //size_t current_centroids = n_input_centroids == 0 ? 1 : n_input_centroids; + size_t current_centroids = 1; + // For every epoch there is i-th centroids,and we want to calculate the i+1 centroid + for (size_t i = current_centroids; i < k; i++) { + auto last_centroids_data = x + centroids_index[i - 1] * d; + // for every point + #pragma omp parallel for + for (size_t point_it = 0; point_it < nx; point_it++) { + float distance_of_point_and_centroid = 0; + distance_of_point_and_centroid = fvec_L2sqr((x + point_it * d), last_centroids_data, d); + if (distance_of_point_and_centroid < dx_distance[point_it]) { + dx_distance[point_it] = distance_of_point_and_centroid; + } + } + + //calculate P(x) + #pragma omp parallel for + for (size_t point_it = 0; point_it < thread_max_num; point_it++) { + size_t left = point_it == 0 ? 
0 : task[point_it - 1]; + size_t right = task[point_it]; + // cout <<"Thread = "<< omp_get_thread_num() <<" left = "< perm (nx); - - rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L); + { + int64_t random_seed = seed + 1 + redo * 15486557L; + std::vector<int> centroids_index(nx); - if (!codec) { - for (int i = n_input_centroids; i < k ; i++) { - memcpy (&centroids[i * d], x + perm[i] * line_size, line_size); + if (KmeansType::KMEANS == kmeans_type) { + //Use classic kmeans algorithm + kmeans_algorithm(centroids_index, random_seed, n_input_centroids, d, k, nx, x_in); + } else if (KmeansType::KMEANS_PLUSPLUS == kmeans_type) { + //Use kmeans++ algorithm + kmeans_plus_plus_algorithm(centroids_index, random_seed, n_input_centroids, d, k, nx, x_in); } - } else { - for (int i = n_input_centroids; i < k ; i++) { - codec->sa_decode (1, x + perm[i] * line_size, &centroids[i * d]); + + centroids.resize(d * k); + if (!codec) { + for (int i = n_input_centroids; i < k; i++) { + memcpy(&centroids[i * d], x + centroids_index[i] * line_size, line_size); + } + } else { + for (int i = n_input_centroids; i < k; i++) { + codec->sa_decode(1, x + centroids_index[i] * line_size, &centroids[i * d]); + } } } - post_process_centroids (); + post_process_centroids(); // prepare the index diff --git a/core/src/index/thirdparty/faiss/Clustering.h b/core/src/index/thirdparty/faiss/Clustering.h old mode 100644 new mode 100755 index 46410af7..e4818860 --- a/core/src/index/thirdparty/faiss/Clustering.h +++ b/core/src/index/thirdparty/faiss/Clustering.h @@ -15,6 +15,19 @@ namespace faiss { +/** + * The algorithm of Kmeans Type + */ +enum KmeansType +{ + KMEANS, + KMEANS_PLUSPLUS, + KMEANS_TWO, +}; + +//The default algorithm use the KMEANS_PLUSPLUS +extern KmeansType kmeans_type; + /** Class for the clustering parameters. Can be passed to the + * constructor of the Clustering object. 
@@ -87,6 +100,24 @@ struct Clustering: ClusteringParameters { virtual void train (idx_t n, const float * x, faiss::Index & index, const float *x_weights = nullptr); + /** + * @brief Kmeans algorithm + * + * @param centroids_index [out] centroids index + * @param random_seed seed for the random number generator + * @param n_input_centroids the number of centroids that user input + * @param d dimension + * @param k number of centroids + * @param nx size of data + * @param x_in data of point + */ + void kmeans_algorithm(std::vector<int>& centroids_index, int64_t random_seed, + size_t n_input_centroids, size_t d, size_t k, + idx_t nx, const uint8_t *x_in); + + void kmeans_plus_plus_algorithm(std::vector<int>& centroids_index, int64_t random_seed, + size_t n_input_centroids, size_t d, size_t k, + idx_t nx, const uint8_t *x_in); /** run with encoded vectors * -- GitLab