Adaptation for cuda11 and legacy gpu/cpu (#17143)

- compile gpu with cuda11 - add support for sm35, sm50 for legacy GPUs - add sm86 for ampere GPU, like RTX 30xx, Tesla Ax - support cpu only support SSE42 Signed-off-by: N Ji Bin <matrixji@live.com>

Adaptation for cuda11 and legacy gpu/cpu (#17143)
- compile gpu with cuda11 - add support for sm35, sm50 for legacy GPUs - add sm86 for ampere GPU, like RTX 30xx, Tesla Ax - support cpu only support SSE42 Signed-off-by: N Ji Bin <matrixji@live.com>
b10cc1c4 · Ji Bin · GitHub · de8050d5 · b10cc1c4 · b10cc1c4
9 changed file
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -168,7 +168,7 @@ if (MILVUS_GPU_VERSION)
    message(STATUS "Building Milvus GPU version")
    add_compile_definitions("MILVUS_GPU_VERSION")
    enable_language(CUDA)
-    find_package(CUDA 10 REQUIRED)
+    find_package(CUDA 11 REQUIRED)
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fPIC -std=c++11 -D_FORCE_INLINES --expt-extended-lambda")
 else ()
    message(STATUS "Building Milvus CPU version")

--- a/core/src/index/CMakeLists.txt
+++ b/core/src/index/CMakeLists.txt
@@ -79,7 +79,7 @@ if (MILVUS_GPU_VERSION OR KNOWHERE_GPU_VERSION)
    message(STATUS "Building Knowhere GPU version")
    add_compile_definitions("MILVUS_GPU_VERSION")
    enable_language(CUDA)
-    find_package(CUDA 10 REQUIRED)
+    find_package(CUDA 11 REQUIRED)
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fPIC -std=c++11 -D_FORCE_INLINES --expt-extended-lambda")
 else ()
    message(STATUS "Building Knowhere CPU version")

--- a/core/src/index/cmake/ThirdPartyPackagesCore.cmake
+++ b/core/src/index/cmake/ThirdPartyPackagesCore.cmake
@@ -513,7 +513,7 @@ macro(build_faiss)
    set(FAISS_CONFIGURE_ARGS
            "--prefix=${FAISS_PREFIX}"
            "CFLAGS=${EP_C_FLAGS}"
-            "CXXFLAGS=${EP_CXX_FLAGS} -mf16c -O3"
+            "CXXFLAGS=${EP_CXX_FLAGS} -msse4.2 -O3"
            --without-python)

    if (FAISS_WITH_MKL)
@@ -535,7 +535,7 @@ macro(build_faiss)
    if (KNOWHERE_GPU_VERSION)
        set(FAISS_CONFIGURE_ARGS ${FAISS_CONFIGURE_ARGS}
                "--with-cuda=${CUDA_TOOLKIT_ROOT_DIR}"
-                "--with-cuda-arch=-gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75"
+                "--with-cuda-arch=-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_86,code=sm_86"
                )
    else ()
        set(FAISS_CONFIGURE_ARGS ${FAISS_CONFIGURE_ARGS}

--- a/core/src/index/thirdparty/faiss/Makefile
+++ b/core/src/index/thirdparty/faiss/Makefile
@@ -45,11 +45,11 @@ libfaiss.$(SHAREDEXT): $(OBJ)

 # support avx
 %avx.o: %avx.cpp
-	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx2 -c $< -o $@
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mf16c -mavx2 -c $< -o $@

 # support avx512
 %avx512.o: %avx512.cpp
-	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx512f -mavx512dq -mavx512bw -c $< -o $@
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mf16c -mavx512f -mavx512dq -mavx512bw -c $< -o $@

 %.o: %.cu
 	$(NVCC) $(NVCCFLAGS) -c $< -o $@

--- a/core/src/index/thirdparty/faiss/gpu/utils/DeviceDefs.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceDefs.cuh
@@ -13,7 +13,7 @@
 namespace faiss { namespace gpu {

 #ifdef __CUDA_ARCH__
-#if __CUDA_ARCH__ <= 750
+#if __CUDA_ARCH__ <= 860
 constexpr int kWarpSize = 32;
 #else
 #error Unknown __CUDA_ARCH__; please define parameters for compute capability

--- a/core/src/index/thirdparty/faiss/utils/distances_simd_avx.cpp
+++ b/core/src/index/thirdparty/faiss/utils/distances_simd_avx.cpp
@@ -188,16 +188,16 @@ float fvec_Linf_avx (const float* x, const float* y, size_t d) {
    return  _mm_cvtss_f32 (msum2);
 }

-const __m256i lookup = _mm256_setr_epi8(
-        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
-        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
-        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
-        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,
-
-        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
-        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
-        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
-        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
+#define DECLARE_LOOKUP \
+const __m256i lookup = _mm256_setr_epi8( \
+        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, \
+        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, \
+        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, \
+        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, \
+        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, \
+        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, \
+        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, \
+        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 \
 );

 int popcnt_AVX2_lookup(const uint8_t* data, const size_t n) {
@@ -218,6 +218,7 @@ int popcnt_AVX2_lookup(const uint8_t* data, const size_t n) {
        i += 32; \
    }

+    DECLARE_LOOKUP
    while (i + 8*32 <= n) {
        __m256i local = _mm256_setzero_si256();
        ITER ITER ITER ITER
@@ -271,6 +272,7 @@ int xor_popcnt_AVX2_lookup(const uint8_t* data1, const uint8_t* data2, const siz
        i += 32; \
    }

+    DECLARE_LOOKUP
    while (i + 8*32 <= n) {
        __m256i local = _mm256_setzero_si256();
        ITER ITER ITER ITER
@@ -324,6 +326,7 @@ int or_popcnt_AVX2_lookup(const uint8_t* data1, const uint8_t* data2, const size
        i += 32; \
    }

+    DECLARE_LOOKUP
    while (i + 8*32 <= n) {
        __m256i local = _mm256_setzero_si256();
        ITER ITER ITER ITER
@@ -377,6 +380,7 @@ int and_popcnt_AVX2_lookup(const uint8_t* data1, const uint8_t* data2, const siz
        i += 32; \
    }

+    DECLARE_LOOKUP
    while (i + 8*32 <= n) {
        __m256i local = _mm256_setzero_si256();
        ITER ITER ITER ITER

--- a/core/src/index/unittest/faiss_ori/CMakeLists.txt
+++ b/core/src/index/unittest/faiss_ori/CMakeLists.txt
@@ -47,4 +47,4 @@ if (KNOWHERE_GPU_VERSION)

    install(TARGETS test_gpu DESTINATION unittest)

-endif ()
\ No newline at end of file
+endif ()
--- a/docker/build_env/gpu/centos7/Dockerfile
+++ b/docker/build_env/gpu/centos7/Dockerfile
@@ -9,7 +9,7 @@
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 # or implied. See the License for the specific language governing permissions and limitations under the License.

-FROM nvidia/cuda:10.1-devel-centos7
+FROM nvidia/cuda:11.6.2-devel-centos7

 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility


--- a/docker/deploy/gpu/centos7/Dockerfile
+++ b/docker/deploy/gpu/centos7/Dockerfile
@@ -10,8 +10,9 @@
 # or implied. See the License for the specific language governing permissions and limitations under the License.

 FROM hectormolinero/tini:v18 AS tini
+FROM nvidia/cuda:11.6.2-runtime-centos7 AS runtime

-FROM nvidia/cuda:10.1-devel-centos7
+FROM nvidia/cuda:11.6.2-base-centos7

 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility

@@ -24,6 +25,10 @@ ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/var/lib/milvus/lib"

 COPY --from=tini /usr/bin/tini /tini

+# copy cublas library from runtime
+COPY --from=runtime /usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so.11 /var/lib/milvus/lib/libcublasLt.so.11
+COPY --from=runtime /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.11 /var/lib/milvus/lib/libcublas.so.11
+
 ENTRYPOINT ["/tini", "--"]

 WORKDIR /var/lib/milvus