未验证 提交 b10cc1c4 编写于 作者: J Ji Bin 提交者: GitHub

Adaptation for cuda11 and legacy gpu/cpu (#17143)

- compile gpu with cuda11
  - add support for sm35, sm50 for legacy GPUs
  - add sm86 for ampere GPU, like RTX 30xx, Tesla Ax
  - support cpu only support SSE42
Signed-off-by: NJi Bin <matrixji@live.com>
上级 de8050d5
......@@ -168,7 +168,7 @@ if (MILVUS_GPU_VERSION)
message(STATUS "Building Milvus GPU version")
add_compile_definitions("MILVUS_GPU_VERSION")
enable_language(CUDA)
find_package(CUDA 10 REQUIRED)
find_package(CUDA 11 REQUIRED)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fPIC -std=c++11 -D_FORCE_INLINES --expt-extended-lambda")
else ()
message(STATUS "Building Milvus CPU version")
......
......@@ -79,7 +79,7 @@ if (MILVUS_GPU_VERSION OR KNOWHERE_GPU_VERSION)
message(STATUS "Building Knowhere GPU version")
add_compile_definitions("MILVUS_GPU_VERSION")
enable_language(CUDA)
find_package(CUDA 10 REQUIRED)
find_package(CUDA 11 REQUIRED)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fPIC -std=c++11 -D_FORCE_INLINES --expt-extended-lambda")
else ()
message(STATUS "Building Knowhere CPU version")
......
......@@ -513,7 +513,7 @@ macro(build_faiss)
set(FAISS_CONFIGURE_ARGS
"--prefix=${FAISS_PREFIX}"
"CFLAGS=${EP_C_FLAGS}"
"CXXFLAGS=${EP_CXX_FLAGS} -mf16c -O3"
"CXXFLAGS=${EP_CXX_FLAGS} -msse4.2 -O3"
--without-python)
if (FAISS_WITH_MKL)
......@@ -535,7 +535,7 @@ macro(build_faiss)
if (KNOWHERE_GPU_VERSION)
set(FAISS_CONFIGURE_ARGS ${FAISS_CONFIGURE_ARGS}
"--with-cuda=${CUDA_TOOLKIT_ROOT_DIR}"
"--with-cuda-arch=-gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75"
"--with-cuda-arch=-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_86,code=sm_86"
)
else ()
set(FAISS_CONFIGURE_ARGS ${FAISS_CONFIGURE_ARGS}
......
......@@ -45,11 +45,11 @@ libfaiss.$(SHAREDEXT): $(OBJ)
# support avx
%avx.o: %avx.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx2 -c $< -o $@
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mf16c -mavx2 -c $< -o $@
# support avx512
%avx512.o: %avx512.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx512f -mavx512dq -mavx512bw -c $< -o $@
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mf16c -mavx512f -mavx512dq -mavx512bw -c $< -o $@
%.o: %.cu
$(NVCC) $(NVCCFLAGS) -c $< -o $@
......
......@@ -13,7 +13,7 @@
namespace faiss { namespace gpu {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ <= 750
#if __CUDA_ARCH__ <= 860
constexpr int kWarpSize = 32;
#else
#error Unknown __CUDA_ARCH__; please define parameters for compute capability
......
......@@ -188,16 +188,16 @@ float fvec_Linf_avx (const float* x, const float* y, size_t d) {
return _mm_cvtss_f32 (msum2);
}
const __m256i lookup = _mm256_setr_epi8(
/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,
/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
#define DECLARE_LOOKUP \
const __m256i lookup = _mm256_setr_epi8( \
/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, \
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, \
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, \
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, \
/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, \
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, \
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, \
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 \
);
int popcnt_AVX2_lookup(const uint8_t* data, const size_t n) {
......@@ -218,6 +218,7 @@ int popcnt_AVX2_lookup(const uint8_t* data, const size_t n) {
i += 32; \
}
DECLARE_LOOKUP
while (i + 8*32 <= n) {
__m256i local = _mm256_setzero_si256();
ITER ITER ITER ITER
......@@ -271,6 +272,7 @@ int xor_popcnt_AVX2_lookup(const uint8_t* data1, const uint8_t* data2, const siz
i += 32; \
}
DECLARE_LOOKUP
while (i + 8*32 <= n) {
__m256i local = _mm256_setzero_si256();
ITER ITER ITER ITER
......@@ -324,6 +326,7 @@ int or_popcnt_AVX2_lookup(const uint8_t* data1, const uint8_t* data2, const size
i += 32; \
}
DECLARE_LOOKUP
while (i + 8*32 <= n) {
__m256i local = _mm256_setzero_si256();
ITER ITER ITER ITER
......@@ -377,6 +380,7 @@ int and_popcnt_AVX2_lookup(const uint8_t* data1, const uint8_t* data2, const siz
i += 32; \
}
DECLARE_LOOKUP
while (i + 8*32 <= n) {
__m256i local = _mm256_setzero_si256();
ITER ITER ITER ITER
......
......@@ -47,4 +47,4 @@ if (KNOWHERE_GPU_VERSION)
install(TARGETS test_gpu DESTINATION unittest)
endif ()
\ No newline at end of file
endif ()
......@@ -9,7 +9,7 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under the License.
FROM nvidia/cuda:10.1-devel-centos7
FROM nvidia/cuda:11.6.2-devel-centos7
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
......
......@@ -10,8 +10,9 @@
# or implied. See the License for the specific language governing permissions and limitations under the License.
FROM hectormolinero/tini:v18 AS tini
FROM nvidia/cuda:11.6.2-runtime-centos7 AS runtime
FROM nvidia/cuda:10.1-devel-centos7
FROM nvidia/cuda:11.6.2-base-centos7
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
......@@ -24,6 +25,10 @@ ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/var/lib/milvus/lib"
COPY --from=tini /usr/bin/tini /tini
# copy cublas library from runtime
COPY --from=runtime /usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so.11 /var/lib/milvus/lib/libcublasLt.so.11
COPY --from=runtime /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.11 /var/lib/milvus/lib/libcublas.so.11
ENTRYPOINT ["/tini", "--"]
WORKDIR /var/lib/milvus
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册