未验证 提交 a474ff26 编写于 作者: Y yukun 提交者: GitHub

Server down during loading data (#2807)

* Server down during loading data
Signed-off-by: Nfishpenguin <kun.yu@zilliz.com>

* Fix test_config.cpp
Signed-off-by: Nfishpenguin <kun.yu@zilliz.com>

* ci retry
Signed-off-by: Nfishpenguin <kun.yu@zilliz.com>

* Change limit_in_bytes init value to max
Signed-off-by: Nfishpenguin <kun.yu@zilliz.com>
Co-authored-by: NJin Hai <hai.jin@zilliz.com>
上级 585c8ea0
......@@ -20,6 +20,7 @@ Please mark all change in change log and use the issue from GitHub
- \#2752 Milvus formats vectors data to double-precision and return to http client
- \#2767 Fix a bug of getting wrong nprobe limitation in knowhere on GPU version
- \#2768 After building the index, the number of vectors increases
- \#2774 Server down during loading data
- \#2776 Fix too many data copies during creating IVF index
- \#2813 To implement RNSG IP
......
......@@ -16,6 +16,7 @@
#include <chrono>
#include <fstream>
#include <iostream>
#include <limits>
#include <regex>
#include <string>
#include <thread>
......@@ -1333,6 +1334,15 @@ Config::CheckCacheConfigCpuCacheCapacity(const std::string& value) {
int64_t total_mem = 0, free_mem = 0;
CommonUtil::GetSystemMemInfo(total_mem, free_mem);
int64_t cgroup_limit_mem = std::numeric_limits<int64_t>::max();
CommonUtil::GetSysCgroupMemLimit(cgroup_limit_mem);
if (cgroup_limit_mem < total_mem && cache_size >= cgroup_limit_mem) {
std::string msg = "Invalid cpu cache size: " + value +
". Possible reason: cache.cache_size exceeds system cgroup memory.";
return Status{SERVER_INVALID_ARGUMENT, msg};
}
if (cache_size >= total_mem) {
std::string msg =
"Invalid cpu cache size: " + value + ". Possible reason: cache.cache_size exceeds system memory.";
......
......@@ -53,6 +53,19 @@ CommonUtil::GetSystemMemInfo(int64_t& total_mem, int64_t& free_mem) {
return ret == 0; // succeed 0, failed -1
}
// Reads the memory limit stored in /sys/fs/cgroup/memory/memory.limit_in_bytes.
//
// @param limit_in_bytes  Receives the parsed limit on success. It is left untouched when the
//                        file cannot be opened or does not start with a valid integer, so a
//                        default set by the caller (e.g. "no limit") survives a failed read.
// @return true if a limit was read and stored in limit_in_bytes, false otherwise.
bool
CommonUtil::GetSysCgroupMemLimit(int64_t& limit_in_bytes) {
    try {
        std::ifstream file("/sys/fs/cgroup/memory/memory.limit_in_bytes");
        if (!file.is_open()) {
            // std::ifstream reports a missing/unreadable file through its state, not by throwing.
            // The file is absent when no cgroup v1 memory controller is mounted, so this is
            // reported to the caller through the return value only.
            return false;
        }

        // Parse into a temporary: a failed extraction would otherwise overwrite the caller's value.
        int64_t parsed_limit = 0;
        if (!(file >> parsed_limit)) {
            LOG_SERVER_ERROR_ << "Failed to read /sys/fs/cgroup/memory/memory.limit_in_bytes, reason: "
                                 "content is not a valid integer";
            return false;
        }

        limit_in_bytes = parsed_limit;
        return true;
    } catch (const std::exception& ex) {
        std::string msg =
            "Failed to read /sys/fs/cgroup/memory/memory.limit_in_bytes, reason: " + std::string(ex.what());
        LOG_SERVER_ERROR_ << msg;
        return false;
    }
}
bool
CommonUtil::GetSystemAvailableThreads(int64_t& thread_count) {
// threadCnt = std::thread::hardware_concurrency();
......
......@@ -24,6 +24,8 @@ class CommonUtil {
static bool
GetSystemMemInfo(int64_t& total_mem, int64_t& free_mem);
// Reads the value stored in /sys/fs/cgroup/memory/memory.limit_in_bytes into limit_in_bytes.
// NOTE(review): confirm the meaning of the bool return value against the definition.
static bool
GetSysCgroupMemLimit(int64_t& limit_in_bytes);
static bool
GetSystemAvailableThreads(int64_t& thread_count);
static bool
......
......@@ -223,35 +223,35 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) {
ASSERT_TRUE(config.GetStorageConfigPath(str_val).ok());
ASSERT_TRUE(str_val == storage_primary_path);
// bool storage_s3_enable = true;
// ASSERT_TRUE(config.SetStorageConfigS3Enable(std::to_string(storage_s3_enable)).ok());
// ASSERT_TRUE(config.GetStorageConfigS3Enable(bool_val).ok());
// ASSERT_TRUE(bool_val == storage_s3_enable);
//
// std::string storage_s3_addr = "192.168.1.100";
// ASSERT_TRUE(config.SetStorageConfigS3Address(storage_s3_addr).ok());
// ASSERT_TRUE(config.GetStorageConfigS3Address(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_addr);
//
// std::string storage_s3_port = "12345";
// ASSERT_TRUE(config.SetStorageConfigS3Port(storage_s3_port).ok());
// ASSERT_TRUE(config.GetStorageConfigS3Port(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_port);
//
// std::string storage_s3_access_key = "minioadmin";
// ASSERT_TRUE(config.SetStorageConfigS3AccessKey(storage_s3_access_key).ok());
// ASSERT_TRUE(config.GetStorageConfigS3AccessKey(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_access_key);
//
// std::string storage_s3_secret_key = "minioadmin";
// ASSERT_TRUE(config.SetStorageConfigS3SecretKey(storage_s3_secret_key).ok());
// ASSERT_TRUE(config.GetStorageConfigS3SecretKey(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_secret_key);
//
// std::string storage_s3_bucket = "s3bucket";
// ASSERT_TRUE(config.SetStorageConfigS3Bucket(storage_s3_bucket).ok());
// ASSERT_TRUE(config.GetStorageConfigS3Bucket(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_bucket);
// bool storage_s3_enable = true;
// ASSERT_TRUE(config.SetStorageConfigS3Enable(std::to_string(storage_s3_enable)).ok());
// ASSERT_TRUE(config.GetStorageConfigS3Enable(bool_val).ok());
// ASSERT_TRUE(bool_val == storage_s3_enable);
//
// std::string storage_s3_addr = "192.168.1.100";
// ASSERT_TRUE(config.SetStorageConfigS3Address(storage_s3_addr).ok());
// ASSERT_TRUE(config.GetStorageConfigS3Address(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_addr);
//
// std::string storage_s3_port = "12345";
// ASSERT_TRUE(config.SetStorageConfigS3Port(storage_s3_port).ok());
// ASSERT_TRUE(config.GetStorageConfigS3Port(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_port);
//
// std::string storage_s3_access_key = "minioadmin";
// ASSERT_TRUE(config.SetStorageConfigS3AccessKey(storage_s3_access_key).ok());
// ASSERT_TRUE(config.GetStorageConfigS3AccessKey(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_access_key);
//
// std::string storage_s3_secret_key = "minioadmin";
// ASSERT_TRUE(config.SetStorageConfigS3SecretKey(storage_s3_secret_key).ok());
// ASSERT_TRUE(config.GetStorageConfigS3SecretKey(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_secret_key);
//
// std::string storage_s3_bucket = "s3bucket";
// ASSERT_TRUE(config.SetStorageConfigS3Bucket(storage_s3_bucket).ok());
// ASSERT_TRUE(config.GetStorageConfigS3Bucket(str_val).ok());
// ASSERT_TRUE(str_val == storage_s3_bucket);
/* metric config */
bool metric_enable_monitor = false;
......@@ -294,8 +294,15 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) {
// #2564
int64_t total_mem = 0, free_mem = 0;
milvus::server::CommonUtil::GetSystemMemInfo(total_mem, free_mem);
int64_t cgroup_limit_size = 0;
milvus::server::CommonUtil::GetSysCgroupMemLimit(cgroup_limit_size);
ASSERT_TRUE(config.SetCacheConfigInsertBufferSize("1GB").ok());
int64_t cache_cpu_cache_size = total_mem / 2;
int64_t cache_cpu_cache_size = 0;
if (cgroup_limit_size < total_mem) {
cache_cpu_cache_size = cgroup_limit_size / 2;
} else {
cache_cpu_cache_size = total_mem / 2;
}
float cache_cpu_cache_threshold = 0.7;
ASSERT_TRUE(config.SetCacheConfigCpuCacheThreshold(std::to_string(cache_cpu_cache_threshold)).ok());
ASSERT_TRUE(config.SetCacheConfigCpuCacheCapacity(std::to_string(cache_cpu_cache_size)).ok());
......@@ -306,14 +313,20 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) {
{
int64_t total_mem = 0, free_mem = 0;
milvus::server::CommonUtil::GetSystemMemInfo(total_mem, free_mem);
int64_t cgroup_limit_size = 0;
milvus::server::CommonUtil::GetSysCgroupMemLimit(cgroup_limit_size);
ASSERT_TRUE(config.SetCacheConfigInsertBufferSize("1GB").ok());
int64_t cache_cpu_cache_size = total_mem - 1073741824 - 1; // total_size - 1GB - 1
int64_t cache_cpu_cache_size = 0;
if (cgroup_limit_size < total_mem) {
cache_cpu_cache_size = cgroup_limit_size - 1073741824 - 1;
} else {
cache_cpu_cache_size = total_mem - 1073741824 - 1; // total_size - 1GB - 1
}
ASSERT_TRUE(config.SetCacheConfigCpuCacheCapacity(std::to_string(cache_cpu_cache_size)).ok());
ASSERT_TRUE(config.GetCacheConfigCpuCacheCapacity(int64_val).ok());
ASSERT_TRUE(int64_val == cache_cpu_cache_size);
}
/* engine config */
int64_t engine_use_blas_threshold = 50;
ASSERT_TRUE(config.SetEngineConfigUseBlasThreshold(std::to_string(engine_use_blas_threshold)).ok());
......@@ -389,7 +402,7 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) {
ASSERT_TRUE(config.GetWalConfigRecoveryErrorIgnore(bool_val).ok());
ASSERT_TRUE(bool_val == wal_recovery_ignore);
int64_t wal_buffer_size = 128 * 1024 * 1024; // 128 M
int64_t wal_buffer_size = 128 * 1024 * 1024; // 128 M
ASSERT_TRUE(config.SetWalConfigBufferSize(std::to_string(wal_buffer_size)).ok());
ASSERT_TRUE(config.GetWalConfigBufferSize(int64_val).ok());
ASSERT_TRUE(int64_val == wal_buffer_size);
......@@ -419,7 +432,7 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) {
auto s = config.SetLogsMaxLogFileSize(logs_max_log_file_size);
ASSERT_TRUE(s.ok()) << s.message();
ASSERT_TRUE(config.GetLogsMaxLogFileSize(int64_val).ok());
ASSERT_TRUE(int64_val == 1000 * 1024 * 1024); // 1000MB
ASSERT_TRUE(int64_val == 1000 * 1024 * 1024); // 1000MB
int64_t logs_log_rotate_num = 100;
ASSERT_TRUE(config.SetLogsLogRotateNum(std::to_string(logs_log_rotate_num)).ok());
......@@ -498,8 +511,6 @@ TEST_F(ConfigTest, SERVER_CONFIG_CLI_TEST) {
s = config.ProcessConfigCli(result, get_cmd);
ASSERT_TRUE(s.ok());
/* cache config */
std::string cache_cpu_cache_capacity = "1";
get_cmd = gen_get_command(ms::CONFIG_CACHE, ms::CONFIG_CACHE_CPU_CACHE_CAPACITY);
......@@ -681,7 +692,6 @@ TEST_F(ConfigTest, SERVER_CONFIG_INVALID_TEST) {
ASSERT_FALSE(config.SetDBConfigArchiveDaysThreshold("0x10").ok());
/* storage config */
ASSERT_FALSE(config.SetStorageConfigPath("").ok());
ASSERT_FALSE(config.SetStorageConfigPath("./milvus").ok());
......@@ -691,18 +701,18 @@ TEST_F(ConfigTest, SERVER_CONFIG_INVALID_TEST) {
ASSERT_FALSE(config.SetStorageConfigAutoFlushInterval("0.1").ok());
// ASSERT_FALSE(config.SetStorageConfigS3Enable("10").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3Address("127.0.0").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3Port("100").ok());
// ASSERT_FALSE(config.SetStorageConfigS3Port("100000").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3AccessKey("").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3SecretKey("").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3Bucket("").ok());
// ASSERT_FALSE(config.SetStorageConfigS3Enable("10").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3Address("127.0.0").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3Port("100").ok());
// ASSERT_FALSE(config.SetStorageConfigS3Port("100000").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3AccessKey("").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3SecretKey("").ok());
//
// ASSERT_FALSE(config.SetStorageConfigS3Bucket("").ok());
/* metric config */
ASSERT_FALSE(config.SetMetricConfigEnableMonitor("Y").ok());
......@@ -1288,8 +1298,8 @@ TEST_F(ConfigTest, SERVER_CONFIG_UPDATE_TEST) {
std::string reply_set, reply_get;
std::string cmd_set, cmd_get;
auto lambda = [&conf_file](const std::string& key, const std::string& child_key,
const std::string& default_value, std::string& value) {
auto lambda = [&conf_file](const std::string& key, const std::string& child_key, const std::string& default_value,
std::string& value) {
auto* ymgr = milvus::server::YamlConfigMgr::GetInstance();
auto status = ymgr->LoadConfigFile(conf_file);
......@@ -1310,52 +1320,58 @@ TEST_F(ConfigTest, SERVER_CONFIG_UPDATE_TEST) {
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_CACHE, ms::CONFIG_CACHE_INSERT_BUFFER_SIZE,
ms::CONFIG_CACHE_INSERT_BUFFER_SIZE_DEFAULT, yaml_value).ok());
ms::CONFIG_CACHE_INSERT_BUFFER_SIZE_DEFAULT, yaml_value)
.ok());
ASSERT_EQ("2", yaml_value);
// test boolean config value
cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "True");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR,
ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT,
yaml_value)
.ok());
ASSERT_EQ("true", yaml_value);
cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "On");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR,
ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT,
yaml_value)
.ok());
ASSERT_EQ("true", yaml_value);
cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "False");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR,
ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT,
yaml_value)
.ok());
ASSERT_EQ("false", yaml_value);
cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "Off");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR,
ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT,
yaml_value)
.ok());
ASSERT_EQ("false", yaml_value);
// test path
cmd_set = gen_set_command(ms::CONFIG_STORAGE, ms::CONFIG_STORAGE_PATH, "/tmp/milvus_config_unittest");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_STORAGE, ms::CONFIG_STORAGE_PATH,
ms::CONFIG_STORAGE_PATH_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_STORAGE, ms::CONFIG_STORAGE_PATH, ms::CONFIG_STORAGE_PATH_DEFAULT, yaml_value).ok());
ASSERT_EQ("/tmp/milvus_config_unittest", yaml_value);
#ifdef MILVUS_GPU_VERSION
cmd_set = gen_set_command(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES, "gpu0");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES,
ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value).ok());
ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value)
.ok());
ASSERT_EQ("gpu0", yaml_value);
cmd_set = gen_set_command(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES, "GPU0");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES,
ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value).ok());
ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value)
.ok());
ASSERT_EQ("gpu0", yaml_value);
#endif
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册