diff --git a/include/knowhere/config.h b/include/knowhere/config.h index 208d1ce31..faf36d6cb 100644 --- a/include/knowhere/config.h +++ b/include/knowhere/config.h @@ -508,6 +508,8 @@ class BaseConfig : public Config { CFG_BOOL retrieve_friendly; CFG_STRING data_path; CFG_STRING index_prefix; + // the size (in GB) of the raw vector data + CFG_FLOAT vec_field_size_gb; // for distance metrics, we search for vectors with distance in [range_filter, radius). // for similarity metrics, we search for vectors with similarity in (radius, range_filter]. CFG_FLOAT radius; @@ -559,6 +561,10 @@ class BaseConfig : public Config { .allow_empty_without_default() .for_train() .for_deserialize(); + KNOWHERE_CONFIG_DECLARE_FIELD(vec_field_size_gb) + .description("the size (in GB) of the raw vector data.") + .set_default(0) + .for_train(); KNOWHERE_CONFIG_DECLARE_FIELD(k) .set_default(10) .description("search for top k similar vector.") diff --git a/src/index/diskann/diskann_config.h b/src/index/diskann/diskann_config.h index 660e1902d..db94bd922 100644 --- a/src/index/diskann/diskann_config.h +++ b/src/index/diskann/diskann_config.h @@ -33,6 +33,11 @@ class DiskANNConfig : public BaseConfig { // complexity. Plz set this value larger than the max_degree unless you need to build indices really quickly and can // somewhat compromise on quality. CFG_INT search_list_size; + + // The ratio of the size reserved for the pq code to the size of the raw data (defined with vec_field_size_gb) + // This parameter will replace pq_code_budget_gb to avoid calculating the actual size on the Milvus side. + // The index can indirectly obtain pq_code_budget_gb by vec_field_size_gb * pq_code_budget_gb_ratio + CFG_FLOAT pq_code_budget_gb_ratio; // Limit the size of the PQ code after the raw vector has been PQ-encoded. PQ code is a (pq_code_budget_gb * 1024 * // 1024 * 1024) / row_num)-dimensional uint8 vector. If pq_code_budget_gb is too large, it will be adjusted to the // size of dim*row_num. 
@@ -50,6 +55,12 @@ class DiskANNConfig : public BaseConfig { // This is the flag to enable fast build, in which we will not build vamana graph by full 2 round. This can // accelerate index build ~30% with an ~1% recall regression. CFG_BOOL accelerate_build; + + // The ratio of the size reserved for the search cache to the size of the raw data (defined with vec_field_size_gb) + // This parameter will replace search_cache_budget_gb to avoid calculating the actual size on the Milvus side. + // The index can indirectly obtain search_cache_budget_gb by vec_field_size_gb * search_cache_budget_gb_ratio + CFG_FLOAT search_cache_budget_gb_ratio; + // While serving the index, the entire graph is stored on SSD. For faster search performance, you can cache a few // frequently accessed nodes in memory. CFG_FLOAT search_cache_budget_gb; @@ -86,12 +97,19 @@ class DiskANNConfig : public BaseConfig { .for_search() .for_range_search() .for_iterator(); + KNOWHERE_CONFIG_DECLARE_FIELD(pq_code_budget_gb_ratio) + .description("the size of PQ compared with vector field data") + .set_default(0) + .set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max()) + .for_train(); KNOWHERE_CONFIG_DECLARE_FIELD(pq_code_budget_gb) .description("the size of PQ compressed representation in GB.") + .set_default(0) .set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max()) .for_train(); KNOWHERE_CONFIG_DECLARE_FIELD(build_dram_budget_gb) .description("limit on the memory allowed for building the index in GB.") + .set_default(0) .set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max()) .for_train(); KNOWHERE_CONFIG_DECLARE_FIELD(disk_pq_dims) @@ -102,6 +120,12 @@ class DiskANNConfig : public BaseConfig { .description("a flag to enbale fast build.") .set_default(false) .for_train(); + KNOWHERE_CONFIG_DECLARE_FIELD(search_cache_budget_gb_ratio) + .description("the ratio of the size reserved for the search cache to the size of the raw data.") + .set_default(0) + 
.set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max()) + .for_train() + .for_deserialize(); KNOWHERE_CONFIG_DECLARE_FIELD(search_cache_budget_gb) .description("the size of cached nodes in GB.") .set_default(0) @@ -148,6 +172,10 @@ class DiskANNConfig : public BaseConfig { if (!search_list_size.has_value()) { search_list_size = kDefaultSearchListSizeForBuild; } + pq_code_budget_gb = + std::max(pq_code_budget_gb.value(), pq_code_budget_gb_ratio.value() * vec_field_size_gb.value()); + search_cache_budget_gb = std::max(search_cache_budget_gb.value(), + search_cache_budget_gb_ratio.value() * vec_field_size_gb.value()); break; } case PARAM_TYPE::SEARCH: { diff --git a/tests/ut/test_diskann.cc b/tests/ut/test_diskann.cc index f59154670..666fda856 100644 --- a/tests/ut/test_diskann.cc +++ b/tests/ut/test_diskann.cc @@ -64,6 +64,52 @@ WriteRawDataToDisk(const std::string data_path, const DataType* raw_data, const } } // namespace +TEST_CASE("Valid diskann build params test", "[diskann]") { + int rows_num = 1000000; + auto version = GenTestVersionList(); + + auto ratio = GENERATE(as<float>{}, 0.01, 0.1, 0.125); + + float pq_code_budget_gb = sizeof(float) * kDim * rows_num * 0.125 / (1024 * 1024 * 1024); + float search_cache_budget_gb = sizeof(float) * kDim * rows_num * 0.05 / (1024 * 1024 * 1024); + + auto test_gen = [&]() { + knowhere::Json json; + json["dim"] = kDim; + json["metric_type"] = "L2"; + json["k"] = 100; + json["index_prefix"] = kL2IndexPrefix; + json["data_path"] = kRawDataPath; + json["max_degree"] = 24; + json["search_list_size"] = 64; + json["vec_field_size_gb"] = 1.0; + json["pq_code_budget_gb_ratio"] = ratio; + json["pq_code_budget_gb"] = pq_code_budget_gb; + json["build_dram_budget_gb"] = 32.0; + json["search_cache_budget_gb_ratio"] = ratio; + json["search_cache_budget_gb"] = search_cache_budget_gb; + json["beamwidth"] = 8; + json["min_k"] = 10; + json["max_k"] = 8000; + return json; + }; + + SECTION("Dynamic param check") { + knowhere::Json test_json = test_gen(); + + auto 
cfg = knowhere::IndexStaticFaced::CreateConfig(knowhere::IndexEnum::INDEX_DISKANN, version); + knowhere::Json json_(test_json); + std::string msg; + auto res = knowhere::Config::FormatAndCheck(*cfg, json_, &msg); + REQUIRE(res == knowhere::Status::success); + res = knowhere::Config::Load(*cfg, json_, knowhere::PARAM_TYPE::TRAIN, &msg); + REQUIRE(res == knowhere::Status::success); + + knowhere::DiskANNConfig diskCfg = static_cast<const knowhere::DiskANNConfig&>(*cfg); + REQUIRE(diskCfg.pq_code_budget_gb == std::max(pq_code_budget_gb, 1.0f * ratio)); + REQUIRE(diskCfg.search_cache_budget_gb == std::max(search_cache_budget_gb, 1.0f * ratio)); + } +} TEST_CASE("Invalid diskann params test", "[diskann]") { fs::remove_all(kDir);