Skip to content
This repository has been archived by the owner on Aug 16, 2023. It is now read-only.

Commit

Permalink
Add Sanity Check, Query after Build (#764)
Browse files Browse the repository at this point in the history
Signed-off-by: Patrick Weizhi Xu <[email protected]>
  • Loading branch information
PwzXxm authored Mar 22, 2023
1 parent a7cc1f7 commit f184542
Show file tree
Hide file tree
Showing 11 changed files with 100 additions and 30 deletions.
4 changes: 4 additions & 0 deletions knowhere/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ if (NOT TARGET knowhere)
target_include_directories(knowhere PUBLIC ${KNOWHERE_SOURCE_DIR}/knowere)
endif ()

# When the DiskANN option is enabled, export KNOWHERE_WITH_DISKANN as a PUBLIC
# compile definition: it is applied to the knowhere target itself and propagated
# to targets that link against it, so `#ifdef KNOWHERE_WITH_DISKANN` guards in
# headers see the macro.
if (KNOWHERE_WITH_DISKANN)
target_compile_definitions(knowhere PUBLIC KNOWHERE_WITH_DISKANN)
endif()

target_link_libraries(knowhere ${depend_libs})

set (KNOWHERE_INCLUDE_DIRS
Expand Down
26 changes: 26 additions & 0 deletions knowhere/index/VecIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,52 @@
#include <memory>
#include <utility>
#include <vector>
#include <climits>

#include "knowhere/common/Dataset.h"
#include "knowhere/common/Exception.h"
#include "knowhere/common/Typedef.h"
#include "knowhere/common/Utils.h"
#include "knowhere/index/Index.h"
#include "knowhere/index/IndexType.h"
#include "knowhere/index/vector_index/Statistics.h"
#include "knowhere/index/vector_index/adapter/VectorAdapter.h"
#include "knowhere/utils/BitsetView.h"
#ifdef KNOWHERE_WITH_DISKANN
#include "knowhere/index/vector_index/IndexDiskANNConfig.h"
#endif

namespace knowhere {

#define RAW_DATA "RAW_DATA"
#define QUANTIZATION_DATA "QUANTIZATION_DATA"

// Passed as the first argument to GenDataset when VecIndex::BuildAll creates its
// post-build sanity-check query (presumably the number of query vectors - confirm
// against GenDataset's signature).
const int64_t kSanityCheckNumberOfQueries = 1;

class VecIndex : public Index {
public:
virtual void
BuildAll(const DatasetPtr& dataset_ptr, const Config& config) {
Train(dataset_ptr, config);
AddWithoutIds(dataset_ptr, config);

// sanity check
auto dim_on_storage = Dim();
Config sanity_check_config = GenSanityCheckConfig(config);
if (IndexEnum::INDEX_FAISS_BIN_IDMAP == index_type_ || IndexEnum::INDEX_FAISS_BIN_IVFFLAT == index_type_) {
auto num_bits = CHAR_BIT * sizeof(float);
dim_on_storage = (dim_on_storage + num_bits - 1) / num_bits;
}

#ifdef KNOWHERE_WITH_DISKANN
if (IndexEnum::INDEX_DISKANN == index_type_) {
sanity_check_config = GenSanityCheckDiskANNConfig(sanity_check_config);
Prepare(sanity_check_config);
}
#endif
std::vector<float> query_data(dim_on_storage, 0);
auto query_dataset = GenDataset(kSanityCheckNumberOfQueries, Dim(), query_data.data());
Query(query_dataset, sanity_check_config, nullptr);
}

virtual void
Expand Down
44 changes: 30 additions & 14 deletions knowhere/index/vector_index/IndexAnnoy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,29 +79,45 @@ IndexAnnoy::Load(const BinarySet& index_binary) {
}

void
IndexAnnoy::BuildAll(const DatasetPtr& dataset_ptr, const Config& config) {
if (index_) {
// it is builded all
LOG_KNOWHERE_DEBUG_ << "IndexAnnoy::BuildAll: index_ has been built!";
return;
IndexAnnoy::Train(const DatasetPtr& dataset_ptr, const Config& config) {
    // Concrete Annoy index types, one per supported metric.
    using AnnoyL2Index = AnnoyIndex<int64_t, float, ::Euclidean, ::Kiss64Random, ThreadedBuildPolicy>;
    using AnnoyIPIndex = AnnoyIndex<int64_t, float, ::DotProduct, ::Kiss64Random, ThreadedBuildPolicy>;

    // Only allocates an empty index for the configured metric; no items are
    // added here. Any std::exception raised inside is rethrown through
    // KNOWHERE_THROW_MSG with the original message.
    try {
        // The macro is expected to introduce `dim` (used below) - confirm against its definition.
        GET_TENSOR_DATA_DIM(dataset_ptr)
        metric_type_ = GetMetaMetricType(config);
        if (metric_type_ == metric::L2) {
            index_ = std::make_shared<AnnoyL2Index>(dim);
        } else if (metric_type_ == metric::IP) {
            index_ = std::make_shared<AnnoyIPIndex>(dim);
        } else {
            KNOWHERE_THROW_MSG("metric not supported " + metric_type_);
        }
    } catch (std::exception& err) {
        KNOWHERE_THROW_MSG(err.what());
    }

    // A freshly trained index holds no built trees yet.
    is_build_ = false;
}

GET_TENSOR_DATA_DIM(dataset_ptr)
void
IndexAnnoy::AddWithoutIds(const DatasetPtr& dataset_ptr, const Config& config) {
if (!index_) {
KNOWHERE_THROW_MSG("index not initialize");
}

utils::SetBuildOmpThread(config);
metric_type_ = GetMetaMetricType(config);
if (metric_type_ == metric::L2) {
index_ = std::make_shared<AnnoyIndex<int64_t, float, ::Euclidean, ::Kiss64Random, ThreadedBuildPolicy>>(dim);
} else if (metric_type_ == metric::IP) {
index_ = std::make_shared<AnnoyIndex<int64_t, float, ::DotProduct, ::Kiss64Random, ThreadedBuildPolicy>>(dim);
} else {
KNOWHERE_THROW_MSG("metric not supported " + metric_type_);
// Annoy does not support `add` function, multiple calls will be ignored, same behaviour as before
if (is_build_) {
LOG_KNOWHERE_DEBUG_ << "IndexAnnoy::AddWithoutIds: index_ has been built! "
<< "Annoy not support build item dynamically, please invoke BuildAll interface.";
return;
}

GET_TENSOR_DATA_DIM(dataset_ptr)
utils::SetBuildOmpThread(config);
for (int i = 0; i < rows; ++i) {
index_->add_item(i, static_cast<const float*>(p_data) + dim * i);
}
index_->build(GetIndexParamNtrees(config));
is_build_ = true;
}

DatasetPtr
Expand Down
12 changes: 3 additions & 9 deletions knowhere/index/vector_index/IndexAnnoy.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,10 @@ class IndexAnnoy : public VecIndex {
Load(const BinarySet&) override;

void
BuildAll(const DatasetPtr&, const Config&) override;
Train(const DatasetPtr&, const Config&) override;

void
Train(const DatasetPtr&, const Config&) override {
KNOWHERE_THROW_MSG("Annoy not support build item dynamically, please invoke BuildAll interface.");
}

void
AddWithoutIds(const DatasetPtr&, const Config&) override {
KNOWHERE_THROW_MSG("Incremental index is not supported");
}
AddWithoutIds(const DatasetPtr&, const Config&) override;

DatasetPtr
GetVectorById(const DatasetPtr&, const Config&) override;
Expand All @@ -66,6 +59,7 @@ class IndexAnnoy : public VecIndex {
Size() override;

private:
bool is_build_ = false;
std::string metric_type_;
std::shared_ptr<ThreadPool> pool_;
std::shared_ptr<AnnoyIndexInterface<int64_t, float>> index_ = nullptr;
Expand Down
10 changes: 10 additions & 0 deletions knowhere/index/vector_index/IndexDiskANNConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,4 +290,14 @@ void
DiskANNQueryByRangeConfig::Set(Config& config, const DiskANNQueryByRangeConfig& query_conf) {
config[kDiskANNQueryByRangeConfig] = query_conf;
}

// Fixed DiskANN prepare/query settings that GenSanityCheckDiskANNConfig writes
// into the config it returns.
const DiskANNPrepareConfig kSanityCheckDiskANNPrepareConfig; // use default
// Both brace-initialized fields are set to kSanityCheckMinTopK.
// NOTE(review): presumably these are the top-k and the search list size - confirm
// against the DiskANNQueryConfig field order.
const DiskANNQueryConfig kSanityCheckDiskANNQueryConfig{kSanityCheckMinTopK, kSanityCheckMinTopK};

// Returns a copy of `build_config` with the sanity-check DiskANN prepare and
// query settings stored into it. The input config is not modified.
Config
GenSanityCheckDiskANNConfig(const Config& build_config) {
    Config sanity_config = build_config;

    // Install the prepare settings first, then the query settings.
    DiskANNPrepareConfig::Set(sanity_config, kSanityCheckDiskANNPrepareConfig);
    DiskANNQueryConfig::Set(sanity_config, kSanityCheckDiskANNQueryConfig);

    return sanity_config;
}
} // namespace knowhere
1 change: 1 addition & 0 deletions knowhere/index/vector_index/IndexDiskANNConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,5 @@ struct DiskANNQueryByRangeConfig {
Set(Config& config, const DiskANNQueryByRangeConfig& query_conf);
};

Config GenSanityCheckDiskANNConfig(const Config& build_config);
} // namespace knowhere
10 changes: 10 additions & 0 deletions knowhere/index/vector_index/helpers/IndexParameter.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,14 @@ GetFaissMetricType(const Config& cfg) {
return GetFaissMetricType(GetMetaMetricType(cfg));
}

// Value that GenSanityCheckConfig writes into the top-k, ef, nprobe and search_k
// settings of a sanity-check query config.
constexpr int64_t kSanityCheckMinTopK = 1;

// Derives a query config for the post-build sanity check: a copy of
// `build_config` in which top-k, ef, nprobe and search_k are all overwritten
// with kSanityCheckMinTopK. The input config is left untouched.
inline Config
GenSanityCheckConfig(const Config& build_config) {
    Config sanity_config = build_config;

    SetMetaTopk(sanity_config, kSanityCheckMinTopK);
    SetIndexParamEf(sanity_config, kSanityCheckMinTopK);
    SetIndexParamNprobe(sanity_config, kSanityCheckMinTopK);
    SetIndexParamSearchK(sanity_config, kSanityCheckMinTopK);

    return sanity_config;
}
} // namespace knowhere
16 changes: 12 additions & 4 deletions knowhere/index/vector_offset_index/IndexIVF_NM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,19 @@ void
IVF_NM::Load(const BinarySet& binary_set) {
    // Restore the index itself from the binary set.
    LoadImpl(binary_set, index_type_);

    // Construct arranged data from original data: the raw vectors are stored
    // under RAW_DATA and handed to ArrangeData together with their count.
    auto binary = binary_set.GetByName(RAW_DATA);
    auto ivf_index = static_cast<faiss::IndexIVF*>(index_.get());
    auto invlists = ivf_index->invlists;

    // Number of stored vectors = raw byte size / bytes per code.
    const size_t nb = binary->size / invlists->code_size;
    ArrangeData(nb, binary->data.get());
}

void
IVF_NM::ArrangeData(const size_t n, const uint8_t* data) {
auto ivf_index = static_cast<faiss::IndexIVF*>(index_.get());
auto invlists = ivf_index->invlists;
auto d = ivf_index->d;
ivf_index->prefix_sum.resize(invlists->nlist + 1);
size_t curr_index = 0;

Expand All @@ -78,12 +85,12 @@ IVF_NM::Load(const BinarySet& binary_set) {

#ifndef KNOWHERE_GPU_VERSION
auto ails = dynamic_cast<faiss::ArrayInvertedLists*>(invlists);
ivf_index->arranged_codes.resize(d * nb * sizeof(float));
ivf_index->arranged_codes.resize(d * n * sizeof(float));
for (size_t i = 0; i < invlists->nlist; i++) {
auto list_size = ails->ids[i].size();
for (size_t j = 0; j < list_size; j++) {
memcpy(ivf_index->arranged_codes.data() + d * (curr_index + j) * sizeof(float),
binary->data.get() + d * ails->ids[i][j] * sizeof(float), d * sizeof(float));
data + d * ails->ids[i][j] * sizeof(float), d * sizeof(float));
}
ivf_index->prefix_sum[i] = curr_index;
curr_index += list_size;
Expand All @@ -98,7 +105,7 @@ IVF_NM::Load(const BinarySet& binary_set) {
auto list_size = lengths[i];
for (size_t j = 0; j < list_size; j++) {
memcpy(arranged_data + d * (curr_index + j),
binary->data.get() + d * rol_ids[curr_index + j] * sizeof(float),
data + d * rol_ids[curr_index + j] * sizeof(float),
d * sizeof(float));
}
ivf_index->prefix_sum[i] = curr_index;
Expand Down Expand Up @@ -136,6 +143,7 @@ IVF_NM::AddWithoutIds(const DatasetPtr& dataset_ptr, const Config& config) {

GET_TENSOR_DATA(dataset_ptr)
index_->add_without_codes(rows, reinterpret_cast<const float*>(p_data));
ArrangeData(rows, reinterpret_cast<const uint8_t*>(p_data));
}

DatasetPtr
Expand Down
3 changes: 3 additions & 0 deletions knowhere/index/vector_offset_index/IndexIVF_NM.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ class IVF_NM : public VecIndex, public OffsetBaseIndex {

// ro_codes: if GPU, hold a ptr of read only codes so that destruction won't be done twice
faiss::PageLockMemoryPtr ro_codes_ = nullptr;

private:
// Rebuilds the index's prefix_sum and arranged code buffer from raw vector bytes.
// `n` is the number of vectors used to size the arranged buffer; `data` points at
// the raw bytes, read as `d` floats per vector and indexed by the ids held in the
// inverted lists.
void ArrangeData(const size_t n, const uint8_t* data);
};

using IVFNMPtr = std::shared_ptr<IVF_NM>;
Expand Down
2 changes: 0 additions & 2 deletions unittest/test_annoy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,8 @@ TEST_P(AnnoyTest, annoy_basic) {

// null faiss index
{
ASSERT_ANY_THROW(index_->Train(base_dataset, conf_));
ASSERT_ANY_THROW(index_->Query(query_dataset, conf_, nullptr));
ASSERT_ANY_THROW(index_->Serialize(conf_));
ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_));
ASSERT_ANY_THROW(index_->Count());
ASSERT_ANY_THROW(index_->Dim());
}
Expand Down
2 changes: 1 addition & 1 deletion unittest/test_async.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ TEST_P(AsyncIndexTest, async_query_thread_num) {
index_->BuildAll(base_dataset, conf_);
int32_t num_threads_after_build = knowhere::threadchecker::GetThreadNum(pid);
EXPECT_GE(knowhere::threadchecker::GetBuildOmpThread(conf_),
num_threads_after_build - num_threads_before_build + 1);
num_threads_after_build - num_threads_before_build);
for (int i = 0; i < kQuerySum; i++) {
index_->QueryAsync(query_dataset, conf_, nullptr);
}
Expand Down

0 comments on commit f184542

Please sign in to comment.