From f18454228ffcdc618f2b977cd6105fda1afbb748 Mon Sep 17 00:00:00 2001 From: Patrick Weizhi Xu Date: Wed, 22 Mar 2023 23:09:56 +0800 Subject: [PATCH] Add Sanity Check, Query after Build (#764) Signed-off-by: Patrick Weizhi Xu --- knowhere/CMakeLists.txt | 4 ++ knowhere/index/VecIndex.h | 26 +++++++++++ knowhere/index/vector_index/IndexAnnoy.cpp | 44 +++++++++++++------ knowhere/index/vector_index/IndexAnnoy.h | 12 ++--- .../index/vector_index/IndexDiskANNConfig.cpp | 10 +++++ .../index/vector_index/IndexDiskANNConfig.h | 1 + .../vector_index/helpers/IndexParameter.h | 10 +++++ .../index/vector_offset_index/IndexIVF_NM.cpp | 16 +++++-- .../index/vector_offset_index/IndexIVF_NM.h | 3 ++ unittest/test_annoy.cpp | 2 - unittest/test_async.cpp | 2 +- 11 files changed, 100 insertions(+), 30 deletions(-) diff --git a/knowhere/CMakeLists.txt b/knowhere/CMakeLists.txt index 148fcc411..090cab25f 100644 --- a/knowhere/CMakeLists.txt +++ b/knowhere/CMakeLists.txt @@ -125,6 +125,10 @@ if (NOT TARGET knowhere) target_include_directories(knowhere PUBLIC ${KNOWHERE_SOURCE_DIR}/knowere) endif () +if (KNOWHERE_WITH_DISKANN) + target_compile_definitions(knowhere PUBLIC KNOWHERE_WITH_DISKANN) +endif() + target_link_libraries(knowhere ${depend_libs}) set (KNOWHERE_INCLUDE_DIRS diff --git a/knowhere/index/VecIndex.h b/knowhere/index/VecIndex.h index 0900c5d0e..0b31b0571 100644 --- a/knowhere/index/VecIndex.h +++ b/knowhere/index/VecIndex.h @@ -14,26 +14,52 @@ #include #include #include +#include #include "knowhere/common/Dataset.h" #include "knowhere/common/Exception.h" #include "knowhere/common/Typedef.h" +#include "knowhere/common/Utils.h" #include "knowhere/index/Index.h" #include "knowhere/index/IndexType.h" #include "knowhere/index/vector_index/Statistics.h" +#include "knowhere/index/vector_index/adapter/VectorAdapter.h" #include "knowhere/utils/BitsetView.h" +#ifdef KNOWHERE_WITH_DISKANN +#include "knowhere/index/vector_index/IndexDiskANNConfig.h" +#endif namespace knowhere { #define RAW_DATA "RAW_DATA" #define QUANTIZATION_DATA "QUANTIZATION_DATA" +const int64_t kSanityCheckNumberOfQueries = 1; + class VecIndex : public Index { public: virtual void BuildAll(const DatasetPtr& dataset_ptr, const Config& config) { Train(dataset_ptr, config); AddWithoutIds(dataset_ptr, config); + + // sanity check + auto dim_on_storage = Dim(); + Config sanity_check_config = GenSanityCheckConfig(config); + if (IndexEnum::INDEX_FAISS_BIN_IDMAP == index_type_ || IndexEnum::INDEX_FAISS_BIN_IVFFLAT == index_type_) { + auto num_bits = CHAR_BIT * sizeof(float); + dim_on_storage = (dim_on_storage + num_bits - 1) / num_bits; + } + +#ifdef KNOWHERE_WITH_DISKANN + if (IndexEnum::INDEX_DISKANN == index_type_) { + sanity_check_config = GenSanityCheckDiskANNConfig(sanity_check_config); + Prepare(sanity_check_config); + } +#endif + std::vector query_data(dim_on_storage, 0); + auto query_dataset = GenDataset(kSanityCheckNumberOfQueries, Dim(), query_data.data()); + Query(query_dataset, sanity_check_config, nullptr); } virtual void diff --git a/knowhere/index/vector_index/IndexAnnoy.cpp b/knowhere/index/vector_index/IndexAnnoy.cpp index c1fe5ed09..eea236e99 100644 --- a/knowhere/index/vector_index/IndexAnnoy.cpp +++ b/knowhere/index/vector_index/IndexAnnoy.cpp @@ -79,29 +79,45 @@ IndexAnnoy::Load(const BinarySet& index_binary) { } void -IndexAnnoy::BuildAll(const DatasetPtr& dataset_ptr, const Config& config) { - if (index_) { - // it is builded all - LOG_KNOWHERE_DEBUG_ << "IndexAnnoy::BuildAll: index_ has been built!"; - return; +IndexAnnoy::Train(const DatasetPtr& dataset_ptr, const Config& config) { + try { + GET_TENSOR_DATA_DIM(dataset_ptr) + metric_type_ = GetMetaMetricType(config); + if (metric_type_ == metric::L2) { + index_ = + std::make_shared>(dim); + } else if (metric_type_ == metric::IP) { + index_ = + std::make_shared>(dim); + } else { + KNOWHERE_THROW_MSG("metric not supported " + metric_type_); + } + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); } + is_build_ = false; +} - GET_TENSOR_DATA_DIM(dataset_ptr) +void +IndexAnnoy::AddWithoutIds(const DatasetPtr& dataset_ptr, const Config& config) { + if (!index_) { + KNOWHERE_THROW_MSG("index not initialize"); + } - utils::SetBuildOmpThread(config); - metric_type_ = GetMetaMetricType(config); - if (metric_type_ == metric::L2) { - index_ = std::make_shared>(dim); - } else if (metric_type_ == metric::IP) { - index_ = std::make_shared>(dim); - } else { - KNOWHERE_THROW_MSG("metric not supported " + metric_type_); + // Annoy does not support `add` function, multiple calls will be ignored, same behaviour as before + if (is_build_) { + LOG_KNOWHERE_DEBUG_ << "IndexAnnoy::AddWithoutIds: index_ has been built! " + << "Annoy not support build item dynamically, please invoke BuildAll interface."; + return; } + GET_TENSOR_DATA_DIM(dataset_ptr) + utils::SetBuildOmpThread(config); for (int i = 0; i < rows; ++i) { index_->add_item(i, static_cast(p_data) + dim * i); } index_->build(GetIndexParamNtrees(config)); + is_build_ = true; } DatasetPtr diff --git a/knowhere/index/vector_index/IndexAnnoy.h b/knowhere/index/vector_index/IndexAnnoy.h index 23c76ae19..7ef3abb00 100644 --- a/knowhere/index/vector_index/IndexAnnoy.h +++ b/knowhere/index/vector_index/IndexAnnoy.h @@ -38,17 +38,10 @@ class IndexAnnoy : public VecIndex { Load(const BinarySet&) override; void - BuildAll(const DatasetPtr&, const Config&) override; + Train(const DatasetPtr&, const Config&) override; void - Train(const DatasetPtr&, const Config&) override { - KNOWHERE_THROW_MSG("Annoy not support build item dynamically, please invoke BuildAll interface."); - } - - void - AddWithoutIds(const DatasetPtr&, const Config&) override { - KNOWHERE_THROW_MSG("Incremental index is not supported"); - } + AddWithoutIds(const DatasetPtr&, const Config&) override; DatasetPtr GetVectorById(const DatasetPtr&, const Config&) override; @@ -66,6 +59,7 @@ class IndexAnnoy : public VecIndex { Size() override; private: + bool is_build_ = false; std::string metric_type_; std::shared_ptr pool_; std::shared_ptr> index_ = nullptr; diff --git a/knowhere/index/vector_index/IndexDiskANNConfig.cpp b/knowhere/index/vector_index/IndexDiskANNConfig.cpp index 23ca47ea5..3998c7cf2 100644 --- a/knowhere/index/vector_index/IndexDiskANNConfig.cpp +++ b/knowhere/index/vector_index/IndexDiskANNConfig.cpp @@ -290,4 +290,14 @@ void DiskANNQueryByRangeConfig::Set(Config& config, const DiskANNQueryByRangeConfig& query_conf) { config[kDiskANNQueryByRangeConfig] = query_conf; } + +const DiskANNPrepareConfig kSanityCheckDiskANNPrepareConfig; // use default +const DiskANNQueryConfig kSanityCheckDiskANNQueryConfig{kSanityCheckMinTopK, kSanityCheckMinTopK}; + +Config GenSanityCheckDiskANNConfig(const Config& build_config) { + Config config = build_config; + DiskANNPrepareConfig::Set(config, kSanityCheckDiskANNPrepareConfig); + DiskANNQueryConfig::Set(config, kSanityCheckDiskANNQueryConfig); + return config; +} } // namespace knowhere diff --git a/knowhere/index/vector_index/IndexDiskANNConfig.h b/knowhere/index/vector_index/IndexDiskANNConfig.h index 05ae1dfdc..3929c5419 100644 --- a/knowhere/index/vector_index/IndexDiskANNConfig.h +++ b/knowhere/index/vector_index/IndexDiskANNConfig.h @@ -121,4 +121,5 @@ struct DiskANNQueryByRangeConfig { Set(Config& config, const DiskANNQueryByRangeConfig& query_conf); }; +Config GenSanityCheckDiskANNConfig(const Config& build_config); } // namespace knowhere diff --git a/knowhere/index/vector_index/helpers/IndexParameter.h b/knowhere/index/vector_index/helpers/IndexParameter.h index ffc9b731b..5d2a215cd 100644 --- a/knowhere/index/vector_index/helpers/IndexParameter.h +++ b/knowhere/index/vector_index/helpers/IndexParameter.h @@ -199,4 +199,14 @@ GetFaissMetricType(const Config& cfg) { return GetFaissMetricType(GetMetaMetricType(cfg)); } +constexpr int64_t kSanityCheckMinTopK = 1; + +inline Config GenSanityCheckConfig(const Config& build_config) { + Config config = build_config; + SetMetaTopk(config, kSanityCheckMinTopK); + SetIndexParamEf(config, kSanityCheckMinTopK); + SetIndexParamNprobe(config, kSanityCheckMinTopK); + SetIndexParamSearchK(config, kSanityCheckMinTopK); + return config; +} } // namespace knowhere diff --git a/knowhere/index/vector_offset_index/IndexIVF_NM.cpp b/knowhere/index/vector_offset_index/IndexIVF_NM.cpp index 8d440e225..7e24dbe9a 100644 --- a/knowhere/index/vector_offset_index/IndexIVF_NM.cpp +++ b/knowhere/index/vector_offset_index/IndexIVF_NM.cpp @@ -61,12 +61,19 @@ void IVF_NM::Load(const BinarySet& binary_set) { LoadImpl(binary_set, index_type_); - // Construct arranged data from original data auto binary = binary_set.GetByName(RAW_DATA); auto ivf_index = static_cast(index_.get()); auto invlists = ivf_index->invlists; auto d = ivf_index->d; size_t nb = binary->size / invlists->code_size; + ArrangeData(nb, binary->data.get()); +} + +void +IVF_NM::ArrangeData(const size_t n, const uint8_t* data) { + auto ivf_index = static_cast(index_.get()); + auto invlists = ivf_index->invlists; + auto d = ivf_index->d; ivf_index->prefix_sum.resize(invlists->nlist + 1); size_t curr_index = 0; @@ -78,12 +85,12 @@ IVF_NM::Load(const BinarySet& binary_set) { #ifndef KNOWHERE_GPU_VERSION auto ails = dynamic_cast(invlists); - ivf_index->arranged_codes.resize(d * nb * sizeof(float)); + ivf_index->arranged_codes.resize(d * n * sizeof(float)); for (size_t i = 0; i < invlists->nlist; i++) { auto list_size = ails->ids[i].size(); for (size_t j = 0; j < list_size; j++) { memcpy(ivf_index->arranged_codes.data() + d * (curr_index + j) * sizeof(float), - binary->data.get() + d * ails->ids[i][j] * sizeof(float), d * sizeof(float)); + data + d * ails->ids[i][j] * sizeof(float), d * sizeof(float)); } ivf_index->prefix_sum[i] = curr_index; curr_index += list_size; @@ -98,7 +105,7 @@ IVF_NM::Load(const BinarySet& binary_set) { auto list_size = lengths[i]; for (size_t j = 0; j < list_size; j++) { memcpy(arranged_data + d * (curr_index + j), - binary->data.get() + d * rol_ids[curr_index + j] * sizeof(float), + data + d * rol_ids[curr_index + j] * sizeof(float), d * sizeof(float)); } ivf_index->prefix_sum[i] = curr_index; @@ -136,6 +143,7 @@ IVF_NM::AddWithoutIds(const DatasetPtr& dataset_ptr, const Config& config) { GET_TENSOR_DATA(dataset_ptr) index_->add_without_codes(rows, reinterpret_cast(p_data)); + ArrangeData(rows, reinterpret_cast(p_data)); } DatasetPtr diff --git a/knowhere/index/vector_offset_index/IndexIVF_NM.h b/knowhere/index/vector_offset_index/IndexIVF_NM.h index 5900ae2d8..6c64e0470 100644 --- a/knowhere/index/vector_offset_index/IndexIVF_NM.h +++ b/knowhere/index/vector_offset_index/IndexIVF_NM.h @@ -104,6 +104,9 @@ class IVF_NM : public VecIndex, public OffsetBaseIndex { // ro_codes: if GPU, hold a ptr of read only codes so that destruction won't be done twice faiss::PageLockMemoryPtr ro_codes_ = nullptr; + + private: + void ArrangeData(const size_t n, const uint8_t* data); }; using IVFNMPtr = std::shared_ptr; diff --git a/unittest/test_annoy.cpp b/unittest/test_annoy.cpp index 01e7f53ec..2fbcd1621 100644 --- a/unittest/test_annoy.cpp +++ b/unittest/test_annoy.cpp @@ -46,10 +46,8 @@ TEST_P(AnnoyTest, annoy_basic) { // null faiss index { - ASSERT_ANY_THROW(index_->Train(base_dataset, conf_)); ASSERT_ANY_THROW(index_->Query(query_dataset, conf_, nullptr)); ASSERT_ANY_THROW(index_->Serialize(conf_)); - ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_)); ASSERT_ANY_THROW(index_->Count()); ASSERT_ANY_THROW(index_->Dim()); } diff --git a/unittest/test_async.cpp b/unittest/test_async.cpp index f81f77c00..3b2099dc8 100644 --- a/unittest/test_async.cpp +++ b/unittest/test_async.cpp @@ -63,7 +63,7 @@ TEST_P(AsyncIndexTest, async_query_thread_num) { index_->BuildAll(base_dataset, conf_); int32_t num_threads_after_build = knowhere::threadchecker::GetThreadNum(pid); EXPECT_GE(knowhere::threadchecker::GetBuildOmpThread(conf_), - num_threads_after_build - num_threads_before_build + 1); + num_threads_after_build - num_threads_before_build); for (int i = 0; i < kQuerySum; i++) { index_->QueryAsync(query_dataset, conf_, nullptr); }