From 8b3a9295cf2f5160bbfc0317f905b5e26e3a5625 Mon Sep 17 00:00:00 2001 From: Cai Yudong Date: Tue, 24 May 2022 10:42:27 +0800 Subject: [PATCH] Optimize benchmark code structure (#190) Signed-off-by: yudong.cai --- unittest/benchmark/benchmark_faiss_test.cpp | 368 ++-------------- .../benchmark/benchmark_knowhere_perf.cpp | 401 +++--------------- .../benchmark/benchmark_knowhere_test.cpp | 398 +++-------------- unittest/benchmark/benchmark_sift.h | 297 +++++++++++++ .../{ => ref_log}/benchmark_faiss_ref.log | 0 .../{ => ref_log}/benchmark_knowhere_ref.log | 0 6 files changed, 442 insertions(+), 1022 deletions(-) create mode 100644 unittest/benchmark/benchmark_sift.h rename unittest/benchmark/{ => ref_log}/benchmark_faiss_ref.log (100%) rename unittest/benchmark/{ => ref_log}/benchmark_knowhere_ref.log (100%) diff --git a/unittest/benchmark/benchmark_faiss_test.cpp b/unittest/benchmark/benchmark_faiss_test.cpp index c92eff5a6..4d0bd9bf9 100644 --- a/unittest/benchmark/benchmark_faiss_test.cpp +++ b/unittest/benchmark/benchmark_faiss_test.cpp @@ -9,284 +9,35 @@ // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License. -#include -#include -#include -#include -#include - #include #include #include +#include -#include "unittest/utils.h" - -/***************************************************** - * To run this test, please download the HDF5 from - * https://support.hdfgroup.org/ftp/HDF5/releases/ - * and install it to /usr/local/hdf5 . - *****************************************************/ -#define DEBUG_VERBOSE 0 - -const char HDF5_POSTFIX[] = ".hdf5"; -const char HDF5_DATASET_TRAIN[] = "train"; -const char HDF5_DATASET_TEST[] = "test"; -const char HDF5_DATASET_NEIGHBORS[] = "neighbors"; -const char HDF5_DATASET_DISTANCES[] = "distances"; - -enum QueryMode { MODE_CPU = 0, MODE_MIX, MODE_GPU }; - -double elapsed() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return tv.tv_sec + tv.tv_usec * 1e-6; -} - -void normalize(float* arr, int32_t nq, int32_t dim) { - for (int32_t i = 0; i < nq; i++) { - double vecLen = 0.0, inv_vecLen = 0.0; - for (int32_t j = 0; j < dim; j++) { - double val = arr[i * dim + j]; - vecLen += val * val; - } - inv_vecLen = 1.0 / std::sqrt(vecLen); - for (int32_t j = 0; j < dim; j++) { - arr[i * dim + j] = (float)(arr[i * dim + j] * inv_vecLen); - } - } -} - -void* hdf5_read( - const std::string& file_name, - const std::string& dataset_name, - H5T_class_t dataset_class, - int32_t& d_out, - int32_t& n_out) { - - hid_t file, dataset, datatype, dataspace, memspace; - H5T_class_t t_class; /* data type class */ - hsize_t dimsm[3]; /* memory space dimensions */ - hsize_t dims_out[2]; /* dataset dimensions */ - hsize_t count[2]; /* size of the hyperslab in the file */ - hsize_t offset[2]; /* hyperslab offset in the file */ - hsize_t count_out[3]; /* size of the hyperslab in memory */ - hsize_t offset_out[3]; /* hyperslab offset in memory */ - void* data_out = nullptr; /* output buffer */ - - /* Open the file and the dataset. */ - file = H5Fopen(file_name.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); - dataset = H5Dopen2(file, dataset_name.c_str(), H5P_DEFAULT); - - /* Get datatype and dataspace handles and then query - * dataset class, order, size, rank and dimensions. */ - datatype = H5Dget_type(dataset); /* datatype handle */ - t_class = H5Tget_class(datatype); - assert(t_class == dataset_class || !"Illegal dataset class type"); - - dataspace = H5Dget_space(dataset); /* dataspace handle */ - H5Sget_simple_extent_dims(dataspace, dims_out, nullptr); - n_out = dims_out[0]; - d_out = dims_out[1]; - - /* Define hyperslab in the dataset. */ - offset[0] = offset[1] = 0; - count[0] = dims_out[0]; - count[1] = dims_out[1]; - H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, offset, nullptr, count, nullptr); - - /* Define the memory dataspace. */ - dimsm[0] = dims_out[0]; - dimsm[1] = dims_out[1]; - dimsm[2] = 1; - memspace = H5Screate_simple(3, dimsm, nullptr); - - /* Define memory hyperslab. */ - offset_out[0] = offset_out[1] = offset_out[2] = 0; - count_out[0] = dims_out[0]; - count_out[1] = dims_out[1]; - count_out[2] = 1; - H5Sselect_hyperslab(memspace, H5S_SELECT_SET, offset_out, nullptr, count_out, nullptr); - - /* Read data from hyperslab in the file into the hyperslab in memory and display. */ - switch (t_class) { - case H5T_INTEGER: - data_out = new int[dims_out[0] * dims_out[1]]; - H5Dread(dataset, H5T_NATIVE_INT, memspace, dataspace, H5P_DEFAULT, data_out); - break; - case H5T_FLOAT: - data_out = new float[dims_out[0] * dims_out[1]]; - H5Dread(dataset, H5T_NATIVE_FLOAT, memspace, dataspace, H5P_DEFAULT, data_out); - break; - default: - printf("Illegal dataset class type\n"); - break; - } - - /* Close/release resources. */ - H5Tclose(datatype); - H5Dclose(dataset); - H5Sclose(dataspace); - H5Sclose(memspace); - H5Fclose(file); - - return data_out; -} - -#if DEBUG_VERBOSE -void -print_array(const char* header, bool is_integer, const void* arr, int32_t nq, int32_t k) { - const int ROW = 10; - const int COL = 10; - assert(ROW <= nq); - assert(COL <= k); - printf("%s\n", header); - printf("==============================================\n"); - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - if (is_integer) { - printf("%7ld ", ((int64_t*)arr)[i * k + j]); - } else { - printf("%.6f ", ((float*)arr)[i * k + j]); - } - } - printf("\n"); - } - printf("\n"); -} -#endif +#include -/************************************************************************************ - * https://github.com/erikbern/ann-benchmarks - * - * Dataset Dimensions Train_size Test_size Neighbors Distance Download - * Fashion- - MNIST 784 60,000 10,000 100 Euclidean HDF5 (217MB) - * GIST 960 1,000,000 1,000 100 Euclidean HDF5 (3.6GB) - * GloVe 100 1,183,514 10,000 100 Angular HDF5 (463MB) - * GloVe 200 1,183,514 10,000 100 Angular HDF5 (918MB) - * MNIST 784 60,000 10,000 100 Euclidean HDF5 (217MB) - * NYTimes 256 290,000 10,000 100 Angular HDF5 (301MB) - * SIFT 128 1,000,000 10,000 100 Euclidean HDF5 (501MB) - *************************************************************************************/ +#include "unittest/benchmark/benchmark_sift.h" +#include "unittest/utils.h" -using idx_t = faiss::Index::idx_t; -using distance_t = faiss::Index::distance_t; +#define CALC_TIME_SPAN(X) \ + double t_start = elapsed(); \ + X; \ + double t_diff = elapsed() - t_start; -class Benchmark_faiss : public ::testing::Test { +class Benchmark_faiss : public Benchmark_sift { public: - double get_time_diff() { - return elapsed() - T0_; - } - - bool parse_ann_test_name() { - size_t pos1, pos2; - - if (ann_test_name_.empty()) { - return false; - } - - pos1 = ann_test_name_.find_first_of('-', 0); - if (pos1 == std::string::npos) { - return false; - } - pos2 = ann_test_name_.find_first_of('-', pos1 + 1); - if (pos2 == std::string::npos) { - return false; - } - - dim_ = std::stoi(ann_test_name_.substr(pos1 + 1, pos2 - pos1 - 1)); - std::string metric_str = ann_test_name_.substr(pos2 + 1); - if (metric_str == "angular") { - metric_type_ = faiss::METRIC_INNER_PRODUCT; - } else if (metric_str == "euclidean") { - metric_type_ = faiss::METRIC_L2; - } else { - return false; - } - - return true; - } - - int32_t CalcRecall(const idx_t* ids, int32_t nq, int32_t k) { - int32_t min_k = std::min(gt_k_, k); - int32_t hit = 0; - for (int32_t i = 0; i < nq; i++) { - std::unordered_set ground(gt_ids_ + i * gt_k_, gt_ids_ + i * gt_k_ + min_k); - for (int32_t j = 0; j < min_k; j++) { - idx_t id = ids[i * k + j]; - if (ground.count(id) > 0) { - hit++; - } - } - } - return hit; - } - - void load_base_data() { - const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; - - int32_t dim; - printf("[%.3f s] Loading HDF5 file: %s\n", get_time_diff(), ann_file_name.c_str()); - xb_ = (float*)hdf5_read(ann_file_name, HDF5_DATASET_TRAIN, H5T_FLOAT, dim, nb_); - assert(dim == dim_ || !"dataset does not have correct dimension"); - - if (metric_type_ == faiss::METRIC_INNER_PRODUCT) { - printf("[%.3f s] Normalizing base data set \n", get_time_diff()); - normalize(xb_, nb_, dim_); - } - } - - void load_query_data() { - const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; - - int32_t dim; - xq_ = (float*)hdf5_read(ann_file_name, HDF5_DATASET_TEST, H5T_FLOAT, dim, nq_); - assert(dim == dim_ || !"query does not have same dimension as train set"); - - if (metric_type_ == faiss::METRIC_INNER_PRODUCT) { - printf("[%.3f s] Normalizing query data \n", get_time_diff()); - normalize(xq_, nq_, dim_); - } - } - - void load_ground_truth() { - const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; - - // load ground-truth and convert int to long - int32_t gt_nq; - int* gt_int = (int*)hdf5_read(ann_file_name, HDF5_DATASET_NEIGHBORS, H5T_INTEGER, gt_k_, gt_nq); - assert(gt_nq == nq_ || !"incorrect nb of ground truth index"); - - gt_ids_ = new idx_t[gt_k_ * nq_]; - for (int32_t i = 0; i < gt_k_ * nq_; i++) { - gt_ids_[i] = gt_int[i]; - } - delete[] gt_int; - -#if DEBUG_VERBOSE - faiss::Index::distance_t* gt_dist; // nq * k matrix of ground-truth nearest-neighbors distances - gt_dist = (float*)hdf5_read(ann_file_name, HDF5_DATASET_DISTANCES, H5T_FLOAT, k, nq2); - assert(nq2 == nq || !"incorrect nb of ground truth distance"); - - std::string str; - str = ann_test_name + " ground truth index"; - print_array(str.c_str(), true, gt, nq, k); - str = ann_test_name + " ground truth distance"; - print_array(str.c_str(), false, gt_dist, nq, k); - - delete gt_dist; -#endif - } - - void write_index(const std::string& filename) { + void + write_index(const std::string& filename) { faiss::write_index(index_, filename.c_str()); } - void read_index(const std::string& filename) { + void + read_index(const std::string& filename) { index_ = faiss::read_index(filename.c_str()); } - void create_cpu_index(const std::string& index_file_name) { + void + create_cpu_index(const std::string& index_file_name) { try { printf("[%.3f s] Reading index file: %s\n", get_time_diff(), index_file_name.c_str()); read_index(index_file_name); @@ -305,22 +56,18 @@ class Benchmark_faiss : public ::testing::Test { } } - void test_idmap() { + void + test_idmap() { idx_t* I = new idx_t[NQs_.back() * TOPKs_.back()]; distance_t* D = new distance_t[NQs_.back() * TOPKs_.back()]; - printf("\n[%0.3f s] %s | %s \n", - get_time_diff(), ann_test_name_.c_str(), index_key_.c_str()); + printf("\n[%0.3f s] %s | %s \n", get_time_diff(), ann_test_name_.c_str(), index_key_.c_str()); printf("================================================================================\n"); for (auto nq : NQs_) { for (auto k : TOPKs_) { - double t_start = elapsed(), t_end; - index_->search(nq, xq_, k, D, I); - t_end = elapsed(); - - int32_t hit = CalcRecall(I, nq, k); - printf(" nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", - nq, k, (t_end - t_start), (hit / float(nq * std::min(gt_k_, k)))); + CALC_TIME_SPAN(index_->search(nq, xq_, k, D, I)); + float recall = CalcRecall(I, nq, k); + printf(" nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", nq, k, t_diff, recall); } } printf("================================================================================\n"); @@ -330,12 +77,12 @@ class Benchmark_faiss : public ::testing::Test { delete[] D; } - void test_ivf(const int32_t nlist) { + void + test_ivf(const int32_t nlist) { idx_t* I = new idx_t[NQs_.back() * TOPKs_.back()]; distance_t* D = new distance_t[NQs_.back() * TOPKs_.back()]; - printf("\n[%0.3f s] %s | %s | nlist=%d\n", - get_time_diff(), ann_test_name_.c_str(), index_key_.c_str(), nlist); + printf("\n[%0.3f s] %s | %s | nlist=%d\n", get_time_diff(), ann_test_name_.c_str(), index_key_.c_str(), nlist); printf("================================================================================\n"); for (auto nprobe : NPROBEs_) { faiss::ParameterSpace params; @@ -343,13 +90,10 @@ class Benchmark_faiss : public ::testing::Test { params.set_index_parameters(index_, nprobe_str.c_str()); for (auto nq : NQs_) { for (auto k : TOPKs_) { - double t_start = elapsed(), t_end; - index_->search(nq, xq_, k, D, I); - t_end = elapsed(); - - int32_t hit = CalcRecall(I, nq, k); - printf(" nprobe = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", - nprobe, nq, k, (t_end - t_start), (hit / float(nq * std::min(gt_k_, k)))); + CALC_TIME_SPAN(index_->search(nq, xq_, k, D, I)); + float recall = CalcRecall(I, nq, k); + printf(" nprobe = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", nprobe, nq, k, t_diff, + recall); } } } @@ -360,23 +104,20 @@ class Benchmark_faiss : public ::testing::Test { delete[] D; } - void test_hnsw(const int64_t M, const int64_t efConstruction) { + void + test_hnsw(const int64_t M, const int64_t efConstruction) { idx_t* I = new idx_t[NQs_.back() * TOPKs_.back()]; distance_t* D = new distance_t[NQs_.back() * TOPKs_.back()]; - printf("\n[%0.3f s] %s | %s | M=%ld | efConstruction=%ld\n", - get_time_diff(), ann_test_name_.c_str(), index_key_.c_str(), M, efConstruction); + printf("\n[%0.3f s] %s | %s | M=%ld | efConstruction=%ld\n", get_time_diff(), ann_test_name_.c_str(), + index_key_.c_str(), M, efConstruction); printf("================================================================================\n"); - for (auto ef: EFs_) { + for (auto ef : EFs_) { for (auto nq : NQs_) { for (auto k : TOPKs_) { - double t_start = elapsed(), t_end; - index_->search(nq_, xq_, k, D, I); - t_end = elapsed(); - - int32_t hit = CalcRecall(I, nq, k); - printf(" ef = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", - ef, nq, k, (t_end - t_start), (hit / float(nq * std::min(gt_k_, k)))); + CALC_TIME_SPAN(index_->search(nq_, xq_, k, D, I)); + float recall = CalcRecall(I, nq, k); + printf(" ef = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", ef, nq, k, t_diff, recall); } } } @@ -388,43 +129,16 @@ class Benchmark_faiss : public ::testing::Test { } protected: - void SetUp() override { - T0_ = elapsed(); - - if (!parse_ann_test_name()) { - assert(true); - } - - printf("[%.3f s] Loading base data\n", get_time_diff()); - load_base_data(); - - printf("[%.3f s] Loading queries\n", get_time_diff()); - load_query_data(); - - printf("[%.3f s] Loading ground truth\n", get_time_diff()); - load_ground_truth(); - + void + SetUp() override { + set_ann_test_name("sift-128-euclidean"); + Benchmark_sift::SetUp(); + metric_type_ = (metric_str_ == METRIC_IP_STR) ? faiss::METRIC_INNER_PRODUCT : faiss::METRIC_L2; knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AUTO); } - void TearDown() override { - delete[] xb_; - delete[] xq_; - delete[] gt_ids_; - } - protected: - double T0_; - std::string ann_test_name_ = "sift-128-euclidean"; faiss::MetricType metric_type_; - int32_t dim_; - int32_t nb_; - int32_t nq_; - int32_t gt_k_; - distance_t* xb_; - distance_t* xq_; - idx_t* gt_ids_; // ground-truth index - std::string index_key_; faiss::Index* index_ = nullptr; diff --git a/unittest/benchmark/benchmark_knowhere_perf.cpp b/unittest/benchmark/benchmark_knowhere_perf.cpp index 4cb8b9cca..99282186b 100644 --- a/unittest/benchmark/benchmark_knowhere_perf.cpp +++ b/unittest/benchmark/benchmark_knowhere_perf.cpp @@ -10,278 +10,24 @@ // or implied. See the License for the specific language governing permissions and limitations under the License. #include -#include -#include + #include -#include #include "knowhere/index/IndexType.h" #include "knowhere/index/VecIndexFactory.h" #include "knowhere/index/vector_index/adapter/VectorAdapter.h" +#include "unittest/benchmark/benchmark_sift.h" #include "unittest/utils.h" -/***************************************************** - * To run this test, please download the HDF5 from - * https://support.hdfgroup.org/ftp/HDF5/releases/ - * and install it to /usr/local/hdf5 . - *****************************************************/ -#define DEBUG_VERBOSE 0 - -const char HDF5_POSTFIX[] = ".hdf5"; -const char HDF5_DATASET_TRAIN[] = "train"; -const char HDF5_DATASET_TEST[] = "test"; -const char HDF5_DATASET_NEIGHBORS[] = "neighbors"; -const char HDF5_DATASET_DISTANCES[] = "distances"; - -enum QueryMode { MODE_CPU = 0, MODE_MIX, MODE_GPU }; - -double elapsed() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return tv.tv_sec + tv.tv_usec * 1e-6; -} - -void normalize(float* arr, int32_t nq, int32_t dim) { - for (int32_t i = 0; i < nq; i++) { - double vecLen = 0.0, inv_vecLen = 0.0; - for (int32_t j = 0; j < dim; j++) { - double val = arr[i * dim + j]; - vecLen += val * val; - } - inv_vecLen = 1.0 / std::sqrt(vecLen); - for (int32_t j = 0; j < dim; j++) { - arr[i * dim + j] = (float)(arr[i * dim + j] * inv_vecLen); - } - } -} - -void* hdf5_read( - const std::string& file_name, - const std::string& dataset_name, - H5T_class_t dataset_class, - int32_t& d_out, - int32_t& n_out) { - - hid_t file, dataset, datatype, dataspace, memspace; - H5T_class_t t_class; /* data type class */ - hsize_t dimsm[3]; /* memory space dimensions */ - hsize_t dims_out[2]; /* dataset dimensions */ - hsize_t count[2]; /* size of the hyperslab in the file */ - hsize_t offset[2]; /* hyperslab offset in the file */ - hsize_t count_out[3]; /* size of the hyperslab in memory */ - hsize_t offset_out[3]; /* hyperslab offset in memory */ - void* data_out = nullptr; /* output buffer */ - - /* Open the file and the dataset. */ - file = H5Fopen(file_name.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); - dataset = H5Dopen2(file, dataset_name.c_str(), H5P_DEFAULT); - - /* Get datatype and dataspace handles and then query - * dataset class, order, size, rank and dimensions. */ - datatype = H5Dget_type(dataset); /* datatype handle */ - t_class = H5Tget_class(datatype); - assert(t_class == dataset_class || !"Illegal dataset class type"); - - dataspace = H5Dget_space(dataset); /* dataspace handle */ - H5Sget_simple_extent_dims(dataspace, dims_out, nullptr); - n_out = dims_out[0]; - d_out = dims_out[1]; - - /* Define hyperslab in the dataset. */ - offset[0] = offset[1] = 0; - count[0] = dims_out[0]; - count[1] = dims_out[1]; - H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, offset, nullptr, count, nullptr); - - /* Define the memory dataspace. */ - dimsm[0] = dims_out[0]; - dimsm[1] = dims_out[1]; - dimsm[2] = 1; - memspace = H5Screate_simple(3, dimsm, nullptr); - - /* Define memory hyperslab. */ - offset_out[0] = offset_out[1] = offset_out[2] = 0; - count_out[0] = dims_out[0]; - count_out[1] = dims_out[1]; - count_out[2] = 1; - H5Sselect_hyperslab(memspace, H5S_SELECT_SET, offset_out, nullptr, count_out, nullptr); - - /* Read data from hyperslab in the file into the hyperslab in memory and display. */ - switch (t_class) { - case H5T_INTEGER: - data_out = new int[dims_out[0] * dims_out[1]]; - H5Dread(dataset, H5T_NATIVE_INT, memspace, dataspace, H5P_DEFAULT, data_out); - break; - case H5T_FLOAT: - data_out = new float[dims_out[0] * dims_out[1]]; - H5Dread(dataset, H5T_NATIVE_FLOAT, memspace, dataspace, H5P_DEFAULT, data_out); - break; - default: - printf("Illegal dataset class type\n"); - break; - } - - /* Close/release resources. */ - H5Tclose(datatype); - H5Dclose(dataset); - H5Sclose(dataspace); - H5Sclose(memspace); - H5Fclose(file); - - return data_out; -} +#define CALC_TIME_SPAN(X) \ + double t_start = elapsed(); \ + X; \ + double t_diff = elapsed() - t_start; -#if DEBUG_VERBOSE -void -print_array(const char* header, bool is_integer, const void* arr, int32_t nq, int32_t k) { - const int ROW = 10; - const int COL = 10; - assert(ROW <= nq); - assert(COL <= k); - printf("%s\n", header); - printf("==============================================\n"); - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - if (is_integer) { - printf("%7ld ", ((int64_t*)arr)[i * k + j]); - } else { - printf("%.6f ", ((float*)arr)[i * k + j]); - } - } - printf("\n"); - } - printf("\n"); -} -#endif - -/************************************************************************************ - * https://github.com/erikbern/ann-benchmarks - * - * Dataset Dimensions Train_size Test_size Neighbors Distance Download - * Fashion- - MNIST 784 60,000 10,000 100 Euclidean HDF5 (217MB) - * GIST 960 1,000,000 1,000 100 Euclidean HDF5 (3.6GB) - * GloVe 100 1,183,514 10,000 100 Angular HDF5 (463MB) - * GloVe 200 1,183,514 10,000 100 Angular HDF5 (918MB) - * MNIST 784 60,000 10,000 100 Euclidean HDF5 (217MB) - * NYTimes 256 290,000 10,000 100 Angular HDF5 (301MB) - * SIFT 128 1,000,000 10,000 100 Euclidean HDF5 (501MB) - *************************************************************************************/ - -using idx_t = int64_t; -using distance_t = float; - -class Benchmark_knowhere_perf : public ::testing::Test { +class Benchmark_knowhere_perf : public Benchmark_sift { public: - double get_time_diff() { - return elapsed() - T0_; - } - - bool parse_ann_test_name() { - size_t pos1, pos2; - - if (ann_test_name_.empty()) { - return false; - } - - pos1 = ann_test_name_.find_first_of('-', 0); - if (pos1 == std::string::npos) { - return false; - } - pos2 = ann_test_name_.find_first_of('-', pos1 + 1); - if (pos2 == std::string::npos) { - return false; - } - - dim_ = std::stoi(ann_test_name_.substr(pos1 + 1, pos2 - pos1 - 1)); - std::string metric_str = ann_test_name_.substr(pos2 + 1); - if (metric_str == "angular") { - metric_type_ = knowhere::metric::IP; - } else if (metric_str == "euclidean") { - metric_type_ = knowhere::metric::L2; - } else { - return false; - } - - return true; - } - - int32_t CalcRecall(const idx_t* ids, int32_t nq_start, int32_t step, int32_t k) { - assert(nq_start + step <= 10000); - int32_t min_k = std::min(gt_k_, k); - int32_t hit = 0; - for (int32_t i = 0; i < step; i++) { - std::unordered_set ground(gt_ids_ + (i + nq_start) * gt_k_, gt_ids_ + (i + nq_start) * gt_k_ + min_k); - for (int32_t j = 0; j < min_k; j++) { - idx_t id = ids[i * k + j]; - if (ground.count(id) > 0) { - hit++; - } - } - } - return hit; - } - - void load_base_data() { - const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; - - int32_t dim; - printf("[%.3f s] Loading HDF5 file: %s\n", get_time_diff(), ann_file_name.c_str()); - xb_ = (float*)hdf5_read(ann_file_name, HDF5_DATASET_TRAIN, H5T_FLOAT, dim, nb_); - assert(dim == dim_ || !"dataset does not have correct dimension"); - - if (metric_type_ == knowhere::metric::IP) { - printf("[%.3f s] Normalizing base data set \n", get_time_diff()); - normalize(xb_, nb_, dim_); - } - } - - void load_query_data() { - const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; - - int32_t dim; - xq_ = (float*)hdf5_read(ann_file_name, HDF5_DATASET_TEST, H5T_FLOAT, dim, nq_); - assert(dim == dim_ || !"query does not have same dimension as train set"); - - if (metric_type_ == knowhere::metric::IP) { - printf("[%.3f s] Normalizing query data \n", get_time_diff()); - normalize(xq_, nq_, dim_); - } - } - - void load_ground_truth() { - const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; - - // load ground-truth and convert int to long - int32_t gt_nq; - int* gt_int = (int*)hdf5_read(ann_file_name, HDF5_DATASET_NEIGHBORS, H5T_INTEGER, gt_k_, gt_nq); - assert(gt_nq == nq_ || !"incorrect nb of ground truth index"); - - gt_ids_ = new idx_t[gt_k_ * nq_]; - for (int32_t i = 0; i < gt_k_ * nq_; i++) { - gt_ids_[i] = gt_int[i]; - } - delete[] gt_int; - -#if DEBUG_VERBOSE - faiss::Index::distance_t* gt_dist; // nq * k matrix of ground-truth nearest-neighbors distances - gt_dist = (float*)hdf5_read(ann_file_name, HDF5_DATASET_DISTANCES, H5T_FLOAT, k, nq2); - assert(nq2 == nq || !"incorrect nb of ground truth distance"); - - std::string str; - str = ann_test_name + " ground truth index"; - print_array(str.c_str(), true, gt, nq, k); - str = ann_test_name + " ground truth distance"; - print_array(str.c_str(), false, gt_dist, nq, k); - - delete gt_dist; -#endif - } - - void write_index( - const std::string& filename, - const knowhere::Config& conf) { - + void + write_index(const std::string& filename, const knowhere::Config& conf) { binary_set_.clear(); FileIOWriter writer(filename); @@ -301,7 +47,8 @@ class Benchmark_knowhere_perf : public ::testing::Test { } } - void read_index(const std::string& filename) { + void + read_index(const std::string& filename) { binary_set_.clear(); FileIOReader reader(filename); @@ -331,7 +78,8 @@ class Benchmark_knowhere_perf : public ::testing::Test { } } - std::string get_index_name(const std::vector& params) { + std::string + get_index_name(const std::vector& params) { std::string params_str = ""; for (size_t i = 0; i < params.size(); i++) { params_str += "_" + std::to_string(params[i]); @@ -339,10 +87,8 @@ class Benchmark_knowhere_perf : public ::testing::Test { return ann_test_name_ + "_" + std::string(index_type_) + params_str + ".index"; } - void create_cpu_index( - const std::string& index_file_name, - const knowhere::Config& conf) { - + void + create_cpu_index(const std::string& index_file_name, const knowhere::Config& conf) { printf("[%.3f s] Creating CPU index \"%s\"\n", get_time_diff(), std::string(index_type_).c_str()); auto& factory = knowhere::VecIndexFactory::GetInstance(); index_ = factory.CreateVecIndex(index_type_); @@ -360,26 +106,22 @@ class Benchmark_knowhere_perf : public ::testing::Test { } } - void test_idmap(const knowhere::Config& cfg) { + void + test_idmap(const knowhere::Config& cfg) { auto conf = cfg; int32_t no = 0; - printf("\n[%0.3f s] %s | %s \n", - get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str()); + printf("\n[%0.3f s] %s | %s \n", get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str()); printf("================================================================================\n"); for (int32_t i = 0; i + NQ_STEP_ <= GT_NQ_; i = (i + NQ_STEP_) % GT_NQ_) { knowhere::DatasetPtr ds_ptr = knowhere::GenDataset(NQ_STEP_, dim_, xq_ + (i * dim_)); for (auto k : TOPKs_) { knowhere::SetMetaTopk(conf, k); - - double t_start = elapsed(), t_end; - auto result = index_->Query(ds_ptr, conf, nullptr); - t_end = elapsed(); - + CALC_TIME_SPAN(auto result = index_->Query(ds_ptr, conf, nullptr)); auto ids = knowhere::GetDatasetIDs(result); - int32_t hit = CalcRecall(ids, i, NQ_STEP_, k); - printf(" No.%4d: nq = [%4d, %4d), k = %4d, elapse = %.4fs, R@ = %.4f\n", - no++, i, i + NQ_STEP_, k, (t_end - t_start), (hit / float(NQ_STEP_ * std::min(gt_k_, k)))); + float recall = CalcRecall(ids, i, NQ_STEP_, k); + printf(" No.%4d: nq = [%4d, %4d), k = %4d, elapse = %.4fs, R@ = %.4f\n", no++, i, i + NQ_STEP_, k, + t_diff, recall); } } printf("================================================================================\n"); @@ -387,13 +129,14 @@ class Benchmark_knowhere_perf : public ::testing::Test { std::string(index_type_).c_str()); } - void test_ivf(const knowhere::Config& cfg) { + void + test_ivf(const knowhere::Config& cfg) { auto conf = cfg; auto nlist = knowhere::GetIndexParamNlist(conf); int32_t no = 0; - printf("\n[%0.3f s] %s | %s | nlist=%ld\n", - get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str(), nlist); + printf("\n[%0.3f s] %s | %s | nlist=%ld\n", get_time_diff(), ann_test_name_.c_str(), + std::string(index_type_).c_str(), nlist); printf("================================================================================\n"); for (int32_t i = 0; i + NQ_STEP_ <= GT_NQ_; i = (i + NQ_STEP_) % GT_NQ_) { knowhere::DatasetPtr ds_ptr = knowhere::GenDataset(NQ_STEP_, dim_, xq_ + (i * dim_)); @@ -401,15 +144,11 @@ class Benchmark_knowhere_perf : public ::testing::Test { knowhere::SetIndexParamNprobe(conf, nprobe); for (auto k : TOPKs_) { knowhere::SetMetaTopk(conf, k); - - double t_start = elapsed(), t_end; - auto result = index_->Query(ds_ptr, conf, nullptr); - t_end = elapsed(); - + CALC_TIME_SPAN(auto result = index_->Query(ds_ptr, conf, nullptr)); auto ids = knowhere::GetDatasetIDs(result); - int32_t hit = CalcRecall(ids, i, NQ_STEP_, k); - printf(" No.%4d: nprobe = %4d, nq = [%4d, %4d), k = %4d, elapse = %.4fs, R@ = %.4f\n", - no++, nprobe, i, i + NQ_STEP_, k, (t_end - t_start), (hit / float(NQ_STEP_ * std::min(gt_k_, k)))); + float recall = CalcRecall(ids, i, NQ_STEP_, k); + printf(" No.%4d: nprobe = %4d, nq = [%4d, %4d), k = %4d, elapse = %.4fs, R@ = %.4f\n", no++, + nprobe, i, i + NQ_STEP_, k, t_diff, recall); } } } @@ -418,30 +157,27 @@ class Benchmark_knowhere_perf : public ::testing::Test { std::string(index_type_).c_str()); } - void test_hnsw(const knowhere::Config& cfg) { + void + test_hnsw(const knowhere::Config& cfg) { auto conf = cfg; auto M = knowhere::GetIndexParamHNSWM(conf); auto efConstruction = knowhere::GetIndexParamEfConstruction(conf); int32_t no = 0; - printf("\n[%0.3f s] %s | %s | M=%ld | efConstruction=%ld\n", - get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str(), M, efConstruction); + printf("\n[%0.3f s] %s | %s | M=%ld | efConstruction=%ld\n", get_time_diff(), ann_test_name_.c_str(), + std::string(index_type_).c_str(), M, efConstruction); printf("================================================================================\n"); for (int32_t i = 0; i + NQ_STEP_ <= GT_NQ_; i = (i + NQ_STEP_) % GT_NQ_) { knowhere::DatasetPtr ds_ptr = knowhere::GenDataset(NQ_STEP_, dim_, xq_ + (i * dim_)); - for (auto ef: EFs_) { + for (auto ef : EFs_) { knowhere::SetIndexParamEf(conf, ef); for (auto k : TOPKs_) { knowhere::SetMetaTopk(conf, k); - - double t_start = elapsed(), t_end; - auto result = index_->Query(ds_ptr, conf, nullptr); - t_end = elapsed(); - + CALC_TIME_SPAN(auto result = index_->Query(ds_ptr, conf, nullptr)); auto ids = knowhere::GetDatasetIDs(result); - int32_t hit = CalcRecall(ids, i, NQ_STEP_, k); - printf(" No.%4d: ef = %4d, nq = [%4d, %4d), k = %4d, elapse = %.4fs, R@ = %.4f\n", - no++, ef, i, i + NQ_STEP_, k, (t_end - t_start), (hit / float(NQ_STEP_ * std::min(gt_k_, k)))); + float recall = CalcRecall(ids, i, NQ_STEP_, k); + printf(" No.%4d: ef = %4d, nq = [%4d, %4d), k = %4d, elapse = %.4fs, R@ = %.4f\n", no++, ef, i, + i + NQ_STEP_, k, t_diff, recall); } } } @@ -450,29 +186,26 @@ class Benchmark_knowhere_perf : public ::testing::Test { std::string(index_type_).c_str()); } - void test_annoy(const knowhere::Config& cfg) { + void + test_annoy(const knowhere::Config& cfg) { auto conf = cfg; auto n_trees = knowhere::GetIndexParamNtrees(conf); int32_t no = 0; - printf("\n[%0.3f s] %s | %s | n_trees=%ld \n", - get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str(), n_trees); + printf("\n[%0.3f s] %s | %s | n_trees=%ld \n", get_time_diff(), ann_test_name_.c_str(), + std::string(index_type_).c_str(), n_trees); printf("================================================================================\n"); for (int32_t i = 0; i + NQ_STEP_ <= GT_NQ_; i = (i + NQ_STEP_) % GT_NQ_) { knowhere::DatasetPtr ds_ptr = knowhere::GenDataset(NQ_STEP_, dim_, xq_ + (i * dim_)); - for (auto sk: SEARCH_Ks_) { + for (auto sk : SEARCH_Ks_) { knowhere::SetIndexParamSearchK(conf, sk); for (auto k : TOPKs_) { knowhere::SetMetaTopk(conf, k); - - double t_start = elapsed(), t_end; - auto result = index_->Query(ds_ptr, conf, nullptr); - t_end = elapsed(); - + CALC_TIME_SPAN(auto result = index_->Query(ds_ptr, conf, nullptr)); auto ids = knowhere::GetDatasetIDs(result); - int32_t hit = CalcRecall(ids, i, NQ_STEP_, k); - printf(" No.%4d: search_k = %4d, nq = [%4d, %4d), k = %4d, elapse = %.4fs, R@ = %.4f\n", - no++, sk, i, i + NQ_STEP_, k, (t_end - t_start), (hit / float(NQ_STEP_ * std::min(gt_k_, k)))); + float recall = CalcRecall(ids, i, NQ_STEP_, k); + printf(" No.%4d: search_k = %4d, nq = [%4d, %4d), k = %4d, elapse = %.4fs, R@ = %.4f\n", no++, sk, + i, i + NQ_STEP_, k, t_diff, recall); } } } @@ -482,44 +215,17 @@ class Benchmark_knowhere_perf : public ::testing::Test { } protected: - void SetUp() override { - T0_ = elapsed(); - - if (!parse_ann_test_name()) { - assert(true); - } - - printf("[%.3f s] Loading base data\n", get_time_diff()); - load_base_data(); - - printf("[%.3f s] Loading queries\n", get_time_diff()); - load_query_data(); - - printf("[%.3f s] Loading ground truth\n", get_time_diff()); - load_ground_truth(); - + void + SetUp() override { + set_ann_test_name("sift-128-euclidean"); + Benchmark_sift::SetUp(); + metric_type_ = (metric_str_ == METRIC_IP_STR) ? knowhere::metric::IP : knowhere::metric::L2; knowhere::SetMetaMetricType(cfg_, metric_type_); knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AUTO); } - void TearDown() override { - delete[] xb_; - delete[] xq_; - delete[] gt_ids_; - } - protected: - double T0_; - std::string ann_test_name_ = "sift-128-euclidean"; knowhere::MetricType metric_type_; - int32_t dim_; - int32_t nb_; - int32_t nq_; - int32_t gt_k_; - distance_t* xb_; - distance_t* xq_; - idx_t* gt_ids_; // ground-truth index - knowhere::BinarySet binary_set_; knowhere::IndexType index_type_; knowhere::VecIndexPtr index_ = nullptr; @@ -527,7 +233,6 @@ class Benchmark_knowhere_perf : public ::testing::Test { const int32_t GT_NQ_ = 10000; const int32_t NQ_STEP_ = 10; - const std::vector NQs_ = {10000}; const std::vector TOPKs_ = {10}; // IVF index params diff --git a/unittest/benchmark/benchmark_knowhere_test.cpp b/unittest/benchmark/benchmark_knowhere_test.cpp index 6dbf3d43f..97681950f 100644 --- a/unittest/benchmark/benchmark_knowhere_test.cpp +++ b/unittest/benchmark/benchmark_knowhere_test.cpp @@ -10,277 +10,24 @@ // or implied. See the License for the specific language governing permissions and limitations under the License. #include -#include -#include + #include -#include #include "knowhere/index/IndexType.h" #include "knowhere/index/VecIndexFactory.h" #include "knowhere/index/vector_index/adapter/VectorAdapter.h" +#include "unittest/benchmark/benchmark_sift.h" #include "unittest/utils.h" -/***************************************************** - * To run this test, please download the HDF5 from - * https://support.hdfgroup.org/ftp/HDF5/releases/ - * and install it to /usr/local/hdf5 . - *****************************************************/ -#define DEBUG_VERBOSE 0 - -const char HDF5_POSTFIX[] = ".hdf5"; -const char HDF5_DATASET_TRAIN[] = "train"; -const char HDF5_DATASET_TEST[] = "test"; -const char HDF5_DATASET_NEIGHBORS[] = "neighbors"; -const char HDF5_DATASET_DISTANCES[] = "distances"; - -enum QueryMode { MODE_CPU = 0, MODE_MIX, MODE_GPU }; - -double elapsed() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return tv.tv_sec + tv.tv_usec * 1e-6; -} - -void normalize(float* arr, int32_t nq, int32_t dim) { - for (int32_t i = 0; i < nq; i++) { - double vecLen = 0.0, inv_vecLen = 0.0; - for (int32_t j = 0; j < dim; j++) { - double val = arr[i * dim + j]; - vecLen += val * val; - } - inv_vecLen = 1.0 / std::sqrt(vecLen); - for (int32_t j = 0; j < dim; j++) { - arr[i * dim + j] = (float)(arr[i * dim + j] * inv_vecLen); - } - } -} - -void* hdf5_read( - const std::string& file_name, - const std::string& dataset_name, - H5T_class_t dataset_class, - int32_t& d_out, - int32_t& n_out) { - - hid_t file, dataset, datatype, dataspace, memspace; - H5T_class_t t_class; /* data type class */ - hsize_t dimsm[3]; /* memory space dimensions */ - hsize_t dims_out[2]; /* dataset dimensions */ - hsize_t count[2]; /* size of the hyperslab in the file */ - hsize_t offset[2]; /* hyperslab offset in the file */ - hsize_t count_out[3]; /* size of the hyperslab in memory */ - hsize_t offset_out[3]; /* hyperslab offset in memory */ - void* data_out = nullptr; /* output buffer */ - - /* Open the file and the dataset. */ - file = H5Fopen(file_name.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); - dataset = H5Dopen2(file, dataset_name.c_str(), H5P_DEFAULT); - - /* Get datatype and dataspace handles and then query - * dataset class, order, size, rank and dimensions. */ - datatype = H5Dget_type(dataset); /* datatype handle */ - t_class = H5Tget_class(datatype); - assert(t_class == dataset_class || !"Illegal dataset class type"); - - dataspace = H5Dget_space(dataset); /* dataspace handle */ - H5Sget_simple_extent_dims(dataspace, dims_out, nullptr); - n_out = dims_out[0]; - d_out = dims_out[1]; - - /* Define hyperslab in the dataset. */ - offset[0] = offset[1] = 0; - count[0] = dims_out[0]; - count[1] = dims_out[1]; - H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, offset, nullptr, count, nullptr); - - /* Define the memory dataspace. */ - dimsm[0] = dims_out[0]; - dimsm[1] = dims_out[1]; - dimsm[2] = 1; - memspace = H5Screate_simple(3, dimsm, nullptr); - - /* Define memory hyperslab. */ - offset_out[0] = offset_out[1] = offset_out[2] = 0; - count_out[0] = dims_out[0]; - count_out[1] = dims_out[1]; - count_out[2] = 1; - H5Sselect_hyperslab(memspace, H5S_SELECT_SET, offset_out, nullptr, count_out, nullptr); - - /* Read data from hyperslab in the file into the hyperslab in memory and display. */ - switch (t_class) { - case H5T_INTEGER: - data_out = new int[dims_out[0] * dims_out[1]]; - H5Dread(dataset, H5T_NATIVE_INT, memspace, dataspace, H5P_DEFAULT, data_out); - break; - case H5T_FLOAT: - data_out = new float[dims_out[0] * dims_out[1]]; - H5Dread(dataset, H5T_NATIVE_FLOAT, memspace, dataspace, H5P_DEFAULT, data_out); - break; - default: - printf("Illegal dataset class type\n"); - break; - } - - /* Close/release resources. */ - H5Tclose(datatype); - H5Dclose(dataset); - H5Sclose(dataspace); - H5Sclose(memspace); - H5Fclose(file); - - return data_out; -} +#define CALC_TIME_SPAN(X) \ + double t_start = elapsed(); \ + X; \ + double t_diff = elapsed() - t_start; -#if DEBUG_VERBOSE -void -print_array(const char* header, bool is_integer, const void* arr, int32_t nq, int32_t k) { - const int ROW = 10; - const int COL = 10; - assert(ROW <= nq); - assert(COL <= k); - printf("%s\n", header); - printf("==============================================\n"); - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - if (is_integer) { - printf("%7ld ", ((int64_t*)arr)[i * k + j]); - } else { - printf("%.6f ", ((float*)arr)[i * k + j]); - } - } - printf("\n"); - } - printf("\n"); -} -#endif - -/************************************************************************************ - * https://github.com/erikbern/ann-benchmarks - * - * Dataset Dimensions Train_size Test_size Neighbors Distance Download - * Fashion- - MNIST 784 60,000 10,000 100 Euclidean HDF5 (217MB) - * GIST 960 1,000,000 1,000 100 Euclidean HDF5 (3.6GB) - * GloVe 100 1,183,514 10,000 100 Angular HDF5 (463MB) - * GloVe 200 1,183,514 10,000 100 Angular HDF5 (918MB) - * MNIST 784 60,000 10,000 100 Euclidean HDF5 (217MB) - * NYTimes 256 290,000 10,000 100 Angular HDF5 (301MB) - * SIFT 128 1,000,000 10,000 100 Euclidean HDF5 (501MB) - *************************************************************************************/ - -using idx_t = int64_t; -using distance_t = float; - -class Benchmark_knowhere : public ::testing::Test { +class Benchmark_knowhere : public Benchmark_sift { public: - double get_time_diff() { - return elapsed() - T0_; - } - - bool parse_ann_test_name() { - size_t pos1, pos2; - - if (ann_test_name_.empty()) { - return false; - } - - pos1 = ann_test_name_.find_first_of('-', 0); - if (pos1 == std::string::npos) { - return false; - } - pos2 = ann_test_name_.find_first_of('-', pos1 + 1); - if (pos2 == std::string::npos) { - return false; - } - - dim_ = std::stoi(ann_test_name_.substr(pos1 + 1, pos2 - pos1 - 1)); - std::string metric_str = ann_test_name_.substr(pos2 + 1); - if (metric_str == "angular") { - metric_type_ = knowhere::metric::IP; - } else if (metric_str == "euclidean") { - metric_type_ = knowhere::metric::L2; - } else { - return false; - } - - return true; - } - - int32_t CalcRecall(const idx_t* ids, int32_t nq, int32_t k) { - int32_t min_k = std::min(gt_k_, k); - int32_t hit = 0; - for (int32_t i = 0; i < nq; i++) { - std::unordered_set ground(gt_ids_ + i * gt_k_, gt_ids_ + i * gt_k_ + min_k); - for (int32_t j = 0; j < min_k; j++) { - idx_t id = ids[i * k + j]; - if (ground.count(id) > 0) { - hit++; - } - } - } - return hit; - } - - void load_base_data() { - const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; - - int32_t dim; - printf("[%.3f s] Loading HDF5 file: %s\n", get_time_diff(), ann_file_name.c_str()); - xb_ = (float*)hdf5_read(ann_file_name, HDF5_DATASET_TRAIN, H5T_FLOAT, dim, nb_); - assert(dim == dim_ || !"dataset does not have correct dimension"); - - if (metric_type_ == knowhere::metric::IP) { - printf("[%.3f s] Normalizing base data set \n", get_time_diff()); - normalize(xb_, nb_, dim_); - } - } - - void load_query_data() { - const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; - - int32_t dim; - xq_ = (float*)hdf5_read(ann_file_name, HDF5_DATASET_TEST, H5T_FLOAT, dim, nq_); - assert(dim == dim_ || !"query does not have same dimension as train set"); - - if (metric_type_ == knowhere::metric::IP) { - printf("[%.3f s] Normalizing query data \n", get_time_diff()); - normalize(xq_, nq_, dim_); - } - } - - void load_ground_truth() { - const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; - - // load ground-truth and convert int to long - int32_t gt_nq; - int* gt_int = (int*)hdf5_read(ann_file_name, HDF5_DATASET_NEIGHBORS, H5T_INTEGER, gt_k_, gt_nq); - assert(gt_nq == nq_ || !"incorrect nb of ground truth index"); - - gt_ids_ = new idx_t[gt_k_ * nq_]; - for (int32_t i = 0; i < gt_k_ * nq_; i++) { - gt_ids_[i] = gt_int[i]; - } - delete[] gt_int; - -#if DEBUG_VERBOSE - faiss::Index::distance_t* gt_dist; // nq * k matrix of ground-truth nearest-neighbors distances - gt_dist = (float*)hdf5_read(ann_file_name, HDF5_DATASET_DISTANCES, H5T_FLOAT, k, nq2); - assert(nq2 == nq || !"incorrect nb of ground truth distance"); - - std::string str; - str = ann_test_name + " ground truth index"; - print_array(str.c_str(), true, gt, nq, k); - str = ann_test_name + " ground truth distance"; - print_array(str.c_str(), false, gt_dist, nq, k); - - delete gt_dist; -#endif - } - - void write_index( - const std::string& filename, - const knowhere::Config& conf) { - + void + write_index(const std::string& filename, const knowhere::Config& conf) { binary_set_.clear(); FileIOWriter writer(filename); @@ -300,7 +47,8 @@ class Benchmark_knowhere : public ::testing::Test { } } - void read_index(const std::string& filename) { + void + read_index(const std::string& filename) { binary_set_.clear(); FileIOReader reader(filename); @@ -330,7 +78,8 @@ class Benchmark_knowhere : public ::testing::Test { } } - std::string get_index_name(const std::vector& params) { + std::string + get_index_name(const std::vector& params) { std::string params_str = ""; for (size_t i = 0; i < params.size(); i++) { params_str += "_" + std::to_string(params[i]); @@ -338,10 +87,8 @@ class Benchmark_knowhere : public ::testing::Test { return ann_test_name_ + "_" + std::string(index_type_) + params_str + ".index"; } - void create_cpu_index( - const std::string& index_file_name, - const knowhere::Config& conf) { - + void + create_cpu_index(const std::string& index_file_name, const knowhere::Config& conf) { printf("[%.3f s] Creating CPU index \"%s\"\n", get_time_diff(), std::string(index_type_).c_str()); auto& factory = knowhere::VecIndexFactory::GetInstance(); index_ = factory.CreateVecIndex(index_type_); @@ -359,25 +106,20 @@ class Benchmark_knowhere : public ::testing::Test { } } - void test_idmap(const knowhere::Config& cfg) { + void + test_idmap(const knowhere::Config& cfg) { auto conf = cfg; - printf("\n[%0.3f s] %s | %s \n", - get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str()); + printf("\n[%0.3f s] %s | %s \n", get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str()); printf("================================================================================\n"); for (auto nq : NQs_) { knowhere::DatasetPtr ds_ptr = knowhere::GenDataset(nq, dim_, xq_); for (auto k : TOPKs_) { knowhere::SetMetaTopk(conf, k); - - double t_start = elapsed(), t_end; - auto result = index_->Query(ds_ptr, conf, nullptr); - t_end = elapsed(); - + CALC_TIME_SPAN(auto result = index_->Query(ds_ptr, conf, nullptr)); auto ids = knowhere::GetDatasetIDs(result); - int32_t hit = CalcRecall(ids, nq, k); - printf(" nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", - nq, k, (t_end - t_start), (hit / float(nq * std::min(gt_k_, k)))); + float recall = CalcRecall(ids, nq, k); + printf(" nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", nq, k, t_diff, recall); } } printf("================================================================================\n"); @@ -385,12 +127,13 @@ class Benchmark_knowhere : public ::testing::Test { std::string(index_type_).c_str()); } - void test_ivf(const knowhere::Config& cfg) { + void + test_ivf(const knowhere::Config& cfg) { auto conf = cfg; auto nlist = knowhere::GetIndexParamNlist(conf); - printf("\n[%0.3f s] %s | %s | nlist=%ld\n", - get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str(), nlist); + printf("\n[%0.3f s] %s | %s | nlist=%ld\n", get_time_diff(), ann_test_name_.c_str(), + std::string(index_type_).c_str(), nlist); printf("================================================================================\n"); for (auto nprobe : NPROBEs_) { knowhere::SetIndexParamNprobe(conf, nprobe); @@ -398,15 +141,11 @@ class Benchmark_knowhere : public ::testing::Test { knowhere::DatasetPtr ds_ptr = knowhere::GenDataset(nq, dim_, xq_); for (auto k : TOPKs_) { knowhere::SetMetaTopk(conf, k); - - double t_start = elapsed(), t_end; - auto result = index_->Query(ds_ptr, conf, nullptr); - t_end = elapsed(); - + CALC_TIME_SPAN(auto result = index_->Query(ds_ptr, conf, nullptr)); auto ids = knowhere::GetDatasetIDs(result); - int32_t hit = CalcRecall(ids, nq, k); - printf(" nprobe = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", - nprobe, nq, k, (t_end - t_start), (hit / float(nq * std::min(gt_k_, k)))); + float recall = CalcRecall(ids, nq, k); + printf(" nprobe = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", nprobe, nq, k, t_diff, + recall); } } } @@ -415,29 +154,25 @@ class Benchmark_knowhere : public ::testing::Test { std::string(index_type_).c_str()); } - void test_hnsw(const knowhere::Config& cfg) { + void + test_hnsw(const knowhere::Config& cfg) { auto conf = cfg; auto M = knowhere::GetIndexParamHNSWM(conf); auto efConstruction = knowhere::GetIndexParamEfConstruction(conf); - printf("\n[%0.3f s] %s | %s | M=%ld | efConstruction=%ld\n", - get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str(), M, efConstruction); + printf("\n[%0.3f s] %s | %s | M=%ld | efConstruction=%ld\n", get_time_diff(), ann_test_name_.c_str(), + std::string(index_type_).c_str(), M, efConstruction); printf("================================================================================\n"); - for (auto ef: EFs_) { + for (auto ef : EFs_) { knowhere::SetIndexParamEf(conf, ef); for (auto nq : NQs_) { knowhere::DatasetPtr ds_ptr = knowhere::GenDataset(nq, dim_, xq_); for (auto k : TOPKs_) { knowhere::SetMetaTopk(conf, k); - - double t_start = elapsed(), t_end; - auto result = index_->Query(ds_ptr, conf, nullptr); - t_end = elapsed(); - + CALC_TIME_SPAN(auto result = index_->Query(ds_ptr, conf, nullptr)); auto ids = knowhere::GetDatasetIDs(result); - int32_t hit = CalcRecall(ids, nq, k); - printf(" ef = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", - ef, nq, k, (t_end - t_start), (hit / float(nq * std::min(gt_k_, k)))); + float recall = CalcRecall(ids, nq, k); + printf(" ef = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", ef, nq, k, t_diff, recall); } } } @@ -446,28 +181,25 @@ class Benchmark_knowhere : public ::testing::Test { std::string(index_type_).c_str()); } - void test_annoy(const knowhere::Config& cfg) { + void + test_annoy(const knowhere::Config& cfg) { auto conf = cfg; auto n_trees = knowhere::GetIndexParamNtrees(conf); - printf("\n[%0.3f s] %s | %s | n_trees=%ld \n", - get_time_diff(), ann_test_name_.c_str(), std::string(index_type_).c_str(), n_trees); + printf("\n[%0.3f s] %s | %s | n_trees=%ld \n", get_time_diff(), ann_test_name_.c_str(), + std::string(index_type_).c_str(), n_trees); printf("================================================================================\n"); - for (auto sk: SEARCH_Ks_) { + for (auto sk : SEARCH_Ks_) { knowhere::SetIndexParamSearchK(conf, sk); for (auto nq : NQs_) { knowhere::DatasetPtr ds_ptr = knowhere::GenDataset(nq, dim_, xq_); for (auto k : TOPKs_) { knowhere::SetMetaTopk(conf, k); - - double t_start = elapsed(), t_end; - auto result = index_->Query(ds_ptr, conf, nullptr); - t_end = elapsed(); - + CALC_TIME_SPAN(auto result = index_->Query(ds_ptr, conf, nullptr)); auto ids = knowhere::GetDatasetIDs(result); - int32_t hit = CalcRecall(ids, nq, k); - printf(" search_k = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", - sk, nq, k, (t_end - t_start), (hit / float(nq * std::min(gt_k_, k)))); + float recall = CalcRecall(ids, nq, k); + printf(" search_k = %4d, nq = %4d, k = %4d, elapse = %.4fs, R@ = %.4f\n", sk, nq, k, t_diff, + recall); } } } @@ -477,44 +209,17 @@ class Benchmark_knowhere : public ::testing::Test { } protected: - void SetUp() override { - T0_ = elapsed(); - - if (!parse_ann_test_name()) { - assert(true); - } - - printf("[%.3f s] Loading base data\n", get_time_diff()); - load_base_data(); - - printf("[%.3f s] Loading queries\n", get_time_diff()); - load_query_data(); - - printf("[%.3f s] Loading ground truth\n", get_time_diff()); - load_ground_truth(); - + void + SetUp() override { + set_ann_test_name("sift-128-euclidean"); + Benchmark_sift::SetUp(); + metric_type_ = (metric_str_ == METRIC_IP_STR) ? knowhere::metric::IP : knowhere::metric::L2; knowhere::SetMetaMetricType(cfg_, metric_type_); knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AUTO); } - void TearDown() override { - delete[] xb_; - delete[] xq_; - delete[] gt_ids_; - } - protected: - double T0_; - std::string ann_test_name_ = "sift-128-euclidean"; knowhere::MetricType metric_type_; - int32_t dim_; - int32_t nb_; - int32_t nq_; - int32_t gt_k_; - distance_t* xb_; - distance_t* xq_; - idx_t* gt_ids_; // ground-truth index - knowhere::BinarySet binary_set_; knowhere::IndexType index_type_; knowhere::VecIndexPtr index_ = nullptr; @@ -597,7 +302,6 @@ TEST_F(Benchmark_knowhere, TEST_HNSW) { } TEST_F(Benchmark_knowhere, TEST_ANNOY) { - index_type_ = knowhere::IndexEnum::INDEX_ANNOY; knowhere::Config conf = cfg_; diff --git a/unittest/benchmark/benchmark_sift.h b/unittest/benchmark/benchmark_sift.h new file mode 100644 index 000000000..dfefe3e94 --- /dev/null +++ b/unittest/benchmark/benchmark_sift.h @@ -0,0 +1,297 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "knowhere/index/IndexType.h" +#include "knowhere/index/vector_index/adapter/VectorAdapter.h" + +/***************************************************** + * To run this test, please download the HDF5 from + * https://support.hdfgroup.org/ftp/HDF5/releases/ + * and install it to /usr/local/hdf5 . + *****************************************************/ +static const char* HDF5_POSTFIX = ".hdf5"; +static const char* HDF5_DATASET_TRAIN = "train"; +static const char* HDF5_DATASET_TEST = "test"; +static const char* HDF5_DATASET_NEIGHBORS = "neighbors"; +static const char* HDF5_DATASET_DISTANCES = "distances"; + +static const char* METRIC_IP_STR = "angular"; +static const char* METRIC_L2_STR = "euclidean"; + +/************************************************************************************ + * https://github.com/erikbern/ann-benchmarks + * + * Dataset Dimensions Train_size Test_size Neighbors Distance Download + * Fashion- + MNIST 784 60,000 10,000 100 Euclidean HDF5 (217MB) + * GIST 960 1,000,000 1,000 100 Euclidean HDF5 (3.6GB) + * GloVe 100 1,183,514 10,000 100 Angular HDF5 (463MB) + * GloVe 200 1,183,514 10,000 100 Angular HDF5 (918MB) + * MNIST 784 60,000 10,000 100 Euclidean HDF5 (217MB) + * NYTimes 256 290,000 10,000 100 Angular HDF5 (301MB) + * SIFT 128 1,000,000 10,000 100 Euclidean HDF5 (501MB) + *************************************************************************************/ +using idx_t = int64_t; +using distance_t = float; + +class Benchmark_sift : public ::testing::Test { + public: + void + normalize(float* arr, int32_t nq, int32_t dim) { + for (int32_t i = 0; i < nq; i++) { + double vecLen = 0.0, inv_vecLen = 0.0; + for (int32_t j = 0; j < dim; j++) { + double val = arr[i * dim + j]; + vecLen += val * val; + } + inv_vecLen = 1.0 / std::sqrt(vecLen); + for (int32_t j = 0; j < dim; j++) { + arr[i * dim + j] = (float)(arr[i * dim + j] * inv_vecLen); + } + } + } + + double + elapsed() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return tv.tv_sec + tv.tv_usec * 1e-6; + } + + double + get_time_diff() { + return elapsed() - T0_; + } + + void + set_ann_test_name(const char* test_name) { + ann_test_name_ = test_name; + } + + float + CalcRecall(const idx_t* ids, int32_t nq, int32_t k) { + int32_t min_k = std::min(gt_k_, k); + int32_t hit = 0; + for (int32_t i = 0; i < nq; i++) { + std::unordered_set ground(gt_ids_ + i * gt_k_, gt_ids_ + i * gt_k_ + min_k); + for (int32_t j = 0; j < min_k; j++) { + idx_t id = ids[i * k + j]; + if (ground.count(id) > 0) { + hit++; + } + } + } + return (hit * 1.0f / (nq * min_k)); + } + + float + CalcRecall(const idx_t* ids, int32_t nq_start, int32_t step, int32_t k) { + assert(nq_start + step <= 10000); + int32_t min_k = std::min(gt_k_, k); + int32_t hit = 0; + for (int32_t i = 0; i < step; i++) { + std::unordered_set ground(gt_ids_ + (i + nq_start) * gt_k_, + gt_ids_ + (i + nq_start) * gt_k_ + min_k); + for (int32_t j = 0; j < min_k; j++) { + idx_t id = ids[i * k + j]; + if (ground.count(id) > 0) { + hit++; + } + } + } + return (hit * 1.0f / (step * min_k)); + } + + void + parse_ann_test_name() { + size_t pos1, pos2; + + assert(!ann_test_name_.empty() || !"ann_test_name not set"); + pos1 = ann_test_name_.find_first_of('-', 0); + assert(pos1 != std::string::npos); + + pos2 = ann_test_name_.find_first_of('-', pos1 + 1); + assert(pos2 != std::string::npos); + + dim_ = std::stoi(ann_test_name_.substr(pos1 + 1, pos2 - pos1 - 1)); + metric_str_ = ann_test_name_.substr(pos2 + 1); + assert(metric_str_ == METRIC_IP_STR || metric_str_ == METRIC_L2_STR); + } + + void + load_base_data() { + const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; + + int32_t dim; + printf("[%.3f s] Loading HDF5 file: %s\n", get_time_diff(), ann_file_name.c_str()); + xb_ = (float*)hdf5_read(ann_file_name, HDF5_DATASET_TRAIN, H5T_FLOAT, dim, nb_); + assert(dim == dim_ || !"dataset does not have correct dimension"); + + if (metric_str_ == METRIC_IP_STR) { + printf("[%.3f s] Normalizing base data set \n", get_time_diff()); + normalize(xb_, nb_, dim_); + } + } + + void + load_query_data() { + const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; + + int32_t dim; + xq_ = (float*)hdf5_read(ann_file_name, HDF5_DATASET_TEST, H5T_FLOAT, dim, nq_); + assert(dim == dim_ || !"query does not have same dimension as train set"); + + if (metric_str_ == METRIC_IP_STR) { + printf("[%.3f s] Normalizing query data \n", get_time_diff()); + normalize(xq_, nq_, dim_); + } + } + + void + load_ground_truth() { + const std::string ann_file_name = ann_test_name_ + HDF5_POSTFIX; + + // load ground-truth and convert int to long + int32_t gt_nq; + int* gt_int = (int*)hdf5_read(ann_file_name, HDF5_DATASET_NEIGHBORS, H5T_INTEGER, gt_k_, gt_nq); + assert(gt_nq == nq_ || !"incorrect nb of ground truth index"); + + gt_ids_ = new idx_t[gt_k_ * nq_]; + for (int32_t i = 0; i < gt_k_ * nq_; i++) { + gt_ids_[i] = gt_int[i]; + } + delete[] gt_int; + +#if DEBUG_VERBOSE + distance_t* gt_dist = (float*)hdf5_read(ann_file_name, HDF5_DATASET_DISTANCES, H5T_FLOAT, k, nq2); + assert(nq2 == nq || !"incorrect nb of ground truth distance"); +#endif + } + + private: + void* + hdf5_read(const std::string& file_name, const std::string& dataset_name, H5T_class_t dataset_class, int32_t& d_out, + int32_t& n_out) { + hid_t file, dataset, datatype, dataspace, memspace; + H5T_class_t t_class; /* data type class */ + hsize_t dimsm[3]; /* memory space dimensions */ + hsize_t dims_out[2]; /* dataset dimensions */ + hsize_t count[2]; /* size of the hyperslab in the file */ + hsize_t offset[2]; /* hyperslab offset in the file */ + hsize_t count_out[3]; /* size of the hyperslab in memory */ + hsize_t offset_out[3]; /* hyperslab offset in memory */ + void* data_out = nullptr; /* output buffer */ + + /* Open the file and the dataset. */ + file = H5Fopen(file_name.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); + dataset = H5Dopen2(file, dataset_name.c_str(), H5P_DEFAULT); + + /* Get datatype and dataspace handles and then query + * dataset class, order, size, rank and dimensions. */ + datatype = H5Dget_type(dataset); /* datatype handle */ + t_class = H5Tget_class(datatype); + assert(t_class == dataset_class || !"Illegal dataset class type"); + + dataspace = H5Dget_space(dataset); /* dataspace handle */ + H5Sget_simple_extent_dims(dataspace, dims_out, nullptr); + n_out = dims_out[0]; + d_out = dims_out[1]; + + /* Define hyperslab in the dataset. */ + offset[0] = offset[1] = 0; + count[0] = dims_out[0]; + count[1] = dims_out[1]; + H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, offset, nullptr, count, nullptr); + + /* Define the memory dataspace. */ + dimsm[0] = dims_out[0]; + dimsm[1] = dims_out[1]; + dimsm[2] = 1; + memspace = H5Screate_simple(3, dimsm, nullptr); + + /* Define memory hyperslab. */ + offset_out[0] = offset_out[1] = offset_out[2] = 0; + count_out[0] = dims_out[0]; + count_out[1] = dims_out[1]; + count_out[2] = 1; + H5Sselect_hyperslab(memspace, H5S_SELECT_SET, offset_out, nullptr, count_out, nullptr); + + /* Read data from hyperslab in the file into the hyperslab in memory and display. */ + switch (t_class) { + case H5T_INTEGER: + data_out = new int[dims_out[0] * dims_out[1]]; + H5Dread(dataset, H5T_NATIVE_INT, memspace, dataspace, H5P_DEFAULT, data_out); + break; + case H5T_FLOAT: + data_out = new float[dims_out[0] * dims_out[1]]; + H5Dread(dataset, H5T_NATIVE_FLOAT, memspace, dataspace, H5P_DEFAULT, data_out); + break; + default: + printf("Illegal dataset class type\n"); + break; + } + + /* Close/release resources. */ + H5Tclose(datatype); + H5Dclose(dataset); + H5Sclose(dataspace); + H5Sclose(memspace); + H5Fclose(file); + + return data_out; + } + + protected: + void + SetUp() override { + T0_ = elapsed(); + + parse_ann_test_name(); + + printf("[%.3f s] Loading base data\n", get_time_diff()); + load_base_data(); + + printf("[%.3f s] Loading queries\n", get_time_diff()); + load_query_data(); + + printf("[%.3f s] Loading ground truth\n", get_time_diff()); + load_ground_truth(); + } + + void + TearDown() override { + delete[] xb_; + delete[] xq_; + delete[] gt_ids_; + } + + protected: + double T0_; + std::string ann_test_name_ = ""; + std::string metric_str_; + int32_t dim_; + distance_t* xb_; + distance_t* xq_; + int32_t nb_; + int32_t nq_; + int32_t gt_k_; + idx_t* gt_ids_; // ground-truth index +}; diff --git a/unittest/benchmark/benchmark_faiss_ref.log b/unittest/benchmark/ref_log/benchmark_faiss_ref.log similarity index 100% rename from unittest/benchmark/benchmark_faiss_ref.log rename to unittest/benchmark/ref_log/benchmark_faiss_ref.log diff --git a/unittest/benchmark/benchmark_knowhere_ref.log b/unittest/benchmark/ref_log/benchmark_knowhere_ref.log similarity index 100% rename from unittest/benchmark/benchmark_knowhere_ref.log rename to unittest/benchmark/ref_log/benchmark_knowhere_ref.log