Skip to content
This repository has been archived by the owner on Aug 16, 2023. It is now read-only.

optimize gpu bitset filter #875

Merged
merged 1 commit on May 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ include(cmake/libs/libfaiss.cmake)
include(cmake/libs/libhnsw.cmake)

include_directories(thirdparty/faiss)
include_directories(thirdparty/bitset)

find_package(OpenMP REQUIRED)

Expand Down Expand Up @@ -148,14 +147,6 @@ add_library(knowhere SHARED ${KNOWHERE_SRCS})
add_dependencies(knowhere ${KNOWHERE_LINKER_LIBS})
if(WITH_RAFT)
list(APPEND KNOWHERE_LINKER_LIBS raft::raft raft::compiled)
find_library(LIBRAFT_FOUND raft)
if (NOT LIBRAFT_FOUND)
message(WARNING "libraft not found")
else()
message(STATUS "libraft found")
list(APPEND KNOWHERE_LINKER_LIBS ${LIBRAFT_FOUND})
add_definitions(-DRAFT_COMPILED)
endif()
endif()
target_link_libraries(knowhere PUBLIC ${KNOWHERE_LINKER_LIBS})
target_include_directories(
Expand Down
16 changes: 12 additions & 4 deletions src/index/ivf/ivf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -532,11 +532,11 @@ IvfIndexNode<T>::GetVectorByIds(const DataSet& dataset) const {
if (!this->index_->is_trained) {
return unexpected(Status::index_not_trained);
}
auto dim = Dim();
auto rows = dataset.GetRows();
auto ids = dataset.GetIds();

if constexpr (std::is_same<T, faiss::IndexBinaryIVF>::value) {
auto dim = Dim();
auto rows = dataset.GetRows();
auto ids = dataset.GetIds();

uint8_t* data = nullptr;
try {
data = new uint8_t[dim * rows / 8];
Expand All @@ -553,6 +553,10 @@ IvfIndexNode<T>::GetVectorByIds(const DataSet& dataset) const {
return unexpected(Status::faiss_inner_error);
}
} else if constexpr (std::is_same<T, faiss::IndexIVFFlat>::value) {
auto dim = Dim();
auto rows = dataset.GetRows();
auto ids = dataset.GetIds();

float* data = nullptr;
try {
data = new float[dim * rows];
Expand All @@ -569,6 +573,10 @@ IvfIndexNode<T>::GetVectorByIds(const DataSet& dataset) const {
return unexpected(Status::faiss_inner_error);
}
} else if constexpr (std::is_same<T, faiss::IndexIVFFlatCC>::value) {
auto dim = Dim();
auto rows = dataset.GetRows();
auto ids = dataset.GetIds();

float* data = nullptr;
try {
data = new float[dim * rows];
Expand Down
138 changes: 99 additions & 39 deletions src/index/ivf_raft/ivf_raft.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,42 +45,86 @@
namespace knowhere {

// Mark which ANN candidates survive the bitset filter.
//
// bs         : bitset (little-endian within each byte); bit i set means id i
//              is filtered OUT (see the pre-existing CPU-side convention).
// ids_before : flat [nq * k2] candidate ids produced by the raw search.
// ids_block  : output flags, one per candidate: 1 = keep, 0 = filtered out.
// len        : total number of candidates (nq * k2).
//
// Launch: 1-D grid, one thread per candidate, e.g.
//   filter<<<(len + 1023) / 1024, 1024>>>(...).
__global__ void
filter(const uint8_t* bs, const int64_t* ids_before, int32_t* ids_block, int len) {
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    if (tx >= len)
        return;
    int64_t i = ids_before[tx];
    // Extract the single bit for id i. This must be a bitwise test of the
    // exact bit ((bs[i >> 3] >> (i & 0x7)) & 0x1), NOT a logical `&&` on the
    // whole byte — `&&` would drop a candidate whenever ANY id in its byte
    // is filtered. Bit set -> 0 (drop), bit clear -> 1 (keep).
    ids_block[tx] = 1 - ((bs[i >> 3] >> (i & 0x7)) & 0x1);
}

// Per-row work-efficient (Blelloch) scan of the keep-flags.
//
// g_idata : [gridDim.y rows * n] 0/1 flags from filter().
// g_odata : on entry, a COPY of g_idata; on exit, the inclusive prefix sum
//           of each row. The kernel computes an EXCLUSIVE scan in shared
//           memory and adds it element-wise onto the copied input
//           (exclusive scan + original value == inclusive scan), which is
//           why the host memcpys ids_block into pre_ids_block first.
// n       : row length; must be a power of two (the caller rounds k2 up).
//
// Launch: <<<dim3(1, rows), n / 2, n * sizeof(int32_t)>>> — one block per
// row, each thread owns two elements, so blockDim.x == n / 2 and gridDim.x
// must be 1 (tx below is only valid as an intra-row index then).
// NOTE(review): shared-memory accesses are unpadded, so the tree phases
// incur bank conflicts — correct, just not bank-conflict-free.
__global__ void
prescan(int32_t* g_odata, int32_t* g_idata, int n) {
    extern __shared__ int32_t temp[];
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
    int offset = 1;
    // Each thread loads its pair of elements for this row into shared memory.
    temp[2 * tx] = g_idata[2 * tx + ty * n];
    temp[2 * tx + 1] = g_idata[2 * tx + 1 + ty * n];
    // Up-sweep (reduce) phase: build partial sums up the binary tree.
    for (int d = n >> 1; d > 0; d >>= 1) {
        __syncthreads();
        if (tx < d) {
            int ai = offset * (2 * tx + 1) - 1;
            int bi = offset * (2 * tx + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    __syncthreads();
    // Clear the root before the down-sweep (exclusive-scan identity).
    if (tx == 0) {
        temp[n - 1] = 0;
    }
    // Down-sweep phase: traverse back down, swapping and accumulating.
    for (int d = 1; d < n; d *= 2) {
        offset >>= 1;
        __syncthreads();
        if (tx < d) {
            int ai = offset * (2 * tx + 1) - 1;
            int bi = offset * (2 * tx + 2) - 1;
            int32_t t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();
    // exclusive scan (shared) + copied input (g_odata) = inclusive scan.
    g_odata[2 * tx + ty * n] += temp[2 * tx];
    g_odata[2 * tx + 1 + ty * n] += temp[2 * tx + 1];
}

// Compact each row's surviving candidates into the first k1 output slots.
//
// pre_ids_block : [rows * k2] inclusive prefix sums of the keep-flags, so
//                 pre_ids_block[ty*k2 + j] == number of survivors among the
//                 first j+1 candidates of row ty.
// ids_before /
// dis_before    : raw [rows * k2] search results. ids_before[0] doubles as a
//                 host-visible sentinel (set to -1 when a row has fewer than
//                 k1 survivors, so the host retries with a larger k2).
// ids_after /
// dis_after     : compacted [rows * k1] outputs.
//
// Launch: <<<dim3(1, rows), k1, k2 * sizeof(int32_t)>>> — thread tx of row
// ty produces the (tx+1)-th surviving candidate of that row.
__global__ void
write_back(const int32_t* pre_ids_block, int64_t* ids_before, float* dis_before, int64_t* ids_after, float* dis_after,
           int k1, int k2) {
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    // Stage this row's prefix sums in dynamic shared memory; k1 threads
    // stride over the k2 entries.
    extern __shared__ int32_t temp[];
    for (int i = tx; i < k2; i += k1) {
        temp[i] = pre_ids_block[ty * k2 + i];
    }
    __syncthreads();
    // The prefix sums are non-decreasing, so binary-search the first index
    // whose inclusive count reaches tx + 1: that is exactly the position of
    // the (tx+1)-th candidate that passed the filter. (The original
    // `== tx + 1` and `> tx + 1` branches both shrank `right`; folded into
    // a single `>=` comparison.)
    int left = 0;
    int right = k2;
    while (left < right) {
        int mid = (left + right) / 2;
        if (temp[mid] >= tx + 1) {
            right = mid;
        } else {
            left = mid + 1;
        }
    }
    if (left < k2) {
        ids_after[ty * k1 + tx] = ids_before[ty * k2 + left];
        dis_after[ty * k1 + tx] = dis_before[ty * k2 + left];
    } else {
        // Fewer than k1 survivors in this row: pad the tail with invalid
        // entries and poison the sentinel so the host-side retry loop widens
        // k2. Multiple threads may store -1 concurrently; they all write the
        // same value, so the race is benign.
        ids_after[ty * k1 + tx] = -1;
        dis_after[ty * k1 + tx] = 1.0f / 0.0f;  // +inf distance for padding
        ids_before[0] = -1;                     // destroy old answer; host checks this
    }
}

namespace raft_res_pool {
Expand Down Expand Up @@ -443,7 +487,7 @@ class RaftIvfIndexNode : public IndexNode {
k2 |= k2 >> 8;
k2 |= k2 >> 14;
k2 += 1;
while (k2 <= 1024) {
while (k2 <= (1 << 14)) {
auto ids_gpu_before = raft::make_device_matrix<std::int64_t, std::int64_t>(*res_, rows, k2);
auto dis_gpu_before = raft::make_device_matrix<float, std::int64_t>(*res_, rows, k2);
auto bs_gpu = raft::make_device_vector<uint8_t, std::int64_t>(*res_, bitset.byte_size());
Expand All @@ -453,9 +497,18 @@ class RaftIvfIndexNode : public IndexNode {
raft::neighbors::ivf_flat::search<float, std::int64_t>(
*res_, search_params, *gpu_index_, raft::make_const_mdspan(data_gpu.view()),
ids_gpu_before.view(), dis_gpu_before.view());
filter<<<dim3(1, rows), k2, k2 * sizeof(std::int64_t) + k2 * sizeof(float), stream.value()>>>(
k1, k2, rows, bs_gpu.data_handle(), ids_gpu_before.data_handle(),
dis_gpu_before.data_handle(), ids_gpu.data_handle(), dis_gpu.data_handle());

auto ids_block = raft::make_device_vector<int32_t, std::int64_t>(*res_, rows * k2);
filter<<<(rows * k2 + 1023) / 1024, 1024, 0, stream.value()>>>(
bs_gpu.data_handle(), ids_gpu_before.data_handle(), ids_block.data_handle(), rows * k2);
auto pre_ids_block = raft::make_device_vector<int32_t, std::int64_t>(*res_, rows * k2);
RAFT_CUDA_TRY(cudaMemcpyAsync(pre_ids_block.data_handle(), ids_block.data_handle(),
rows * k2 * sizeof(int32_t), cudaMemcpyDefault, stream.value()));
prescan<<<dim3(1, rows), k2 / 2, sizeof(int32_t) * k2, stream.value()>>>(
pre_ids_block.data_handle(), ids_block.data_handle(), k2);
write_back<<<dim3(1, rows), k1, sizeof(int32_t) * k2, stream.value()>>>(
pre_ids_block.data_handle(), ids_gpu_before.data_handle(), dis_gpu_before.data_handle(),
ids_gpu.data_handle(), dis_gpu.data_handle(), k1, k2);

std::int64_t is_fine = 0;
RAFT_CUDA_TRY(cudaMemcpyAsync(&is_fine, ids_gpu_before.data_handle(), sizeof(std::int64_t),
Expand Down Expand Up @@ -506,7 +559,7 @@ class RaftIvfIndexNode : public IndexNode {
k2 |= k2 >> 8;
k2 |= k2 >> 14;
k2 += 1;
while (k2 <= 1024) {
while (k2 <= (1 << 14)) {
auto ids_gpu_before = raft::make_device_matrix<std::int64_t, std::int64_t>(*res_, rows, k2);
auto dis_gpu_before = raft::make_device_matrix<float, std::int64_t>(*res_, rows, k2);
auto bs_gpu = raft::make_device_vector<uint8_t, std::int64_t>(*res_, bitset.byte_size());
Expand All @@ -517,9 +570,17 @@ class RaftIvfIndexNode : public IndexNode {
*res_, search_params, *gpu_index_, raft::make_const_mdspan(data_gpu.view()),
ids_gpu_before.view(), dis_gpu_before.view());

filter<<<dim3(1, rows), k2, k2 * sizeof(std::int64_t) + k2 * sizeof(float), stream.value()>>>(
k1, k2, rows, bs_gpu.data_handle(), ids_gpu_before.data_handle(),
dis_gpu_before.data_handle(), ids_gpu.data_handle(), dis_gpu.data_handle());
auto ids_block = raft::make_device_vector<int32_t, std::int64_t>(*res_, rows * k2);
filter<<<(rows * k2 + 1023) / 1024, 1024, 0, stream.value()>>>(
bs_gpu.data_handle(), ids_gpu_before.data_handle(), ids_block.data_handle(), rows * k2);
auto pre_ids_block = raft::make_device_vector<int32_t, std::int64_t>(*res_, rows * k2);
RAFT_CUDA_TRY(cudaMemcpyAsync(pre_ids_block.data_handle(), ids_block.data_handle(),
rows * k2 * sizeof(int32_t), cudaMemcpyDefault, stream.value()));
prescan<<<dim3(1, rows), k2 / 2, sizeof(int32_t) * k2, stream.value()>>>(
pre_ids_block.data_handle(), ids_block.data_handle(), k2);
write_back<<<dim3(1, rows), k1, sizeof(int32_t) * k2, stream.value()>>>(
pre_ids_block.data_handle(), ids_gpu_before.data_handle(), dis_gpu_before.data_handle(),
Presburger marked this conversation as resolved.
Show resolved Hide resolved
ids_gpu.data_handle(), dis_gpu.data_handle(), k1, k2);

std::int64_t is_fine = 0;
RAFT_CUDA_TRY(cudaMemcpyAsync(&is_fine, ids_gpu_before.data_handle(), sizeof(std::int64_t),
Expand All @@ -530,7 +591,6 @@ class RaftIvfIndexNode : public IndexNode {
k2 = k2 << 1;
}
}

} else {
static_assert(std::is_same_v<detail::raft_ivf_flat_index, T>);
}
Expand Down Expand Up @@ -630,7 +690,7 @@ class RaftIvfIndexNode : public IndexNode {
}
// TODO(yusheng.ma):support no raw data mode
/*
#define RAW_DATA "RAW_DATA"
#define RAW_DATA "RAW_DATA"
auto data = binset.GetByName(RAW_DATA);
raft_gpu::raw_data_copy(*this->index_, data->data.get(), data->size);
*/
Expand Down