Skip to content

Commit

Permalink
search: fix subtle bug with some newest-first text search
Browse files Browse the repository at this point in the history
Due to the way the range queries work for newest-first searches, we can
have a situation where the MDB_SET_RANGE gets placed on either the
correct place or just after the correct place. To position the cursor
correctly, we jump back one if the search result prefix doesn't match.
  • Loading branch information
jb55 committed Dec 2, 2023
1 parent 488b4c1 commit 26e8054
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 26 deletions.
7 changes: 7 additions & 0 deletions ndb.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ static void ndb_print_text_search_result(struct ndb_txn *txn,
printf("\n%s\n\n---\n", ndb_note_str(note, &note->content).str);
}

int ndb_print_search_keys(struct ndb_txn *txn);

int main(int argc, char *argv[])
{
struct ndb *ndb;
Expand Down Expand Up @@ -199,6 +201,11 @@ int main(int argc, char *argv[])
} else if (argc == 3 && !strcmp(argv[1], "import")) {
map_file(argv[2], &data, &data_len);
ndb_process_events(ndb, (const char *)data, data_len);
ndb_process_client_events(ndb, (const char *)data, data_len);
} else if (argc == 2 && !strcmp(argv[1], "print-search-keys")) {
ndb_begin_query(ndb, &txn);
ndb_print_search_keys(&txn);
ndb_end_query(&txn);
} else {
return usage();
}
Expand Down
108 changes: 82 additions & 26 deletions nostrdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -2473,6 +2473,38 @@ static void ndb_print_text_search_key(struct ndb_text_search_key *key)
key->note_id);
}

static int ndb_prefix_matches(struct ndb_text_search_result *result,
struct ndb_word *search_word)
{
// Empty strings shouldn't happen but let's
if (result->key.str_len < 2 || search_word->word_len < 2)
return 0;

// make sure we at least have two matching prefix characters. exact
// matches are nice but range searches allow us to match prefixes as
// well. A double-char prefix is suffient, but maybe we could up this
// in the future.
//
// TODO: How are we handling utf-8 prefix matches like
// japanese?
//
if ( result->key.str[0] != tolower(search_word->word[0])
&& result->key.str[1] != tolower(search_word->word[1])
)
return 0;

// count the number of prefix-matched characters. This will be used
// for ranking search results
result->prefix_chars = prefix_count(result->key.str,
result->key.str_len,
search_word->word,
search_word->word_len);

if (result->prefix_chars <= (int)((double)search_word->word_len / 1.5))
return 0;

return 1;
}

// This is called when scanning the full text search index. Scanning stops
// when we no longer have a prefix match for the word
Expand All @@ -2483,7 +2515,10 @@ static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
MDB_cursor_op order_op)
{
struct cursor key_cursor;
//struct ndb_text_search_key search_key;
MDB_val v;
int retries;
retries = -1;

make_cursor(k->mv_data, k->mv_data + k->mv_size, &key_cursor);

Expand All @@ -2504,6 +2539,15 @@ static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
}
}

retry:
retries++;
/*
printf("continuing from ");
if (ndb_unpack_text_search_key(k->mv_data, k->mv_size, &search_key)) {
ndb_print_text_search_key(&search_key);
} else { printf("??"); }
printf("\n");
*/

make_cursor(k->mv_data, k->mv_data + k->mv_size, &key_cursor);

Expand Down Expand Up @@ -2533,32 +2577,17 @@ static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
return 0;
}

// Empty strings shouldn't happen but let's
if (result->key.str_len < 2 || search_word->word_len < 2)
return 0;

// make sure we at least have two matching prefix characters. exact
// matches are nice but range searches allow us to match prefixes as
// well. A double-char prefix is suffient, but maybe we could up this
// in the future.
//
// TODO: How are we handling utf-8 prefix matches like
// japanese?
//
if ( result->key.str[0] != tolower(search_word->word[0])
&& result->key.str[1] != tolower(search_word->word[1])
)
return 0;

// count the number of prefix-matched characters. This will be used
// for ranking search results
result->prefix_chars = prefix_count(result->key.str,
result->key.str_len,
search_word->word,
search_word->word_len);

if (result->prefix_chars <= (int)((double)search_word->word_len / 1.5))
return 0;
if (!ndb_prefix_matches(result, search_word)) {
// we should only do this if we're going in reverse
if (retries == 0 && op == MDB_SET_RANGE && order_op == MDB_PREV) {
// if set range worked and our key exists, it should be
// the one right before this one
if (mdb_cursor_get(cursor, k, &v, MDB_PREV))
goto retry;
} else {
return 0;
}
}

// Unpack the remaining text search key, we will need this information
// when building up our search results.
Expand Down Expand Up @@ -4439,3 +4468,30 @@ void ndb_config_set_ingest_filter(struct ndb_config *config,
config->ingest_filter = fn;
config->filter_context = filter_ctx;
}

// used by ndb.c
int ndb_print_search_keys(struct ndb_txn *txn)
{
MDB_cursor *cur;
MDB_val k, v;
int i;
struct ndb_text_search_key search_key;

if (mdb_cursor_open(txn->mdb_txn, txn->lmdb->dbs[NDB_DB_NOTE_TEXT], &cur))
return 0;

i = 1;
while (mdb_cursor_get(cur, &k, &v, MDB_NEXT) == 0) {
if (!ndb_unpack_text_search_key(k.mv_data, k.mv_size, &search_key)) {
fprintf(stderr, "error decoding key %d\n", i);
continue;
}

ndb_print_text_search_key(&search_key);
printf("\n");

i++;
}

return 1;
}
3 changes: 3 additions & 0 deletions test.c
Original file line number Diff line number Diff line change
Expand Up @@ -961,6 +961,9 @@ static void test_fulltext()

ndb_begin_query(ndb, &txn);
ndb_text_search(&txn, "Jump Over", &results, NULL);
fprintf(stderr, "num results %d\n", results.num_results);
assert(results.num_results == 2);
assert(!strcmp(results.results[0].key.str, "jumped"));
ndb_end_query(&txn);

ndb_destroy(ndb);
Expand Down

0 comments on commit 26e8054

Please sign in to comment.