Skip to content

Commit

Permalink
feat: Update tantivy
Browse files Browse the repository at this point in the history
  • Loading branch information
ppodolsky committed Oct 15, 2023
1 parent 9cf78fd commit e9bba6e
Show file tree
Hide file tree
Showing 13 changed files with 82 additions and 86 deletions.
6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ serde_yaml = { version = "0.8" }
strfmt = "0.2"
summa-proto = { version = "0.34.0", path = "./summa-proto", default_features = false }
take_mut = "0.2"
tantivy = { package = "izihawa-tantivy", version = "0.20.2", default_features = false, features = ["quickwit", "zstd-compression"] }
tantivy-common = { package = "izihawa-tantivy-common", version = "0.5.0" }
tantivy-query-grammar = { package = "izihawa-tantivy-query-grammar", version = "0.20.0" }
tantivy = { package = "izihawa-tantivy", version = "0.21.0", default_features = false, features = ["quickwit", "zstd-compression"] }
tantivy-common = { package = "izihawa-tantivy-common", version = "0.6.0" }
tantivy-query-grammar = { package = "izihawa-tantivy-query-grammar", version = "0.21.0" }
thiserror = "1.0"
time = { version = "0.3", features = ["serde-well-known", "wasm-bindgen"] }
tokio = { version = "1.32", default_features = false }
Expand Down
6 changes: 5 additions & 1 deletion docs/core/query-dsl.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ Modifies scores produced by a nested query. Useful in `BooleanQuery` to penalize
```

## MatchQuery
`MatchQuery` is a special query. Summa takes the value of this query, parses it and produces other kind of queries.
`MatchQuery` is a special query. Summa takes the value written in SummaQL format, parses it, and produces a tree of queries that may be executed by the search engine.
`MatchQuery` may be used for parsing queries written in natural language. For example, the following query
```json
{
Expand Down Expand Up @@ -176,6 +176,10 @@ will be parsed into
}
```

SummaQL additionally supports the following query types:
- `RegexQuery`: `phone_number://7916.*//`
- `ExistsQuery`: `phone_number:*`

## PhraseQuery
Matches documents containing an exact occurrence of the phrase
```json
Expand Down
2 changes: 1 addition & 1 deletion summa-core/src/collectors/reservoir_sampling_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use tantivy::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer(3_000_000).unwrap();
/// let mut index_writer = index.writer(15_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap();
/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap();
/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap();
Expand Down
10 changes: 5 additions & 5 deletions summa-core/src/components/custom_serializer.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
use std::collections::{BTreeMap, HashSet};

use serde::{Serialize, Serializer};
use tantivy::schema::{Field, Schema};
use tantivy::Document;
use tantivy::schema::{Field, OwnedValue, Schema};
use tantivy::{Document, TantivyDocument};

/// `Value` is used for representing singular or multi-values of `tantivy::Document`
///
/// Required because Tantivy operates with multi-values only and Summa provides an abstraction of singular fields
pub enum Value<'a> {
SingleValue(Option<&'a tantivy::schema::Value>),
MultipleValue(Vec<&'a tantivy::schema::Value>),
SingleValue(Option<&'a OwnedValue>),
MultipleValue(Vec<&'a OwnedValue>),
}

/// Internal representation of a document used for JSON
Expand All @@ -22,7 +22,7 @@ pub enum Value<'a> {
pub struct NamedFieldDocument<'a>(pub BTreeMap<&'a str, Value<'a>>);

impl<'a> NamedFieldDocument<'a> {
pub fn from_document(schema: &'a Schema, fields: &Option<HashSet<Field>>, multi_fields: &HashSet<Field>, document: &'a Document) -> Self {
pub fn from_document(schema: &'a Schema, fields: &Option<HashSet<Field>>, multi_fields: &HashSet<Field>, document: &'a TantivyDocument) -> Self {
let mut field_map = BTreeMap::new();
for (field, field_values) in document.get_sorted_field_values() {
let field_name = schema.get_field_name(field);
Expand Down
24 changes: 12 additions & 12 deletions summa-core/src/components/index_holder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use tantivy::directory::OwnedBytes;
use tantivy::query::{EnableScoring, Query};
use tantivy::schema::{Field, Schema};
use tantivy::space_usage::SearcherSpaceUsage;
use tantivy::{Directory, Index, IndexBuilder, IndexReader, Opstamp, ReloadPolicy, Searcher};
use tantivy::{Directory, Index, IndexBuilder, IndexReader, Opstamp, ReloadPolicy, Searcher, TantivyDocument};
use tokio::sync::RwLock;
use tracing::{error, info, instrument, trace, warn};

Expand Down Expand Up @@ -582,7 +582,7 @@ impl IndexHolder {
&self,
searcher: &Searcher,
query_filter: &Option<proto::Query>,
documents_modifier: impl Fn(tantivy::schema::Document) -> Option<O> + Clone + Send + Sync + 'static,
documents_modifier: impl Fn(TantivyDocument) -> Option<O> + Clone + Send + Sync + 'static,
) -> SummaResult<tokio::sync::mpsc::Receiver<O>> {
match query_filter {
None | Some(proto::Query { query: None }) => {
Expand Down Expand Up @@ -623,7 +623,7 @@ impl IndexHolder {
&self,
searcher: &Searcher,
query: &proto::query::Query,
documents_modifier: impl Fn(tantivy::schema::Document) -> Option<O> + Clone + Send + Sync + 'static,
documents_modifier: impl Fn(TantivyDocument) -> Option<O> + Clone + Send + Sync + 'static,
) -> SummaResult<tokio::sync::mpsc::Receiver<O>> {
let parsed_query = self.query_parser.parse_query(query.clone())?;
let collector = tantivy::collector::DocSetCollector;
Expand Down Expand Up @@ -684,8 +684,8 @@ pub mod tests {
use summa_proto::proto::ConflictStrategy;
use tantivy::collector::{Count, TopDocs};
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::IndexRecordOption;
use tantivy::{doc, Document, IndexBuilder, Term};
use tantivy::schema::{IndexRecordOption, Value};
use tantivy::{doc, IndexBuilder, TantivyDocument, Term};

use crate::components::index_holder::register_default_tokenizers;
use crate::components::test_utils::{create_test_schema, generate_documents};
Expand All @@ -711,7 +711,7 @@ pub mod tests {
)?;
let mut last_document = None;
for document in generate_documents(&schema, 10000) {
let document: Document = document.bound_with(&schema).try_into()?;
let document: TantivyDocument = document.bound_with(&schema).try_into()?;
last_document = Some(document.clone());
index_writer_holder.index_document(document, ConflictStrategy::Merge)?;
}
Expand Down Expand Up @@ -802,11 +802,11 @@ pub mod tests {
.into_iter()
.map(|x| {
searcher
.doc(x.1)
.doc::<TantivyDocument>(x.1)
.unwrap()
.get_first(title_field)
.unwrap()
.as_text()
.as_str()
.map(|x| x.to_string())
.unwrap()
})
Expand All @@ -820,11 +820,11 @@ pub mod tests {
.into_iter()
.map(|x| {
searcher
.doc(x.1)
.doc::<TantivyDocument>(x.1)
.unwrap()
.get_first(title_field)
.unwrap()
.as_text()
.as_str()
.map(|x| x.to_string())
.unwrap()
})
Expand Down Expand Up @@ -873,11 +873,11 @@ pub mod tests {
.into_iter()
.map(|x| {
searcher
.doc(x.1)
.doc::<TantivyDocument>(x.1)
.unwrap()
.get_first(title_field)
.unwrap()
.as_text()
.as_str()
.map(|x| x.to_string())
.unwrap()
})
Expand Down
56 changes: 31 additions & 25 deletions summa-core/src/components/index_writer_holder.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use std::collections::hash_map::RandomState;
use std::collections::HashSet;
use std::collections::{BTreeMap, HashSet};
use std::path::Path;
use std::sync::{Arc, RwLock};

Expand All @@ -8,42 +8,42 @@ use summa_proto::proto;
use tantivy::json_utils::{convert_to_fast_value_and_get_term, JsonTermWriter};
use tantivy::merge_policy::MergePolicy;
use tantivy::query::Query;
use tantivy::schema::{Field, Value};
use tantivy::{Directory, Document, Index, IndexWriter, Opstamp, SegmentId, SegmentMeta, SingleSegmentIndexWriter, Term};
use tantivy::schema::{Field, NamedFieldDocument, OwnedValue, Value};
use tantivy::{Directory, Document, Index, IndexWriter, Opstamp, SegmentId, SegmentMeta, SingleSegmentIndexWriter, TantivyDocument, Term};
use tracing::info;

use super::SummaSegmentAttributes;
use crate::configs::core::WriterThreads;
use crate::errors::{SummaResult, ValidationError};
use crate::Error;

fn extract_flatten<T: AsRef<str>>(v: &serde_json::Value, parts: &[T], buffer: &mut Vec<String>) {
fn extract_flatten<T: AsRef<str>>(v: &OwnedValue, parts: &[T], buffer: &mut Vec<String>) {
let mut current = v;
for (i, part) in parts.iter().enumerate() {
match current {
serde_json::value::Value::Object(m) => {
OwnedValue::Object(m) => {
if let Some(next) = m.get(part.as_ref()) {
current = next
}
}
serde_json::value::Value::Array(a) => {
OwnedValue::Array(a) => {
for child in a {
extract_flatten(child, &parts[i..], buffer)
}
}
_ => break,
}
}
if let serde_json::Value::String(last_value) = &current {
if let OwnedValue::Str(last_value) = current {
buffer.push(last_value.to_string())
}
}

fn extract_flatten_from_map<T: AsRef<str>>(m: &serde_json::value::Map<String, serde_json::Value>, parts: &[T], buffer: &mut Vec<String>) {
fn extract_flatten_from_map<T: AsRef<str>>(m: &BTreeMap<String, OwnedValue>, parts: &[T], buffer: &mut Vec<String>) {
for (i, part) in parts.iter().enumerate() {
match m.get(part.as_ref()) {
Some(v) => match v {
serde_json::value::Value::Array(a) => {
OwnedValue::Array(a) => {
for child in a {
extract_flatten(child, &parts[i..], buffer)
}
Expand All @@ -55,20 +55,26 @@ fn extract_flatten_from_map<T: AsRef<str>>(m: &serde_json::value::Map<String, se
}
}

fn cast_to_term(unique_field: &Field, full_path: &str, value: &serde_json::Value) -> Vec<Term> {
fn cast_to_term(unique_field: &Field, full_path: &str, value: &OwnedValue) -> Vec<Term> {
let mut term = Term::with_capacity(128);
let mut json_term_writer = JsonTermWriter::from_field_and_json_path(*unique_field, full_path, true, &mut term);
match value {
serde_json::Value::Number(n) => {
OwnedValue::I64(n) => {
vec![convert_to_fast_value_and_get_term(&mut json_term_writer, &n.to_string()).expect("incorrect json type")]
}
serde_json::Value::String(s) => {
OwnedValue::U64(n) => {
vec![convert_to_fast_value_and_get_term(&mut json_term_writer, &n.to_string()).expect("incorrect json type")]
}
OwnedValue::F64(n) => {
vec![convert_to_fast_value_and_get_term(&mut json_term_writer, &n.to_string()).expect("incorrect json type")]
}
OwnedValue::Str(s) => {
let mut term = Term::with_capacity(128);
let mut json_term_writer = JsonTermWriter::from_field_and_json_path(*unique_field, full_path, true, &mut term);
json_term_writer.set_str(s);
vec![json_term_writer.term().clone()]
}
serde_json::Value::Array(v) => v.iter().flat_map(|e| cast_to_term(unique_field, full_path, e)).collect(),
OwnedValue::Array(v) => v.iter().flat_map(|e| cast_to_term(unique_field, full_path, e)).collect(),
_ => unreachable!(),
}
}
Expand Down Expand Up @@ -116,7 +122,7 @@ impl IndexWriterImpl {
}
}

pub fn add_document(&self, document: Document) -> SummaResult<()> {
pub fn add_document(&self, document: TantivyDocument) -> SummaResult<()> {
match self {
IndexWriterImpl::SameThread(writer) => {
writer.index_writer.write().expect("poisoned").add_document(document)?;
Expand Down Expand Up @@ -272,7 +278,7 @@ impl IndexWriterHolder {
}

/// Delete index by its unique fields
pub(super) fn resolve_conflicts(&self, document: &Document, conflict_strategy: proto::ConflictStrategy) -> SummaResult<Option<u64>> {
pub(super) fn resolve_conflicts(&self, document: &TantivyDocument, conflict_strategy: proto::ConflictStrategy) -> SummaResult<Option<u64>> {
if self.unique_fields.is_empty() || matches!(conflict_strategy, proto::ConflictStrategy::DoNothing) {
return Ok(None);
}
Expand All @@ -282,11 +288,11 @@ impl IndexWriterHolder {
.iter()
.filter_map(|(unique_field, full_path)| {
document.get_first(*unique_field).and_then(|value| match value {
Value::Str(s) => Some(Ok(vec![Term::from_field_text(*unique_field, s)])),
Value::JsonObject(i) => i.get(full_path).map(|value| Ok(cast_to_term(unique_field, full_path, value))),
Value::I64(i) => Some(Ok(vec![Term::from_field_i64(*unique_field, *i)])),
Value::U64(i) => Some(Ok(vec![Term::from_field_u64(*unique_field, *i)])),
Value::F64(i) => Some(Ok(vec![Term::from_field_f64(*unique_field, *i)])),
OwnedValue::Str(s) => Some(Ok(vec![Term::from_field_text(*unique_field, s)])),
OwnedValue::Object(i) => i.get(full_path).map(|value| Ok(cast_to_term(unique_field, full_path, value))),
OwnedValue::I64(i) => Some(Ok(vec![Term::from_field_i64(*unique_field, *i)])),
OwnedValue::U64(i) => Some(Ok(vec![Term::from_field_u64(*unique_field, *i)])),
OwnedValue::F64(i) => Some(Ok(vec![Term::from_field_f64(*unique_field, *i)])),
_ => {
let schema = self.index_writer.index().schema();
let field_type = schema.get_field_entry(*unique_field).field_type();
Expand All @@ -302,7 +308,7 @@ impl IndexWriterHolder {
if unique_terms.is_empty() {
Err(ValidationError::MissingUniqueField(format!(
"{:?}",
self.index_writer.index().schema().to_named_doc(document)
document.to_named_doc(&self.index_writer.index().schema()),
)))?
}

Expand Down Expand Up @@ -330,7 +336,7 @@ impl IndexWriterHolder {
}

#[inline]
fn process_dynamic_fields(&self, document: &mut Document) -> SummaResult<()> {
fn process_dynamic_fields(&self, document: &mut TantivyDocument) -> SummaResult<()> {
if let Some((extra_field, issued_at_field)) = self.extra_year_field {
if let Some(issued_at_value) = document.get_first(issued_at_field) {
if let Some(issued_at_value) = issued_at_value.as_i64() {
Expand All @@ -346,8 +352,8 @@ impl IndexWriterHolder {
for ((source_field, source_full_path), target_field) in &self.mapped_fields {
for value in document.get_all(*source_field) {
match value {
Value::Str(s) => buffer.push(s.to_string()),
Value::JsonObject(m) => {
OwnedValue::Str(s) => buffer.push(s.to_string()),
OwnedValue::Object(m) => {
extract_flatten_from_map(m, source_full_path, &mut buffer);
}
_ => unimplemented!(),
Expand All @@ -362,7 +368,7 @@ impl IndexWriterHolder {
}

/// Put document to the index. Before comes searchable it must be committed
pub fn index_document(&self, mut document: Document, conflict_strategy: proto::ConflictStrategy) -> SummaResult<()> {
pub fn index_document(&self, mut document: TantivyDocument, conflict_strategy: proto::ConflictStrategy) -> SummaResult<()> {
self.process_dynamic_fields(&mut document)?;
self.resolve_conflicts(&document, conflict_strategy)?;
self.index_writer.add_document(document)?;
Expand Down
2 changes: 1 addition & 1 deletion summa-core/src/components/queries/exists_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use tantivy_common::BitSet;
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// let mut index_writer = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// author => "Patrick Rothfuss"
Expand Down
6 changes: 2 additions & 4 deletions summa-core/src/components/query_parser/proto_query_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use tantivy::query::{
AllQuery, BooleanQuery, BoostQuery, DisjunctionMaxQuery, EmptyQuery, MoreLikeThisQuery, Occur, PhraseQuery, Query, RangeQuery, RegexQuery, TermQuery,
};
use tantivy::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema};
use tantivy::{Index, Score, Term};
use tantivy::{Document, Index, Score, TantivyDocument, Term};
use tracing::info;

use crate::components::queries::ExistsQuery;
Expand Down Expand Up @@ -207,9 +207,7 @@ impl ProtoQueryParser {
))
}
proto::query::Query::MoreLikeThis(more_like_this_query_proto) => {
let document = self
.cached_schema
.parse_document(&more_like_this_query_proto.document)
let document = TantivyDocument::parse_json(&self.cached_schema, &more_like_this_query_proto.document)
.map_err(|_e| Error::InvalidSyntax("bad document".to_owned()))?;
let field_values = document
.get_sorted_field_values()
Expand Down
Loading

0 comments on commit e9bba6e

Please sign in to comment.