Skip to content

Commit

Permalink
feat: Update tantivy
Browse files Browse the repository at this point in the history
  • Loading branch information
ppodolsky committed Oct 15, 2023
1 parent 9cf78fd commit e9bba6e
Show file tree
Hide file tree
Showing 13 changed files with 82 additions and 86 deletions.
6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ serde_yaml = { version = "0.8" }
strfmt = "0.2"
summa-proto = { version = "0.34.0", path = "./summa-proto", default_features = false }
take_mut = "0.2"
tantivy = { package = "izihawa-tantivy", version = "0.20.2", default_features = false, features = ["quickwit", "zstd-compression"] }
tantivy-common = { package = "izihawa-tantivy-common", version = "0.5.0" }
tantivy-query-grammar = { package = "izihawa-tantivy-query-grammar", version = "0.20.0" }
tantivy = { package = "izihawa-tantivy", version = "0.21.0", default_features = false, features = ["quickwit", "zstd-compression"] }
tantivy-common = { package = "izihawa-tantivy-common", version = "0.6.0" }
tantivy-query-grammar = { package = "izihawa-tantivy-query-grammar", version = "0.21.0" }
thiserror = "1.0"
time = { version = "0.3", features = ["serde-well-known", "wasm-bindgen"] }
tokio = { version = "1.32", default_features = false }
Expand Down
6 changes: 5 additions & 1 deletion docs/core/query-dsl.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ Modifies scores produced by a nested query. Useful in `BooleanQuery` to penalize
```

## MatchQuery
`MatchQuery` is a special query. Summa takes the value of this query, parses it and produces other kind of queries.
`MatchQuery` is a special query. Summa takes the value written in SummaQL format, parses it, and produces a tree of queries that may be executed by the search engine.
`MatchQuery` may be used for parsing queries written in natural language. For example, the following query
```json
{
Expand Down Expand Up @@ -176,6 +176,10 @@ will be parsed into
}
```

SummaQL additionally supports the following query types:
- `RegexQuery`: `phone_number://7916.*//`
- `ExistsQuery`: `phone_number:*`

## PhraseQuery
Matches documents containing an exact occurrence of the phrase
```json
Expand Down
2 changes: 1 addition & 1 deletion summa-core/src/collectors/reservoir_sampling_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use tantivy::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer(3_000_000).unwrap();
/// let mut index_writer = index.writer(15_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap();
/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap();
/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap();
Expand Down
10 changes: 5 additions & 5 deletions summa-core/src/components/custom_serializer.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
use std::collections::{BTreeMap, HashSet};

use serde::{Serialize, Serializer};
use tantivy::schema::{Field, Schema};
use tantivy::Document;
use tantivy::schema::{Field, OwnedValue, Schema};
use tantivy::{Document, TantivyDocument};

/// `Value` is used for representing singular or multi-values of `tantivy::Document`
///
/// Required because Tantivy operates with multi-values only and Summa provides an abstraction of singular fields
pub enum Value<'a> {
SingleValue(Option<&'a tantivy::schema::Value>),
MultipleValue(Vec<&'a tantivy::schema::Value>),
SingleValue(Option<&'a OwnedValue>),
MultipleValue(Vec<&'a OwnedValue>),
}

/// Internal representation of a document used for JSON
Expand All @@ -22,7 +22,7 @@ pub enum Value<'a> {
pub struct NamedFieldDocument<'a>(pub BTreeMap<&'a str, Value<'a>>);

impl<'a> NamedFieldDocument<'a> {
pub fn from_document(schema: &'a Schema, fields: &Option<HashSet<Field>>, multi_fields: &HashSet<Field>, document: &'a Document) -> Self {
pub fn from_document(schema: &'a Schema, fields: &Option<HashSet<Field>>, multi_fields: &HashSet<Field>, document: &'a TantivyDocument) -> Self {
let mut field_map = BTreeMap::new();
for (field, field_values) in document.get_sorted_field_values() {
let field_name = schema.get_field_name(field);
Expand Down
24 changes: 12 additions & 12 deletions summa-core/src/components/index_holder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use tantivy::directory::OwnedBytes;
use tantivy::query::{EnableScoring, Query};
use tantivy::schema::{Field, Schema};
use tantivy::space_usage::SearcherSpaceUsage;
use tantivy::{Directory, Index, IndexBuilder, IndexReader, Opstamp, ReloadPolicy, Searcher};
use tantivy::{Directory, Index, IndexBuilder, IndexReader, Opstamp, ReloadPolicy, Searcher, TantivyDocument};
use tokio::sync::RwLock;
use tracing::{error, info, instrument, trace, warn};

Expand Down Expand Up @@ -582,7 +582,7 @@ impl IndexHolder {
&self,
searcher: &Searcher,
query_filter: &Option<proto::Query>,
documents_modifier: impl Fn(tantivy::schema::Document) -> Option<O> + Clone + Send + Sync + 'static,
documents_modifier: impl Fn(TantivyDocument) -> Option<O> + Clone + Send + Sync + 'static,
) -> SummaResult<tokio::sync::mpsc::Receiver<O>> {
match query_filter {
None | Some(proto::Query { query: None }) => {
Expand Down Expand Up @@ -623,7 +623,7 @@ impl IndexHolder {
&self,
searcher: &Searcher,
query: &proto::query::Query,
documents_modifier: impl Fn(tantivy::schema::Document) -> Option<O> + Clone + Send + Sync + 'static,
documents_modifier: impl Fn(TantivyDocument) -> Option<O> + Clone + Send + Sync + 'static,
) -> SummaResult<tokio::sync::mpsc::Receiver<O>> {
let parsed_query = self.query_parser.parse_query(query.clone())?;
let collector = tantivy::collector::DocSetCollector;
Expand Down Expand Up @@ -684,8 +684,8 @@ pub mod tests {
use summa_proto::proto::ConflictStrategy;
use tantivy::collector::{Count, TopDocs};
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::IndexRecordOption;
use tantivy::{doc, Document, IndexBuilder, Term};
use tantivy::schema::{IndexRecordOption, Value};
use tantivy::{doc, IndexBuilder, TantivyDocument, Term};

use crate::components::index_holder::register_default_tokenizers;
use crate::components::test_utils::{create_test_schema, generate_documents};
Expand All @@ -711,7 +711,7 @@ pub mod tests {
)?;
let mut last_document = None;
for document in generate_documents(&schema, 10000) {
let document: Document = document.bound_with(&schema).try_into()?;
let document: TantivyDocument = document.bound_with(&schema).try_into()?;
last_document = Some(document.clone());
index_writer_holder.index_document(document, ConflictStrategy::Merge)?;
}
Expand Down Expand Up @@ -802,11 +802,11 @@ pub mod tests {
.into_iter()
.map(|x| {
searcher
.doc(x.1)
.doc::<TantivyDocument>(x.1)
.unwrap()
.get_first(title_field)
.unwrap()
.as_text()
.as_str()
.map(|x| x.to_string())
.unwrap()
})
Expand All @@ -820,11 +820,11 @@ pub mod tests {
.into_iter()
.map(|x| {
searcher
.doc(x.1)
.doc::<TantivyDocument>(x.1)
.unwrap()
.get_first(title_field)
.unwrap()
.as_text()
.as_str()
.map(|x| x.to_string())
.unwrap()
})
Expand Down Expand Up @@ -873,11 +873,11 @@ pub mod tests {
.into_iter()
.map(|x| {
searcher
.doc(x.1)
.doc::<TantivyDocument>(x.1)
.unwrap()
.get_first(title_field)
.unwrap()
.as_text()
.as_str()
.map(|x| x.to_string())
.unwrap()
})
Expand Down
56 changes: 31 additions & 25 deletions summa-core/src/components/index_writer_holder.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use std::collections::hash_map::RandomState;
use std::collections::HashSet;
use std::collections::{BTreeMap, HashSet};
use std::path::Path;
use std::sync::{Arc, RwLock};

Expand All @@ -8,42 +8,42 @@ use summa_proto::proto;
use tantivy::json_utils::{convert_to_fast_value_and_get_term, JsonTermWriter};
use tantivy::merge_policy::MergePolicy;
use tantivy::query::Query;
use tantivy::schema::{Field, Value};
use tantivy::{Directory, Document, Index, IndexWriter, Opstamp, SegmentId, SegmentMeta, SingleSegmentIndexWriter, Term};
use tantivy::schema::{Field, NamedFieldDocument, OwnedValue, Value};
use tantivy::{Directory, Document, Index, IndexWriter, Opstamp, SegmentId, SegmentMeta, SingleSegmentIndexWriter, TantivyDocument, Term};
use tracing::info;

use super::SummaSegmentAttributes;
use crate::configs::core::WriterThreads;
use crate::errors::{SummaResult, ValidationError};
use crate::Error;

fn extract_flatten<T: AsRef<str>>(v: &serde_json::Value, parts: &[T], buffer: &mut Vec<String>) {
fn extract_flatten<T: AsRef<str>>(v: &OwnedValue, parts: &[T], buffer: &mut Vec<String>) {
let mut current = v;
for (i, part) in parts.iter().enumerate() {
match current {
serde_json::value::Value::Object(m) => {
OwnedValue::Object(m) => {
if let Some(next) = m.get(part.as_ref()) {
current = next
}
}
serde_json::value::Value::Array(a) => {
OwnedValue::Array(a) => {
for child in a {
extract_flatten(child, &parts[i..], buffer)
}
}
_ => break,
}
}
if let serde_json::Value::String(last_value) = &current {
if let OwnedValue::Str(last_value) = current {
buffer.push(last_value.to_string())
}
}

fn extract_flatten_from_map<T: AsRef<str>>(m: &serde_json::value::Map<String, serde_json::Value>, parts: &[T], buffer: &mut Vec<String>) {
fn extract_flatten_from_map<T: AsRef<str>>(m: &BTreeMap<String, OwnedValue>, parts: &[T], buffer: &mut Vec<String>) {
for (i, part) in parts.iter().enumerate() {
match m.get(part.as_ref()) {
Some(v) => match v {
serde_json::value::Value::Array(a) => {
OwnedValue::Array(a) => {
for child in a {
extract_flatten(child, &parts[i..], buffer)
}
Expand All @@ -55,20 +55,26 @@ fn extract_flatten_from_map<T: AsRef<str>>(m: &serde_json::value::Map<String, se
}
}

fn cast_to_term(unique_field: &Field, full_path: &str, value: &serde_json::Value) -> Vec<Term> {
fn cast_to_term(unique_field: &Field, full_path: &str, value: &OwnedValue) -> Vec<Term> {
let mut term = Term::with_capacity(128);
let mut json_term_writer = JsonTermWriter::from_field_and_json_path(*unique_field, full_path, true, &mut term);
match value {
serde_json::Value::Number(n) => {
OwnedValue::I64(n) => {
vec![convert_to_fast_value_and_get_term(&mut json_term_writer, &n.to_string()).expect("incorrect json type")]
}
serde_json::Value::String(s) => {
OwnedValue::U64(n) => {
vec![convert_to_fast_value_and_get_term(&mut json_term_writer, &n.to_string()).expect("incorrect json type")]
}
OwnedValue::F64(n) => {
vec![convert_to_fast_value_and_get_term(&mut json_term_writer, &n.to_string()).expect("incorrect json type")]
}
OwnedValue::Str(s) => {
let mut term = Term::with_capacity(128);
let mut json_term_writer = JsonTermWriter::from_field_and_json_path(*unique_field, full_path, true, &mut term);
json_term_writer.set_str(s);
vec![json_term_writer.term().clone()]
}
serde_json::Value::Array(v) => v.iter().flat_map(|e| cast_to_term(unique_field, full_path, e)).collect(),
OwnedValue::Array(v) => v.iter().flat_map(|e| cast_to_term(unique_field, full_path, e)).collect(),
_ => unreachable!(),
}
}
Expand Down Expand Up @@ -116,7 +122,7 @@ impl IndexWriterImpl {
}
}

pub fn add_document(&self, document: Document) -> SummaResult<()> {
pub fn add_document(&self, document: TantivyDocument) -> SummaResult<()> {
match self {
IndexWriterImpl::SameThread(writer) => {
writer.index_writer.write().expect("poisoned").add_document(document)?;
Expand Down Expand Up @@ -272,7 +278,7 @@ impl IndexWriterHolder {
}

/// Delete index by its unique fields
pub(super) fn resolve_conflicts(&self, document: &Document, conflict_strategy: proto::ConflictStrategy) -> SummaResult<Option<u64>> {
pub(super) fn resolve_conflicts(&self, document: &TantivyDocument, conflict_strategy: proto::ConflictStrategy) -> SummaResult<Option<u64>> {
if self.unique_fields.is_empty() || matches!(conflict_strategy, proto::ConflictStrategy::DoNothing) {
return Ok(None);
}
Expand All @@ -282,11 +288,11 @@ impl IndexWriterHolder {
.iter()
.filter_map(|(unique_field, full_path)| {
document.get_first(*unique_field).and_then(|value| match value {
Value::Str(s) => Some(Ok(vec![Term::from_field_text(*unique_field, s)])),
Value::JsonObject(i) => i.get(full_path).map(|value| Ok(cast_to_term(unique_field, full_path, value))),
Value::I64(i) => Some(Ok(vec![Term::from_field_i64(*unique_field, *i)])),
Value::U64(i) => Some(Ok(vec![Term::from_field_u64(*unique_field, *i)])),
Value::F64(i) => Some(Ok(vec![Term::from_field_f64(*unique_field, *i)])),
OwnedValue::Str(s) => Some(Ok(vec![Term::from_field_text(*unique_field, s)])),
OwnedValue::Object(i) => i.get(full_path).map(|value| Ok(cast_to_term(unique_field, full_path, value))),
OwnedValue::I64(i) => Some(Ok(vec![Term::from_field_i64(*unique_field, *i)])),
OwnedValue::U64(i) => Some(Ok(vec![Term::from_field_u64(*unique_field, *i)])),
OwnedValue::F64(i) => Some(Ok(vec![Term::from_field_f64(*unique_field, *i)])),
_ => {
let schema = self.index_writer.index().schema();
let field_type = schema.get_field_entry(*unique_field).field_type();
Expand All @@ -302,7 +308,7 @@ impl IndexWriterHolder {
if unique_terms.is_empty() {
Err(ValidationError::MissingUniqueField(format!(
"{:?}",
self.index_writer.index().schema().to_named_doc(document)
document.to_named_doc(&self.index_writer.index().schema()),
)))?
}

Expand Down Expand Up @@ -330,7 +336,7 @@ impl IndexWriterHolder {
}

#[inline]
fn process_dynamic_fields(&self, document: &mut Document) -> SummaResult<()> {
fn process_dynamic_fields(&self, document: &mut TantivyDocument) -> SummaResult<()> {
if let Some((extra_field, issued_at_field)) = self.extra_year_field {
if let Some(issued_at_value) = document.get_first(issued_at_field) {
if let Some(issued_at_value) = issued_at_value.as_i64() {
Expand All @@ -346,8 +352,8 @@ impl IndexWriterHolder {
for ((source_field, source_full_path), target_field) in &self.mapped_fields {
for value in document.get_all(*source_field) {
match value {
Value::Str(s) => buffer.push(s.to_string()),
Value::JsonObject(m) => {
OwnedValue::Str(s) => buffer.push(s.to_string()),
OwnedValue::Object(m) => {
extract_flatten_from_map(m, source_full_path, &mut buffer);
}
_ => unimplemented!(),
Expand All @@ -362,7 +368,7 @@ impl IndexWriterHolder {
}

/// Put document to the index. Before comes searchable it must be committed
pub fn index_document(&self, mut document: Document, conflict_strategy: proto::ConflictStrategy) -> SummaResult<()> {
pub fn index_document(&self, mut document: TantivyDocument, conflict_strategy: proto::ConflictStrategy) -> SummaResult<()> {
self.process_dynamic_fields(&mut document)?;
self.resolve_conflicts(&document, conflict_strategy)?;
self.index_writer.add_document(document)?;
Expand Down
2 changes: 1 addition & 1 deletion summa-core/src/components/queries/exists_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use tantivy_common::BitSet;
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// let mut index_writer = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// author => "Patrick Rothfuss"
Expand Down
6 changes: 2 additions & 4 deletions summa-core/src/components/query_parser/proto_query_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use tantivy::query::{
AllQuery, BooleanQuery, BoostQuery, DisjunctionMaxQuery, EmptyQuery, MoreLikeThisQuery, Occur, PhraseQuery, Query, RangeQuery, RegexQuery, TermQuery,
};
use tantivy::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema};
use tantivy::{Index, Score, Term};
use tantivy::{Document, Index, Score, TantivyDocument, Term};
use tracing::info;

use crate::components::queries::ExistsQuery;
Expand Down Expand Up @@ -207,9 +207,7 @@ impl ProtoQueryParser {
))
}
proto::query::Query::MoreLikeThis(more_like_this_query_proto) => {
let document = self
.cached_schema
.parse_document(&more_like_this_query_proto.document)
let document = TantivyDocument::parse_json(&self.cached_schema, &more_like_this_query_proto.document)
.map_err(|_e| Error::InvalidSyntax("bad document".to_owned()))?;
let field_values = document
.get_sorted_field_values()
Expand Down
Loading

0 comments on commit e9bba6e

Please sign in to comment.