From 389c243289ae1a2b7be862214b6c413f3a15d4d9 Mon Sep 17 00:00:00 2001 From: Andrew Huynh Date: Mon, 3 Apr 2023 13:45:24 -0700 Subject: [PATCH] bugfix: file index deletion issue (#418) * adding some chunking to delete_documents_by_uri * adding more logging * cargo fmt --- .../entities/src/models/indexed_document.rs | 2 +- crates/spyglass/src/documents/mod.rs | 59 +++++++++++-------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/crates/entities/src/models/indexed_document.rs b/crates/entities/src/models/indexed_document.rs index 24ae94554..49d85418c 100644 --- a/crates/entities/src/models/indexed_document.rs +++ b/crates/entities/src/models/indexed_document.rs @@ -344,7 +344,7 @@ pub async fn delete_many_by_id( /// delete all related tag references before deleting the documents pub async fn delete_many_by_url( db: &DatabaseConnection, - urls: Vec, + urls: &[String], ) -> Result { let mut num_deleted = 0; for chunk in urls.chunks(BATCH_SIZE) { diff --git a/crates/spyglass/src/documents/mod.rs b/crates/spyglass/src/documents/mod.rs index ad04102fe..02461e198 100644 --- a/crates/spyglass/src/documents/mod.rs +++ b/crates/spyglass/src/documents/mod.rs @@ -6,6 +6,7 @@ use entities::{ tag::{self, TagPair}, }, sea_orm::{ActiveModelTrait, DatabaseConnection}, + BATCH_SIZE, }; use shared::config::LensConfig; use spyglass_plugin::TagModification; @@ -26,39 +27,51 @@ use entities::sea_orm::{ColumnTrait, EntityTrait, QueryFilter, Set, TransactionT /// Helper method to delete indexed documents, crawl queue items and search /// documents by url pub async fn delete_documents_by_uri(state: &AppState, uri: Vec) { - log::debug!("Deleting {:?} documents", uri.len()); + log::info!("Deleting {} documents", uri.len()); // Delete from crawl queue - if let Err(error) = crawl_queue::delete_many_by_url(&state.db, &uri).await { - log::error!("Error delete items from crawl queue {:?}", error); + log::warn!("Unable to delete from crawl_queue: {:?}", error); } // find all documents that already exist with that url - let existing: Vec = indexed_document::Entity::find() - .filter(indexed_document::Column::Url.is_in(uri.clone())) - .all(&state.db) - .await - .unwrap_or_default(); + for chunk in uri.chunks(BATCH_SIZE) { + let existing: Vec = indexed_document::Entity::find() + .filter(indexed_document::Column::Url.is_in(chunk.to_vec())) + .all(&state.db) + .await + .unwrap_or_default(); + + // build a hash map of Url to the doc id + let mut id_map = HashMap::new(); + for model in &existing { + id_map.insert(model.url.to_string(), model.doc_id.clone()); + } - // build a hash map of Url to the doc id - let mut id_map = HashMap::new(); - for model in &existing { - id_map.insert(model.url.to_string(), model.doc_id.clone()); - } + // build a list of doc ids to delete from the index + let doc_id_list = id_map + .values() + .map(|x| x.to_owned()) + .collect::>(); - // build a list of doc ids to delete from the index - let doc_id_list = id_map - .values() - .map(|x| x.to_owned()) - .collect::>(); + if let Err(err) = Searcher::delete_many_by_id(state, &doc_id_list, false).await { + log::warn!("Unable to delete_many_by_id: {err}") + } - let _ = Searcher::delete_many_by_id(state, &doc_id_list, false).await; - let _ = Searcher::save(state).await; + if let Err(err) = Searcher::save(state).await { + log::warn!("Unable to save searcher: {err}") + } - // now that the documents are deleted delete from the queue - if let Err(error) = indexed_document::delete_many_by_url(&state.db, uri).await { - log::error!("Error deleting for indexed document store {:?}", error); + // now that the documents are deleted delete from the queue + if let Err(error) = indexed_document::delete_many_by_url(&state.db, chunk).await { + log::warn!("Error deleting for indexed document store {:?}", error); + } + + log::info!( + "chunk: deleted {} ({}) docs from index", + chunk.len(), + existing.len() + ); } }