bugfix: file index deletion issue (#418)
* adding some chunking to delete_documents_by_uri

* adding more logging

* cargo fmt
a5huynh authored Apr 3, 2023
1 parent b304763 commit 389c243
Showing 2 changed files with 37 additions and 24 deletions.
crates/entities/src/models/indexed_document.rs
2 changes: 1 addition & 1 deletion

@@ -344,7 +344,7 @@ pub async fn delete_many_by_id(
 /// delete all related tag references before deleting the documents
 pub async fn delete_many_by_url(
     db: &DatabaseConnection,
-    urls: Vec<String>,
+    urls: &[String],
 ) -> Result<u64, sea_orm::DbErr> {
     let mut num_deleted = 0;
     for chunk in urls.chunks(BATCH_SIZE) {
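The rest of delete_many_by_url is collapsed in the diff above; only the signature change from Vec<String> to &[String] is shown. For readers following along, here is a minimal sketch of how a chunked delete like this could look with SeaORM — an illustration under assumptions, not the crate's actual body (per the doc comment, the real function also deletes related tag references before removing the documents, which is omitted here):

    // Hypothetical sketch of the collapsed function body. Assumes the
    // surrounding module defines Entity/Column for indexed_document and
    // a BATCH_SIZE constant, as the visible context lines suggest.
    pub async fn delete_many_by_url(
        db: &DatabaseConnection,
        urls: &[String],
    ) -> Result<u64, sea_orm::DbErr> {
        let mut num_deleted = 0;
        for chunk in urls.chunks(BATCH_SIZE) {
            // Each chunk becomes one bounded `DELETE ... WHERE url IN (...)`.
            let result = Entity::delete_many()
                .filter(Column::Url.is_in(chunk.to_vec()))
                .exec(db)
                .await?;
            num_deleted += result.rows_affected;
        }
        Ok(num_deleted)
    }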
crates/spyglass/src/documents/mod.rs
59 changes: 36 additions & 23 deletions

@@ -6,6 +6,7 @@ use entities::{
         tag::{self, TagPair},
     },
     sea_orm::{ActiveModelTrait, DatabaseConnection},
+    BATCH_SIZE,
 };
 use shared::config::LensConfig;
 use spyglass_plugin::TagModification;

@@ -26,39 +27,51 @@ use entities::sea_orm::{ColumnTrait, EntityTrait, QueryFilter, Set, TransactionTrait};
 /// Helper method to delete indexed documents, crawl queue items and search
 /// documents by url
 pub async fn delete_documents_by_uri(state: &AppState, uri: Vec<String>) {
-    log::debug!("Deleting {:?} documents", uri.len());
+    log::info!("Deleting {} documents", uri.len());
 
     // Delete from crawl queue
 
     if let Err(error) = crawl_queue::delete_many_by_url(&state.db, &uri).await {
-        log::error!("Error delete items from crawl queue {:?}", error);
+        log::warn!("Unable to delete from crawl_queue: {:?}", error);
     }
 
-    // find all documents that already exist with that url
-    let existing: Vec<indexed_document::Model> = indexed_document::Entity::find()
-        .filter(indexed_document::Column::Url.is_in(uri.clone()))
-        .all(&state.db)
-        .await
-        .unwrap_or_default();
+    for chunk in uri.chunks(BATCH_SIZE) {
+        let existing: Vec<indexed_document::Model> = indexed_document::Entity::find()
+            .filter(indexed_document::Column::Url.is_in(chunk.to_vec()))
+            .all(&state.db)
+            .await
+            .unwrap_or_default();
 
-    // build a hash map of Url to the doc id
-    let mut id_map = HashMap::new();
-    for model in &existing {
-        id_map.insert(model.url.to_string(), model.doc_id.clone());
-    }
+        // build a hash map of Url to the doc id
+        let mut id_map = HashMap::new();
+        for model in &existing {
+            id_map.insert(model.url.to_string(), model.doc_id.clone());
+        }
 
-    // build a list of doc ids to delete from the index
-    let doc_id_list = id_map
-        .values()
-        .map(|x| x.to_owned())
-        .collect::<Vec<String>>();
+        // build a list of doc ids to delete from the index
+        let doc_id_list = id_map
+            .values()
+            .map(|x| x.to_owned())
+            .collect::<Vec<String>>();
 
-    let _ = Searcher::delete_many_by_id(state, &doc_id_list, false).await;
-    let _ = Searcher::save(state).await;
+        if let Err(err) = Searcher::delete_many_by_id(state, &doc_id_list, false).await {
+            log::warn!("Unable to delete_many_by_id: {err}")
+        }
 
-    // now that the documents are deleted delete from the queue
-    if let Err(error) = indexed_document::delete_many_by_url(&state.db, uri).await {
-        log::error!("Error deleting for indexed document store {:?}", error);
+        if let Err(err) = Searcher::save(state).await {
+            log::warn!("Unable to save searcher: {err}")
+        }
+
+        // now that the documents are deleted delete from the queue
+        if let Err(error) = indexed_document::delete_many_by_url(&state.db, chunk).await {
+            log::warn!("Error deleting for indexed document store {:?}", error);
+        }
+
+        log::info!(
+            "chunk: deleted {} ({}) docs from index",
+            chunk.len(),
+            existing.len()
+        );
     }
 }
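Why the chunking matters: passing every URL into a single is_in filter produces one giant SQL `IN (...)` list, which can exceed SQLite's bound-parameter limit on large deletions — a plausible cause of the file index deletion issue this commit fixes. A small standalone illustration of the slice-chunking pattern used above (the BATCH_SIZE value here is a placeholder for this example, not the entities crate's actual constant):

    // Iterate a Vec<String> in fixed-size batches, as
    // delete_documents_by_uri now does.
    const BATCH_SIZE: usize = 500;

    fn main() {
        let urls: Vec<String> = (0..1234)
            .map(|i| format!("file:///tmp/doc-{i}.txt"))
            .collect();

        for (n, chunk) in urls.chunks(BATCH_SIZE).enumerate() {
            // Each chunk keeps the generated SQL `IN (...)` list bounded.
            println!("batch {n}: {} urls", chunk.len());
        }
    }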

