v2022.05.15 Release (#48)
* Bump default domain limit to 10k pages

* Update default lens to pull less data (all of wikipedia was a little much)

* Cleaning up README & updating examples

* Update README.md

Fix centering

* Gracefully handle crawler crash (#42)

* Log errors if we can't dequeue

* Log sidecar termination reasons & attempt to restart sidecar after IPC
comms are lost

* cargo fmt

* Add setting to configure data directory (#43)

* Add setting to configure data directory

* Add documentation for new setting

* Adding build links to top of README

* Add setting & functionality to only crawl URLs specified by lens rules (#44)

* Add setting/functionality to only crawl URLs specified by lens rules

* Add documentation for new setting

* Revert default crawl state on startup

* Clarify blocklist documentation

* Index poison error (#45)

* No need to commit on every `add_document` call

* Fix tests

* Spawn a task to continually commit added documents to the index (sketched below)

* v2022.5.15 Release (#47)
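
One of the more interesting changes above: rather than committing the index on every `add_document` call, a background task now batches commits. Below is a minimal sketch of that pattern, assuming a tantivy `IndexWriter` behind a mutex; the names and the commit interval are illustrative, not the actual Spyglass internals.

``` rust
use std::sync::{Arc, Mutex};
use std::time::Duration;

use tantivy::IndexWriter;

// Hypothetical sketch: periodically commit whatever documents have been
// added since the last tick, instead of committing on every add_document.
fn spawn_commit_task(writer: Arc<Mutex<IndexWriter>>) -> tokio::task::JoinHandle<()> {
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(Duration::from_secs(10));
        loop {
            interval.tick().await;
            if let Ok(mut w) = writer.lock() {
                if let Err(err) = w.commit() {
                    // A failed commit is logged; the next tick retries.
                    log::error!("index commit failed: {}", err);
                }
            }
        }
    })
}
```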
a5huynh authored May 16, 2022
1 parent b936792 commit f25cf57
Showing 27 changed files with 607 additions and 621 deletions.
645 changes: 191 additions & 454 deletions Cargo.lock

Large diffs are not rendered by default.

49 changes: 33 additions & 16 deletions README.md
@@ -3,11 +3,13 @@

# Spyglass

> tl; dr: Spyglass is a search platform that lives on your device, indexing what
> you want, exposing it to you in a super simple & fast interface.
## tl; dr; Spyglass indexes what you want, exposing it to you in a super simple & fast interface

> ⚠️ Spyglass is very much in its early stages, but it’s in a place where it's functional
> and can be used to replace basic searches. ⚠️
⚠️ Spyglass is very much in its early stages, but it’s in a place where it's functional and can be used to replace basic searches. ⚠️

Download now: [Mac](https://github.com/a5huynh/spyglass/releases/download/v2022.5.11/Spyglass_22.5.11_x64.dmg) | [Windows](https://github.com/a5huynh/spyglass/releases/download/v2022.5.11/Spyglass_22.5.11_x64_en-US.msi) | [Linux (AppImage)](https://github.com/a5huynh/spyglass/releases/download/v2022.5.11/spyglass_22.5.11_amd64.AppImage)

---

## Table of Contents

@@ -20,6 +22,7 @@
* [Settings](#settings)
* [Updating the shortcut](#updating-the-shortcut)

---

## Installation

@@ -35,11 +38,15 @@ make build-release

## Spyglass in action

Once launched, press **`Cmd + Shift + /`** to open Spyglass. Queries prefixed with `/`
will search through your installed lenses, otherwise it'll search through your index.
Once launched, press **`Cmd + Shift + /`** to open Spyglass. If the app has been
successfully launched, you'll see a little menubar icon like the following:

![Menubar icon and menu](docs/menubar-menu.png)


Use the arrow keys to select the result you want and hit `Enter` to open the link in the
browser of your choice!
Queries prefixed with `/` will search through your installed lenses, otherwise it'll
search through your index. Use the arrow keys to select the result you want and hit
`Enter` to open the link in the browser of your choice!

[![Spyglass in action!](docs/spyglass-poc.gif)](https://www.youtube.com/embed/OzNrxtM3s_8)

@@ -72,6 +79,8 @@ curated set of websites with high quality recipes.
``` rust
(
version: "1",
// Be proud of your creation :). Maybe soon we can share these ;)
author: "Andrew Huynh",
name: "recipes",
description: Some(r#"
A curated collection of websites with useful, high-quality recipes.
@@ -88,9 +97,13 @@ curated set of websites with high quality recipes.
"www.vickypham.com",
],

// Not yet supported but ideally more ways to filter URLs within a domain
urls: [
"www.reddit.com/r/recipes/*",
// URLs are considered prefixed, i.e. anything that starts w/ the following
// will be matched and crawled.
//
// https://www.reddit.com/r/recipes/ -> matches
    // https://www.reddit.com/r/recipes_not/ -> does not match; note the trailing slash.
"https://www.reddit.com/r/recipes/",
]
)
```
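
To make the prefix rule concrete, here is a tiny standalone sketch of how a URL rule could be turned into an anchored regex. The `regex_for_prefix` helper in `crates/entities/src/regex.rs` presumably does something along these lines, but its implementation isn't shown in this diff.

``` rust
use regex::Regex;

fn main() {
    // Hypothetical stand-in for regex_for_prefix: escape the rule and
    // anchor it at the start of the URL so it behaves as a prefix match.
    let rule = "https://www.reddit.com/r/recipes/";
    let re = Regex::new(&format!("^{}", regex::escape(rule))).unwrap();

    assert!(re.is_match("https://www.reddit.com/r/recipes/comments/abc"));
    assert!(!re.is_match("https://www.reddit.com/r/recipes_not/"));
}
```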
@@ -105,6 +118,7 @@ programming language and not the Rust game / The Rust Belt / oxidation / etc.
``` rust
(
version: "1",
author: "Andrew Huynh",
name: "rustlang",
description: Some("Rustlang targeted websites"),
domains: [
@@ -117,12 +131,9 @@ programming language and not the Rust game / The Rust Belt / oxidation / etc.
...
],

// Again not yet supported but an example of indexing specific communities that
// are relevant to the topic
urls: [
"www.reddit.com/r/rust",
"www.reddit.com/r/rust_gamedev",
"https://github.com/topics/rust"
"https://www.reddit.com/r/rust/",
"https://www.reddit.com/r/rust_gamedev/",
]
)
```
@@ -145,13 +156,19 @@ file found in their directory on startup, a default one will be created.
run_wizard: false,
// Not used... yet!
allow_list: [],
// Domains to completely ignore.
// Domains to completely ignore, regardless of the lenses you have installed.
block_list: [
"web.archive.org",
"w3schools.com"
],
// Shortcut to launch the search bar
shortcut: "CmdOrCtrl+Shift+/",
// Where to store your index and index metadata
// The exact default location is dependent on your OS
data_directory: "/Users/<username>/Library/Application Support/com.athlabs.spyglass",
// By default, Spyglass will only crawl things as specified in your lenses. If you want
// to follow links without regard to those rules, set this to true.
crawl_external_links: false,
)
```
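
Since the file is RON, it deserializes straight into a struct with serde. Here is a self-contained sketch; the field set mirrors the example above, but this is not the actual Spyglass `UserSettings` definition.

``` rust
use serde::Deserialize;

// Illustrative subset of the settings file shown above.
#[derive(Debug, Deserialize)]
struct UserSettings {
    run_wizard: bool,
    allow_list: Vec<String>,
    block_list: Vec<String>,
    shortcut: String,
    data_directory: String,
    crawl_external_links: bool,
}

fn main() {
    let raw = std::fs::read_to_string("settings.ron").expect("read settings file");
    let settings: UserSettings = ron::from_str(&raw).expect("parse settings");
    println!("{:?}", settings);
}
```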

5 changes: 4 additions & 1 deletion crates/client/src/components.rs
@@ -73,7 +73,10 @@ pub fn search_result_component(res: &ResultListData, is_selected: bool) -> Html
match res.result_type {
ResultListType::DocSearch => {
let url_link = if res.url.is_some() {
let domain = res.domain.clone().unwrap_or_else(||"example.com".to_string());
let domain = res
.domain
.clone()
.unwrap_or_else(|| "example.com".to_string());
let url = res.url.clone().unwrap();

let path = url
3 changes: 2 additions & 1 deletion crates/entities/Cargo.toml
@@ -9,7 +9,8 @@ edition = "2021"
anyhow = "1.0"
chrono = { version = "0.4", features = ["serde"] }
log = "0.4"
sea-orm = { version = "^0", features = ["macros", "sqlx-sqlite", "runtime-tokio-rustls"], default-features = false }
regex = "1"
sea-orm = { version = "^0.8", features = ["macros", "sqlx-sqlite", "runtime-tokio-rustls"], default-features = false }
serde = { version = "1.0", features = ["derive"] }
shared = { path = "../shared" }
tokio = { version = "1", features = ["full"] }
1 change: 1 addition & 0 deletions crates/entities/src/lib.rs
@@ -1,4 +1,5 @@
pub mod models;
pub mod regex;
pub mod test;

pub use sea_orm;
59 changes: 51 additions & 8 deletions crates/entities/src/models/crawl_queue.rs
@@ -1,13 +1,15 @@
use std::collections::HashSet;
use std::fmt;

use regex::RegexSet;
use sea_orm::entity::prelude::*;
use sea_orm::{sea_query, DbBackend, FromQueryResult, QuerySelect, Set, Statement};
use serde::Serialize;
use url::Url;

use super::indexed_document;
use shared::config::{Limit, UserSettings};
use crate::regex::{regex_for_domain, regex_for_prefix};
use shared::config::{Lens, Limit, UserSettings};

const MAX_RETRIES: u8 = 5;

@@ -247,35 +249,62 @@ pub enum SkipReason {
#[derive(Default)]
pub struct EnqueueSettings {
pub skip_blocklist: bool,
pub skip_lenses: bool,
pub crawl_type: CrawlType,
}

pub async fn enqueue_all(
db: &DatabaseConnection,
urls: &[String],
lenses: &[Lens],
settings: &UserSettings,
overrides: &EnqueueSettings,
) -> anyhow::Result<(), sea_orm::DbErr> {
let mut allow_list: Vec<String> = Vec::new();
for lens in lenses {
// Build regex from domain
for domain in lens.domains.iter() {
allow_list.push(regex_for_domain(domain));
}

// Build regex from url rules
for prefix in lens.urls.iter() {
allow_list.push(regex_for_prefix(prefix));
}
}

let allow_list = RegexSet::new(allow_list).unwrap();
let block_list: HashSet<String> = HashSet::from_iter(settings.block_list.iter().cloned());

// Ignore invalid URLs
let urls: Vec<String> = urls
.iter()
.filter_map(|x| {
if let Ok(mut parsed) = Url::parse(x) {
.filter_map(|url| {
if let Ok(mut parsed) = Url::parse(url) {
// Always ignore fragments, otherwise crawling
// https://wikipedia.org/Rust#Blah would be considered different than
// https://wikipedia.org/Rust
parsed.set_fragment(None);

// Ignore URLs w/ no domain/host strings
let domain = parsed.host_str()?;
let normalized = parsed.to_string();

// Ignore domains on blacklist
if !overrides.skip_blocklist && block_list.contains(&domain.to_string()) {
return None;
}

// Check lens rules?
if !overrides.skip_lenses
// Should we crawl external links?
&& !settings.crawl_external_links
// Only allow crawls specified in our lenses
&& !allow_list.is_match(&normalized)
{
return None;
}

Some(parsed.as_str().to_string())
} else {
None
@@ -361,7 +390,7 @@ mod test {
use crate::test::setup_test_db;
use shared::config::{Limit, UserSettings};

use super::{gen_priority_sql, gen_priority_values};
use super::{gen_priority_sql, gen_priority_values, EnqueueSettings};

#[tokio::test]
async fn test_insert() {
Expand Down Expand Up @@ -397,7 +426,7 @@ mod test {
let sql = gen_priority_sql(&p_domains, &p_prefixes, settings);
assert_eq!(
sql.to_string(),
"WITH\n p_domain(domain, priority) AS (values (\"en.wikipedia.org\", 1)),\n p_prefix(prefix, priority) AS (values (\"https://roll20.net/compendium/dnd5e%\", 1)), indexed AS (\n SELECT\n domain,\n count(*) as count\n FROM indexed_document\n GROUP BY domain\n),\ninflight AS (\n SELECT\n domain,\n count(*) as count\n FROM crawl_queue\n WHERE status = \"Processing\"\n GROUP BY domain\n)\nSELECT\n cq.*\nFROM crawl_queue cq\nLEFT JOIN p_domain ON cq.domain like p_domain.domain\nLEFT JOIN p_prefix ON cq.url like p_prefix.prefix\nLEFT JOIN indexed ON indexed.domain = cq.domain\nLEFT JOIN inflight ON inflight.domain = cq.domain\nWHERE\n COALESCE(indexed.count, 0) < 1000 AND\n COALESCE(inflight.count, 0) < 2 AND\n status = \"Queued\"\nORDER BY\n p_prefix.priority DESC,\n p_domain.priority DESC,\n cq.updated_at ASC"
"WITH\n p_domain(domain, priority) AS (values (\"en.wikipedia.org\", 1)),\n p_prefix(prefix, priority) AS (values (\"https://roll20.net/compendium/dnd5e%\", 1)), indexed AS (\n SELECT\n domain,\n count(*) as count\n FROM indexed_document\n GROUP BY domain\n),\ninflight AS (\n SELECT\n domain,\n count(*) as count\n FROM crawl_queue\n WHERE status = \"Processing\"\n GROUP BY domain\n)\nSELECT\n cq.*\nFROM crawl_queue cq\nLEFT JOIN p_domain ON cq.domain like p_domain.domain\nLEFT JOIN p_prefix ON cq.url like p_prefix.prefix\nLEFT JOIN indexed ON indexed.domain = cq.domain\nLEFT JOIN inflight ON inflight.domain = cq.domain\nWHERE\n COALESCE(indexed.count, 0) < 10000 AND\n COALESCE(inflight.count, 0) < 2 AND\n status = \"Queued\"\nORDER BY\n p_prefix.priority DESC,\n p_domain.priority DESC,\n cq.updated_at ASC"
);
}
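
(The only functional change in this expected-SQL string is the indexed-page cap: `COALESCE(indexed.count, 0) < 1000` becomes `< 10000`, matching the bump of the default domain limit to 10k pages.)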

@@ -406,7 +435,12 @@
let settings = UserSettings::default();
let db = setup_test_db().await;
let url = vec!["https://oldschool.runescape.wiki/".into()];
crawl_queue::enqueue_all(&db, &url, &settings, &Default::default())

let overrides = EnqueueSettings {
skip_lenses: true,
..Default::default()
};
crawl_queue::enqueue_all(&db, &url, &[], &settings, &overrides)
.await
.unwrap();

Expand All @@ -426,7 +460,12 @@ mod test {
let url = vec!["https://oldschool.runescape.wiki/".into()];
let prioritized = vec![];

crawl_queue::enqueue_all(&db, &url, &settings, &Default::default())
let overrides = EnqueueSettings {
skip_lenses: true,
..Default::default()
};

crawl_queue::enqueue_all(&db, &url, &[], &settings, &overrides)
.await
.unwrap();

@@ -448,8 +487,12 @@
let url: Vec<String> = vec!["https://oldschool.runescape.wiki/".into()];
let parsed = Url::parse(&url[0]).unwrap();
let prioritized = vec![];
let overrides = EnqueueSettings {
skip_lenses: true,
..Default::default()
};

crawl_queue::enqueue_all(&db, &url, &settings, &Default::default())
crawl_queue::enqueue_all(&db, &url, &[], &settings, &overrides)
.await
.unwrap();
let doc = indexed_document::ActiveModel {
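The allow-list gating in `enqueue_all` boils down to one `RegexSet` built from every lens rule, checked once per candidate URL. A minimal standalone sketch of that check; the patterns are hand-written stand-ins for what `regex_for_domain` / `regex_for_prefix` generate.

``` rust
use regex::RegexSet;

fn main() {
    // One pattern per lens rule (stand-ins, not the generated patterns).
    let allow_list = RegexSet::new([
        r"^https?://(www\.)?seriouseats\.com/",
        r"^https://www\.reddit\.com/r/recipes/",
    ])
    .unwrap();

    // enqueue_all keeps a URL only if some lens rule matches it,
    // unless skip_lenses or crawl_external_links overrides the check.
    assert!(allow_list.is_match("https://www.seriouseats.com/pasta-guide"));
    assert!(!allow_list.is_match("https://example.com/recipes/"));
}
```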
11 changes: 8 additions & 3 deletions crates/entities/src/models/mod.rs
@@ -9,13 +9,16 @@ pub mod resource_rule;

use shared::config::Config;

pub async fn create_connection(is_test: bool) -> anyhow::Result<DatabaseConnection> {
pub async fn create_connection(
config: &Config,
is_test: bool,
) -> anyhow::Result<DatabaseConnection> {
let db_uri: String = if is_test {
"sqlite::memory:".to_string()
} else {
format!(
"sqlite://{}?mode=rwc",
Config::data_dir().join("db.sqlite").to_str().unwrap()
config.data_dir().join("db.sqlite").to_str().unwrap()
)
};

Expand All @@ -30,10 +33,12 @@ pub async fn create_connection(is_test: bool) -> anyhow::Result<DatabaseConnecti
#[cfg(test)]
mod test {
use crate::models::create_connection;
use shared::config::Config;

#[tokio::test]
async fn test_create_connection() {
let res = create_connection(true).await;
let config = Config::default();
let res = create_connection(&config, true).await;
assert!(res.is_ok());
}
}
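
With `create_connection` now taking a `Config`, the SQLite URI is built from `config.data_dir()` rather than a fixed per-OS location, so pointing the new `data_directory` setting somewhere else moves `db.sqlite` along with the rest of the index data.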
