From 9dd8e5dfb52f2a5aafb7058fa9c6bf6fa3f409ac Mon Sep 17 00:00:00 2001 From: ptdewey Date: Mon, 4 Nov 2024 15:21:52 -0500 Subject: [PATCH 1/6] fix: WIP fixes surrounding lexing zones --- internal/linking/lexer/lexer_utils.go | 22 +++++++++++++++++----- internal/linking/ngrams/weighting.go | 5 +++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/internal/linking/lexer/lexer_utils.go b/internal/linking/lexer/lexer_utils.go index b56e03e..65c8345 100644 --- a/internal/linking/lexer/lexer_utils.go +++ b/internal/linking/lexer/lexer_utils.go @@ -11,13 +11,17 @@ var eof rune = -1 var ( // Heading (e.g., # Heading, ## Heading) - only matches standalone heading lines - headingPattern = regexp.MustCompile(`^(#{1,6})\s+(.+?)\s*$`) + // (?m) is required to allow matching from start/end of line rather than start/end of string + // FIX: these capture groups are sometimes wrapping around lines (probably abandon regex and use more advanced lexer logic) + h1Pattern = regexp.MustCompile(`(?m)^(#)\s+(.+?)\s*$`) + h2Pattern = regexp.MustCompile(`(?m)^(#{2})\s+(.+?)\s*$`) + h3Pattern = regexp.MustCompile(`(?m)^(#{3})\s+(.+?)\s*$`) + h4Pattern = regexp.MustCompile(`(?m)^(#{4})\s+(.+?)\s*$`) + h5Pattern = regexp.MustCompile(`(?m)^(#{5})\s+(.+?)\s*$`) // Bold text (e.g., **bold** or __bold__) - matches inline without lookaheads/behinds boldPattern = regexp.MustCompile(`\*\*(.+?)\*\*|__(.+?)__`) // Italic text (e.g., *italic* or _italic_) - matches inline without lookaheads/behinds italicPattern = regexp.MustCompile(`(?:^|[^\w])(\*(\w+?)\*|_(\w+?)_)(?:[^\w]|$)`) - // Lists (e.g., - item or * item) - matches only at the beginning of a line - listPattern = regexp.MustCompile(`(?m)^\s*([*+-])\s+(.+)$`) // Link (e.g., [text](url)) linkPattern = regexp.MustCompile(`\[(.*?)\]\((.*?)\)`) // Image (e.g., ![alt text](url)) @@ -102,13 +106,21 @@ func (l *Lexer) ignore() { } func (l *Lexer) detectZone() { - peekBuffer, _ := l.br.Peek(128) // Adjust size as needed + peekBuffer, _ := l.br.Peek(32) switch { // FIX: handle remaining cases // TODO: add capture group for code blocks (might just need a boolean flag for them) - case headingPattern.Match(peekBuffer): + case h1Pattern.Match(peekBuffer): l.zone = H1 + case h2Pattern.Match(peekBuffer): + l.zone = H2 + case h3Pattern.Match(peekBuffer): + l.zone = H3 + case h4Pattern.Match(peekBuffer): + l.zone = H4 + case h5Pattern.Match(peekBuffer): + l.zone = H5 // case sectionPattern.Match(peekBuffer): // l.zone = Default // case boldPattern.Match(peekBuffer): diff --git a/internal/linking/ngrams/weighting.go b/internal/linking/ngrams/weighting.go index fa55fa2..08a9942 100644 --- a/internal/linking/ngrams/weighting.go +++ b/internal/linking/ngrams/weighting.go @@ -1,6 +1,7 @@ package ngrams import ( + "fmt" "math" "github.com/oolong-sh/oolong/internal/linking/lexer" @@ -73,6 +74,10 @@ func (ng *NGram) updateWeight() { adjustment := ladj * cadj * nadj[ng.n] * dadj // * cdadj + if ng.zone != lexer.Default { + fmt.Println(ng.keyword, ng.zone) + } + for _, nginfo := range ng.documents { // TODO: set document weight here nginfo.DocumentWeight = nginfo.DocumentBM25 * adjustment From 80d6def5424f47ce01efa32859ee29af93aa724e Mon Sep 17 00:00:00 2001 From: ptdewey Date: Mon, 4 Nov 2024 22:35:57 -0500 Subject: [PATCH 2/6] feat: document update watcher and daemon runner --- README.md | 2 +- examples/oolong.json | 17 +++++++ go.mod | 5 ++ go.sum | 4 ++ internal/daemon/daemon.go | 11 +++++ internal/daemon/watcher.go | 63 ++++++++++++++++++++++++++ internal/documents/corpus.go | 1 + internal/linking/lexer/lexeme.go | 8 ++-- internal/linking/lexer/lexer_test.go | 47 ++++++++++--------- internal/linking/lexer/lexer_utils.go | 10 ++-- internal/linking/ngrams/frequency.go | 32 +++++++------ internal/linking/ngrams/ngram.go | 14 +++--- internal/linking/ngrams/ngram_utils.go | 8 ++-- internal/linking/ngrams/similarity.go | 4 +- internal/linking/ngrams/weighting.go | 18 +++----- main.go | 2 + 16 files changed, 176 insertions(+), 70 deletions(-) create mode 100644 examples/oolong.json create mode 100644 internal/daemon/daemon.go create mode 100644 internal/daemon/watcher.go diff --git a/README.md b/README.md index eb47787..a9d8e4c 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Oolong looks for a configuration file at `~/.oolong.json` 3 ], "noteDirectories": [ - "~/notes", + "~/notes" ], "allowedExtensions": [ ".md", diff --git a/examples/oolong.json b/examples/oolong.json new file mode 100644 index 0000000..00377ac --- /dev/null +++ b/examples/oolong.json @@ -0,0 +1,17 @@ +{ + "ngramRange": [ + 1, + 2, + 3 + ], + "noteDirectories": [], + "allowedExtensions": [ + ".md", + ".mdx", + ".tex", + ".typ" + ], + "pluginPaths": [ + "./scripts/daily_note.lua" + ] +} diff --git a/go.mod b/go.mod index 8b84f20..4857529 100644 --- a/go.mod +++ b/go.mod @@ -8,3 +8,8 @@ require ( github.com/aaaton/golem/v4 v4.0.0 github.com/aaaton/golem/v4/dicts/en v1.0.1 ) + +require ( + github.com/fsnotify/fsnotify v1.8.0 // indirect + golang.org/x/sys v0.13.0 // indirect +) diff --git a/go.sum b/go.sum index 676eb3f..c5b8400 100644 --- a/go.sum +++ b/go.sum @@ -2,5 +2,9 @@ github.com/aaaton/golem/v4 v4.0.0 h1:YHieBS+5Fqir298nJ7fk3EvMcKM/+T5gpMRt4TIAiZ8 github.com/aaaton/golem/v4 v4.0.0/go.mod h1:OfK/S5v9Exsx1yO21WorREuIVV+Y5K2hygP0A9oJCCI= github.com/aaaton/golem/v4/dicts/en v1.0.1 h1:/BsOsh8JTgTkuevwM9axPnAi9CD4rK7TWHNdW/6V3Uo= github.com/aaaton/golem/v4/dicts/en v1.0.1/go.mod h1:1YKRrQNng+KbS+peA7sj3TIa8eqR6T2UqdJ+Tc9xeoA= +github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= +github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M= github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go new file mode 100644 index 0000000..4b92e9a --- /dev/null +++ b/internal/daemon/daemon.go @@ -0,0 +1,11 @@ +package daemon + +import "github.com/oolong-sh/oolong/internal/config" + +// Launch perpetually running watchers and run application in the background as a daemon +func Run() { + go runNotesDirsWatcher(config.NotesDirPaths()...) + + // run forever + <-make(chan struct{}) +} diff --git a/internal/daemon/watcher.go b/internal/daemon/watcher.go new file mode 100644 index 0000000..a2eb8e0 --- /dev/null +++ b/internal/daemon/watcher.go @@ -0,0 +1,63 @@ +package daemon + +import ( + "errors" + "io/fs" + "log" + "path/filepath" + + "github.com/fsnotify/fsnotify" +) + +// Initialize and run file update watcher for notes directories +func runNotesDirsWatcher(dirs ...string) error { + watcher, err := fsnotify.NewWatcher() + if err != nil { + return err + } + defer watcher.Close() + + for _, dir := range dirs { + // TODO: add oolong ignore system to blacklist certain subdirs/files + if err = filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error { + if d.IsDir() { + err = watcher.Add(path) + if err != nil { + return err + } + log.Println("Added watcher on", path) + } + return nil + }); err != nil { + return err + } + } + + // watcher handler + // go func() { // running entire function as a goroutine, handler doesn't need to be one + for { + select { + case event, ok := <-watcher.Events: + if !ok { + log.Println("Watcher event channel returned bad result.") + return errors.New("Invalid watcher errors channel value.") + } + log.Println("Event:", event) + + // TODO: call functions on document update + // - has potential performance impacts, so only rerun what is needed (i.e. lex single doc + weight updates) + if event.Has(fsnotify.Write) { + log.Println("Modified file:", event.Name) + } + case err, ok := <-watcher.Errors: + if !ok { + return errors.New("Invalid watcher errors channel value.") + } + log.Println("error:", err) + } + } + // }() + + // <-make(chan struct{}) + // return nil +} diff --git a/internal/documents/corpus.go b/internal/documents/corpus.go index 77446a0..b53249e 100644 --- a/internal/documents/corpus.go +++ b/internal/documents/corpus.go @@ -20,6 +20,7 @@ func ReadNotesDirs() ([]*Document, error) { for _, notesDirPath := range config.NotesDirPaths() { // extract all note file paths from notes directory notePaths := []string{} + // TODO: add oolong ignore system to blacklist certain subdirs/files if err := filepath.WalkDir(notesDirPath, func(path string, d fs.DirEntry, err error) error { if d.IsDir() { return nil diff --git a/internal/linking/lexer/lexeme.go b/internal/linking/lexer/lexeme.go index 879d712..674a11c 100644 --- a/internal/linking/lexer/lexeme.go +++ b/internal/linking/lexer/lexeme.go @@ -36,10 +36,10 @@ const ( ) type Lexeme struct { - Lemma string // lexical root of unit (i.e. continues -> continue) - Value string // lexical unit - Row int // row location in file - Col int // column location of first character in file + Lemma string // lexical root of unit (i.e. continues -> continue) + Value string // lexical unit + // Row int // row location in file + // Col int // column location of first character in file LexType LexType // type of lexical unit Zone Zone // document zone } diff --git a/internal/linking/lexer/lexer_test.go b/internal/linking/lexer/lexer_test.go index b50e89a..dddeace 100644 --- a/internal/linking/lexer/lexer_test.go +++ b/internal/linking/lexer/lexer_test.go @@ -15,7 +15,7 @@ var cfg config.OolongConfig func init() { var err error - cfg, err = config.Setup("") + cfg, err = config.Setup("../../../examples/oolong.json") if err != nil { panic(err) } @@ -34,17 +34,19 @@ func TestReadDocumentSimple(t *testing.T) { } expectedTokens := []lexer.Lexeme{ { - Lemma: "hello", - Value: "Hello", - Row: 1, - Col: 1, + Lemma: "hello", + Value: "Hello", + // Row: 1, + // Col: 1, LexType: lexer.Word, + Zone: lexer.Default, }, { - Lemma: "world", - Value: "world", - Row: 1, - Col: 7, + Lemma: "world", + Value: "world", + // Row: 1, + // Col: 7, LexType: lexer.Word, + Zone: lexer.Default, }, } if !slices.Equal(l.Output, expectedTokens) { @@ -62,24 +64,27 @@ func TestReadDocumentSimple(t *testing.T) { } expectedTokens = []lexer.Lexeme{ { - Lemma: "hello", - Value: "Hello", - Row: 1, - Col: 1, + Lemma: "hello", + Value: "Hello", + // Row: 1, + // Col: 1, LexType: lexer.Word, + Zone: lexer.Default, }, { - Value: lexer.BreakToken, - Row: 1, - Col: 8, + Value: lexer.BreakToken, + // Row: 1, + // Col: 8, LexType: lexer.Break, + Zone: lexer.Default, }, { - Lemma: "world", - Value: "World", - Row: 2, - Col: 1, + Lemma: "world", + Value: "world", + // Row: 2, + // Col: 1, LexType: lexer.Word, + Zone: lexer.Default, }, } if !slices.Equal(l.Output, expectedTokens) { @@ -89,8 +94,8 @@ func TestReadDocumentSimple(t *testing.T) { // test with many newlines and multiple single-line lexemes s = "\nHello, \nworld! Foo-bar baz \n\n foo" rd = strings.NewReader(s) - l.Lex(rd) l = lexer.New() + l.Lex(rd) fmt.Println("Input:", s, " Output:", l.Output) if len(l.Output) != 9 { t.Fatalf("Incorrect Document.Content length. Expected %d, got %d", 5, len(l.Output)) diff --git a/internal/linking/lexer/lexer_utils.go b/internal/linking/lexer/lexer_utils.go index 65c8345..2f0063e 100644 --- a/internal/linking/lexer/lexer_utils.go +++ b/internal/linking/lexer/lexer_utils.go @@ -32,9 +32,9 @@ func (l *Lexer) push(v LexType) { switch v { case Break: l.Output = append(l.Output, Lexeme{ - Value: BreakToken, - Row: l.row, - Col: l.col, + Value: BreakToken, + // Row: l.row, + // Col: l.col, LexType: Break, Zone: l.zone, }) @@ -48,9 +48,9 @@ func (l *Lexer) push(v LexType) { l.Output = append(l.Output, Lexeme{ Lemma: lemma, Value: word, - Row: l.row, + // Row: l.row, // FIX: handles removed characters incorrectly in calculation (what start is probably supposed to used be for) - Col: l.col - 1 - len(word), + // Col: l.col - 1 - len(word), // Col: l.col - l.start, LexType: Word, Zone: l.zone, diff --git a/internal/linking/ngrams/frequency.go b/internal/linking/ngrams/frequency.go index 876ac65..31fb499 100644 --- a/internal/linking/ngrams/frequency.go +++ b/internal/linking/ngrams/frequency.go @@ -5,19 +5,19 @@ import ( ) // Calculate term frequency -func tf(ngmap map[string]*NGram, path string) { - // totalCount := 0 - // for _, ng := range ngmap { - // totalCount += ng.documents[path].DocumentCount - // } - - for _, ng := range ngmap { - nginfo := ng.documents[path] - // normalize by document token count - // nginfo.DocumentTF = float64(nginfo.DocumentCount) / float64(totalCount) - nginfo.DocumentTF = float64(nginfo.DocumentCount) - } -} +// func tf(ngmap map[string]*NGram, path string) { +// // totalCount := 0 +// // for _, ng := range ngmap { +// // totalCount += ng.documents[path].DocumentCount +// // } +// +// for _, ng := range ngmap { +// nginfo := ng.documents[path] +// // normalize by document token count +// // nginfo.DocumentTF = float64(nginfo.DocumentCount) / float64(totalCount) +// nginfo.DocumentTF = float64(nginfo.DocumentCount) +// } +// } // Calculate inverse document frequency of NGrams // N is the total number of documents in the text corpus @@ -42,7 +42,8 @@ func idf(ngmap map[string]*NGram, N int) { func tfidf(ngmap map[string]*NGram) { for _, ng := range ngmap { for _, nginfo := range ng.documents { - nginfo.DocumentTfIdf = nginfo.DocumentTF * ng.idf + // nginfo.DocumentTfIdf = nginfo.DocumentTF * ng.idf + nginfo.DocumentWeight = float64(nginfo.DocumentCount) * ng.idf } } } @@ -73,7 +74,8 @@ func bm25(ngmap map[string]*NGram) { b = zoneB[ng.zone] k1 = zoneK1[ng.zone] for path, nginfo := range ng.documents { - nginfo.DocumentBM25 = ng.idf * ((nginfo.DocumentTF * (k1 + 1)) / (nginfo.DocumentTF + k1*(1-b+b*(d[path]/davg)))) + tf := float64(nginfo.DocumentCount) + nginfo.DocumentWeight = ng.idf * ((tf * (k1 + 1)) / (tf + k1*(1-b+b*(d[path]/davg)))) } } } diff --git a/internal/linking/ngrams/ngram.go b/internal/linking/ngrams/ngram.go index e1343fc..e773f48 100644 --- a/internal/linking/ngrams/ngram.go +++ b/internal/linking/ngrams/ngram.go @@ -25,12 +25,12 @@ type NGram struct { // Information about NGram occurences in a single document type NGramInfo struct { - DocumentCount int - DocumentWeight float64 - DocumentLocations []location - DocumentTF float64 - DocumentTfIdf float64 - DocumentBM25 float64 + DocumentCount int + DocumentWeight float64 + // DocumentLocations []location + // DocumentTF float64 + // DocumentTfIdf float64 + // DocumentBM25 float64 } // location type for occurence of an NGram within a document @@ -94,7 +94,7 @@ func Generate(tokens []lexer.Lexeme, nrange []int, path string) map[string]*NGra } // calculate term frequencies - tf(ngrams, path) + // tf(ngrams, path) // using count instead return ngrams } diff --git a/internal/linking/ngrams/ngram_utils.go b/internal/linking/ngrams/ngram_utils.go index ae31d53..838e259 100644 --- a/internal/linking/ngrams/ngram_utils.go +++ b/internal/linking/ngrams/ngram_utils.go @@ -14,7 +14,7 @@ func addNGram(k string, n int, ngmap map[string]*NGram, i int, tokens []lexer.Le doc := ngram.documents[path] doc.DocumentCount++ - doc.DocumentLocations = append(doc.DocumentLocations, location{row: tokens[i].Row, col: tokens[i].Col}) + // doc.DocumentLocations = append(doc.DocumentLocations, location{row: tokens[i].Row, col: tokens[i].Col}) // update ngram zone if current is considered more valuable if tokens[i].Zone < ngram.zone { @@ -25,9 +25,9 @@ func addNGram(k string, n int, ngmap map[string]*NGram, i int, tokens []lexer.Le // create document info struct for ngram documents[path] = &NGramInfo{ - DocumentCount: 1, - DocumentWeight: 0, - DocumentLocations: []location{{row: tokens[i].Row, col: tokens[i].Col}}, + DocumentCount: 1, + DocumentWeight: 0, + // DocumentLocations: []location{{row: tokens[i].Row, col: tokens[i].Col}}, } // create ngram diff --git a/internal/linking/ngrams/similarity.go b/internal/linking/ngrams/similarity.go index 83dfc7b..197beb7 100644 --- a/internal/linking/ngrams/similarity.go +++ b/internal/linking/ngrams/similarity.go @@ -43,7 +43,7 @@ func CosineSimilarity(ngmap map[string]*NGram) { } } -// Construct tf-idf score vectors +// Construct weighting score vectors func constructDocumentVectors(ngmap map[string]*NGram) map[string]map[string]float64 { documentVectors := make(map[string]map[string]float64) @@ -52,7 +52,7 @@ func constructDocumentVectors(ngmap map[string]*NGram) map[string]map[string]flo if _, exists := documentVectors[doc]; !exists { documentVectors[doc] = make(map[string]float64) } - documentVectors[doc][ngram.keyword] = nginfo.DocumentTfIdf + documentVectors[doc][ngram.keyword] = nginfo.DocumentWeight } } diff --git a/internal/linking/ngrams/weighting.go b/internal/linking/ngrams/weighting.go index 08a9942..d2b66c2 100644 --- a/internal/linking/ngrams/weighting.go +++ b/internal/linking/ngrams/weighting.go @@ -1,7 +1,6 @@ package ngrams import ( - "fmt" "math" "github.com/oolong-sh/oolong/internal/linking/lexer" @@ -66,22 +65,19 @@ func (ng *NGram) updateWeight() { // TODO: these numbers are subject to change // - document and count adjustments are too high for n=1 - ladj := math.Min(0.1*float64(len(ng.keyword)), 1.1) // length adjustment - cadj := math.Min(0.1*float64(ng.n)*float64(ng.globalCount), 1.5) // count adjustment - dadj := math.Min(0.11*float64(ng.n)*float64(len(ng.documents)), 2) + ladj := math.Min(0.12*float64(len(ng.keyword)), 1.2) // length adjustment + cadj := math.Min(0.08*float64(ng.n)*float64(ng.globalCount), 1.5) // count adjustment + dadj := math.Min(0.08*float64(ng.n)*float64(len(ng.documents)), 1.8) // document occurence adjustment // TODO: heavily prefer count / len(dg.documents) > 1 // cdadj := math.Min(0.5*float64(ng.globalCount)/float64(len(ng.documents)), 2) adjustment := ladj * cadj * nadj[ng.n] * dadj // * cdadj - if ng.zone != lexer.Default { - fmt.Println(ng.keyword, ng.zone) - } - for _, nginfo := range ng.documents { - // TODO: set document weight here - nginfo.DocumentWeight = nginfo.DocumentBM25 * adjustment - w += nginfo.DocumentBM25 + // documentWeight will be bm25 or tf-idf before this point + // apply adjustment to document weight + nginfo.DocumentWeight = nginfo.DocumentWeight * adjustment + w += nginfo.DocumentWeight df++ } diff --git a/main.go b/main.go index 5f302fa..9f64d74 100644 --- a/main.go +++ b/main.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/oolong-sh/oolong/internal/config" + "github.com/oolong-sh/oolong/internal/daemon" "github.com/oolong-sh/oolong/internal/documents" ) @@ -22,4 +23,5 @@ func main() { _ = d // plugins.InitPlugins(&cfg) + daemon.Run() } From 1ae774a4dcd3c8b3393b727a77f3bb2087e7d2fd Mon Sep 17 00:00:00 2001 From: ptdewey Date: Mon, 4 Nov 2024 22:55:32 -0500 Subject: [PATCH 3/6] feat: added cli flag to run without the daemon --- main.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/main.go b/main.go index 9f64d74..f6a1b99 100644 --- a/main.go +++ b/main.go @@ -1,6 +1,7 @@ package main import ( + "flag" "fmt" "github.com/oolong-sh/oolong/internal/config" @@ -8,6 +9,8 @@ import ( "github.com/oolong-sh/oolong/internal/documents" ) +var daemonFlag = flag.Bool("no-daemon", false, "Run Oolong in no-daemon mode (not recommended)") + func main() { cfg, err := config.Setup("~/.oolong.json") if err != nil { @@ -16,12 +19,14 @@ func main() { } fmt.Println(cfg.PluginPaths) - d, err := documents.ReadNotesDirs() + _, err = documents.ReadNotesDirs() if err != nil { return } - _ = d - // plugins.InitPlugins(&cfg) - daemon.Run() + // go plugins.InitPlugins(&cfg) + flag.Parse() + if !*daemonFlag { + daemon.Run() + } } From f759c079772c21771d796bec31379f528e48030a Mon Sep 17 00:00:00 2001 From: ptdewey Date: Tue, 5 Nov 2024 14:29:25 -0500 Subject: [PATCH 4/6] feat: watcher directory ignores --- internal/config/config.go | 10 ++++++---- internal/daemon/watcher.go | 26 +++++++++++++++++++------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/internal/config/config.go b/internal/config/config.go index 2517ec1..edb3604 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -14,14 +14,16 @@ type OolongConfig struct { NGramRange []int `json:"ngramRange"` AllowedExtensions []string `json:"allowedExtensions"` PluginPaths []string `json:"pluginPaths"` + IgnoreDirectories []string `json:"ignoreDirectories"` } func Config() OolongConfig { return config } -func NotesDirPaths() []string { return config.NotesDirPaths } -func NGramRange() []int { return config.NGramRange } -func AllowedExtensions() []string { return config.AllowedExtensions } -func PluginPaths() []string { return config.PluginPaths } +func NotesDirPaths() []string { return config.NotesDirPaths } +func NGramRange() []int { return config.NGramRange } +func AllowedExtensions() []string { return config.AllowedExtensions } +func PluginPaths() []string { return config.PluginPaths } +func IgnoredDirectories() []string { return config.IgnoreDirectories } // TODO: file watcher for config file, reload on change diff --git a/internal/daemon/watcher.go b/internal/daemon/watcher.go index a2eb8e0..a48db38 100644 --- a/internal/daemon/watcher.go +++ b/internal/daemon/watcher.go @@ -5,8 +5,10 @@ import ( "io/fs" "log" "path/filepath" + "slices" "github.com/fsnotify/fsnotify" + "github.com/oolong-sh/oolong/internal/config" ) // Initialize and run file update watcher for notes directories @@ -17,16 +19,27 @@ func runNotesDirsWatcher(dirs ...string) error { } defer watcher.Close() + dirIgnores := config.IgnoredDirectories() + for _, dir := range dirs { // TODO: add oolong ignore system to blacklist certain subdirs/files if err = filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error { - if d.IsDir() { - err = watcher.Add(path) - if err != nil { - return err - } - log.Println("Added watcher on", path) + if !d.IsDir() { + return nil + } + + // NOTE: this may not be the exact desired behavior for ignores + // - this logic also needs to be replicated in the document reader + if slices.Contains(dirIgnores, filepath.Base(path)) { + return filepath.SkipDir } + + err = watcher.Add(path) + if err != nil { + return err + } + log.Println("Added watcher on", path) + return nil }); err != nil { return err @@ -57,7 +70,6 @@ func runNotesDirsWatcher(dirs ...string) error { } } // }() - // <-make(chan struct{}) // return nil } From f2517b136769b3a1acc74333afa5238105f1c232 Mon Sep 17 00:00:00 2001 From: ptdewey Date: Tue, 5 Nov 2024 20:14:21 -0500 Subject: [PATCH 5/6] feat/refactor: file watching and documents state handling --- go.mod | 2 +- internal/config/config.go | 2 +- internal/daemon/watcher.go | 14 ++- internal/documents/corpus.go | 145 ++++++++++---------------- internal/documents/document.go | 8 +- internal/documents/state.go | 45 ++++++++ internal/linking/ngrams/ngram.go | 4 +- internal/linking/ngrams/similarity.go | 4 +- main.go | 2 +- 9 files changed, 124 insertions(+), 102 deletions(-) create mode 100644 internal/documents/state.go diff --git a/go.mod b/go.mod index 4857529..ed6ba7b 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,6 @@ require ( ) require ( - github.com/fsnotify/fsnotify v1.8.0 // indirect + github.com/fsnotify/fsnotify v1.8.0 golang.org/x/sys v0.13.0 // indirect ) diff --git a/internal/config/config.go b/internal/config/config.go index edb3604..880d3c2 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -14,7 +14,7 @@ type OolongConfig struct { NGramRange []int `json:"ngramRange"` AllowedExtensions []string `json:"allowedExtensions"` PluginPaths []string `json:"pluginPaths"` - IgnoreDirectories []string `json:"ignoreDirectories"` + IgnoreDirectories []string `json:"ignoredDirectories"` } func Config() OolongConfig { return config } diff --git a/internal/daemon/watcher.go b/internal/daemon/watcher.go index a48db38..d274415 100644 --- a/internal/daemon/watcher.go +++ b/internal/daemon/watcher.go @@ -6,9 +6,11 @@ import ( "log" "path/filepath" "slices" + "time" "github.com/fsnotify/fsnotify" "github.com/oolong-sh/oolong/internal/config" + "github.com/oolong-sh/oolong/internal/documents" ) // Initialize and run file update watcher for notes directories @@ -34,6 +36,7 @@ func runNotesDirsWatcher(dirs ...string) error { return filepath.SkipDir } + // TEST: this may need to add path as absolute to get correct results err = watcher.Add(path) if err != nil { return err @@ -55,12 +58,17 @@ func runNotesDirsWatcher(dirs ...string) error { log.Println("Watcher event channel returned bad result.") return errors.New("Invalid watcher errors channel value.") } - log.Println("Event:", event) + // log.Println("Event:", event) - // TODO: call functions on document update - // - has potential performance impacts, so only rerun what is needed (i.e. lex single doc + weight updates) if event.Has(fsnotify.Write) { log.Println("Modified file:", event.Name) + + // write event is sent on write start, wait 500ms for write to finish + time.Sleep(500) + + // re-read document + documents.ReadDocuments(event.Name) + // TODO: add dedup timer to prevent multi-write calls } case err, ok := <-watcher.Errors: if !ok { diff --git a/internal/documents/corpus.go b/internal/documents/corpus.go index b53249e..f2acc0e 100644 --- a/internal/documents/corpus.go +++ b/internal/documents/corpus.go @@ -1,123 +1,90 @@ package documents import ( - "fmt" "io/fs" - "os" + "log" "path/filepath" "slices" "sync" "github.com/oolong-sh/oolong/internal/config" - "github.com/oolong-sh/oolong/internal/linking/lexer" - "github.com/oolong-sh/oolong/internal/linking/ngrams" ) -// Read, lex, and extract NGrams for all documents in notes directories specified in config file -func ReadNotesDirs() ([]*Document, error) { - documents := []*Document{} +// DOC: meant to be called with watcher +// assumes paths should not be ignored (should be safe assumption due to watcher ignores) +func ReadDocuments(paths ...string) error { + // read all input files, update state with documents + docs := readHandler(paths...) + + // merge ngram maps and calculate weights + err := updateState(docs) + if err != nil { + return err + } - for _, notesDirPath := range config.NotesDirPaths() { + // TODO: all weights change, but may not need to be recalculated every time + + return nil +} + +// Read, lex, and extract NGrams for all documents in notes directories specified in config file +func ReadNotesDirs() error { + docs := []*Document{} + for _, dir := range config.NotesDirPaths() { // extract all note file paths from notes directory - notePaths := []string{} + paths := []string{} // TODO: add oolong ignore system to blacklist certain subdirs/files - if err := filepath.WalkDir(notesDirPath, func(path string, d fs.DirEntry, err error) error { + if err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error { if d.IsDir() { + if slices.Contains(config.IgnoredDirectories(), filepath.Base(path)) { + return filepath.SkipDir + } return nil } if slices.Contains(config.AllowedExtensions(), filepath.Ext(path)) { - notePaths = append(notePaths, path) + paths = append(paths, path) } return nil }); err != nil { - return nil, err + return err } - // perform a parallel read of found notes files - var wg sync.WaitGroup - wg.Add(len(notePaths)) - docs := make([]*Document, len(notePaths)) - - for i, notePath := range notePaths { - go func(i int, notePath string) { - doc, err := ReadDocument(notePath) - if err != nil { - fmt.Printf("Failed to read file: '%s' %v", notePath, err) - return - } - docs[i] = doc - wg.Done() - }(i, notePath) - } - - wg.Wait() - - // append results to output array - documents = append(documents, docs...) + // read all documents and append results + docs = append(docs, readHandler(paths...)...) } - // - // TEST: for debugging, remove later - // - // write out tokens - b := []byte{} - for _, d := range documents { - for _, t := range d.tokens { - if t.Value == lexer.BreakToken { - continue - } - b = append(b, []byte(fmt.Sprintf("%s, %s, %d\n", t.Lemma, t.Value, t.Zone))...) - } - } - err := os.WriteFile("./tokens.txt", b, 0666) + // merge maps and calculate weights + err := updateState(docs) if err != nil { - panic(err) + return err } - b = []byte{} - b = append(b, []byte("ngram,weight,count\n")...) - ngmap := make(map[string]*ngrams.NGram) - for _, d := range documents { - ngrams.Merge(ngmap, d.ngrams) - } - ngrams.CalcWeights(ngmap, len(documents)) - for _, d := range documents { - for _, ng := range d.ngrams { - b = append(b, []byte(fmt.Sprintf("%s, %f, %d\n", ng.Keyword(), ng.Weight(), ng.Count()))...) - } - } - err = os.WriteFile("./ngrams.txt", b, 0666) - if err != nil { - panic(err) - } - b = []byte{} - b = append(b, []byte("ngram,weight,count,ndocs\n")...) - mng := ngrams.FilterMeaningfulNGrams(ngmap, 2, int(float64(len(documents))/1.5), 4.0) - for _, s := range mng { - b = append(b, []byte(fmt.Sprintf("%s,%f,%d,%d\n", s, ngmap[s].Weight(), ngmap[s].Count(), len(ngmap[s].Documents())))...) - } - err = os.WriteFile("./meaningful-ngrams.csv", b, 0666) - if err != nil { - panic(err) - } - // ngrams.CosineSimilarity(ngmap) + return nil +} - // ngcounts := ngrams.Count(ngmap) - // freq := ngrams.OrderByFrequency(ngcounts, 10) - freq := ngrams.OrderByFrequency(ngmap) - b = []byte{} - for _, v := range freq { - b = append(b, []byte(fmt.Sprintf("%s %f\n", v.Key, v.Value))...) - } - err = os.WriteFile("./ngram-counts.txt", b, 0666) - if err != nil { - panic(err) +// DOC: +func readHandler(paths ...string) []*Document { + docs := make([]*Document, len(paths)) + var wg sync.WaitGroup + + // perform a parallel read of found notes files + wg.Add(len(paths)) + for i, p := range paths { + go func(i int, notePath string) { + doc, err := readDocumentByFile(notePath) + if err != nil { + log.Printf("Failed to read file: '%s' %v", notePath, err) + return + } + // TODO: this could be changed to use channels + docs[i] = doc + wg.Done() + }(i, p) } - // - // TEST: for debugging, remove later - // + wg.Wait() - return documents, nil + // append results to output array + return docs } diff --git a/internal/documents/document.go b/internal/documents/document.go index df3729e..0a45379 100644 --- a/internal/documents/document.go +++ b/internal/documents/document.go @@ -1,8 +1,8 @@ package documents import ( - "fmt" "io" + "log" "os" "github.com/oolong-sh/oolong/internal/config" @@ -25,7 +25,7 @@ func (d *Document) KeywordWeights() map[string]float64 { return d.ngwgts } // Read in a single document file, lex, and generate NGrams // Wraps readDocument for explicit use with files -func ReadDocument(documentPath string) (*Document, error) { +func readDocumentByFile(documentPath string) (*Document, error) { f, err := os.Open(documentPath) if err != nil { return nil, err @@ -44,7 +44,7 @@ func ReadDocument(documentPath string) (*Document, error) { // internal reader function that allows usage of io readers for generalized use func readDocument(r io.Reader, documentPath string) (*Document, error) { l := lexer.New() - fmt.Printf("Running lexer on %s...\n", documentPath) + log.Printf("Running lexer on %s...\n", documentPath) l.Lex(r) doc := &Document{ @@ -52,7 +52,7 @@ func readDocument(r io.Reader, documentPath string) (*Document, error) { tokens: l.Output, } - fmt.Printf("Generating NGrams for %s...\n", documentPath) + log.Printf("Generating NGrams for %s...\n", documentPath) doc.ngrams = ngrams.Generate(doc.tokens, config.NGramRange(), doc.path) // FIX: weight setting must occur after document NGRam maps are merged diff --git a/internal/documents/state.go b/internal/documents/state.go new file mode 100644 index 0000000..85a241e --- /dev/null +++ b/internal/documents/state.go @@ -0,0 +1,45 @@ +package documents + +import ( + "log" + + "github.com/oolong-sh/oolong/internal/linking/ngrams" +) + +// DOC: +var state = Corpus{ + Documents: map[string]*Document{}, + NGrams: map[string]*ngrams.NGram{}, +} + +// DOC: +type Corpus struct { + Documents map[string]*Document + NGrams map[string]*ngrams.NGram +} + +// DOC: +func State() Corpus { return state } + +// DOC: +func updateState(docs []*Document) error { + log.Println("Updating state and recalculating weights...") + + // update state documents + for _, doc := range docs { + state.Documents[doc.path] = doc + } + + // merge resulting ngram maps + for _, d := range state.Documents { + ngrams.Merge(state.NGrams, d.ngrams) + } + + // calculate weights + ngrams.CalcWeights(state.NGrams, len(state.Documents)) + log.Println("Done calculating weights.") + + // TODO: other things? (file writes?) + + return nil +} diff --git a/internal/linking/ngrams/ngram.go b/internal/linking/ngrams/ngram.go index e773f48..1955c6a 100644 --- a/internal/linking/ngrams/ngram.go +++ b/internal/linking/ngrams/ngram.go @@ -104,13 +104,15 @@ func Merge(maps ...map[string]*NGram) { for i := 1; i < len(maps); i++ { for k, vi := range maps[i] { if v0, ok := maps[0][k]; !ok { + // ngram key not found in main map, add it maps[0][k] = vi } else { + // ngram key found in map, merge counts and document info + // weights should be calculated elsewhere after all merges are completed v0.globalCount += vi.globalCount for dk, dv := range vi.documents { v0.documents[dk] = dv } - // weights should be calculated elsewhere after all merges are completed } } } diff --git a/internal/linking/ngrams/similarity.go b/internal/linking/ngrams/similarity.go index 197beb7..15167a3 100644 --- a/internal/linking/ngrams/similarity.go +++ b/internal/linking/ngrams/similarity.go @@ -1,7 +1,7 @@ package ngrams import ( - "fmt" + "log" "math" ) @@ -38,7 +38,7 @@ func CosineSimilarity(ngmap map[string]*NGram) { } similarity := calculateCosineSimilarity(vec1, vec2) // TODO: do something other than print here? -- (if this actually ends up being used) - fmt.Printf("%s, %s, %.4f\n", doc1, doc2, similarity) + log.Printf("%s, %s, %.4f\n", doc1, doc2, similarity) } } } diff --git a/main.go b/main.go index f6a1b99..04e865a 100644 --- a/main.go +++ b/main.go @@ -19,7 +19,7 @@ func main() { } fmt.Println(cfg.PluginPaths) - _, err = documents.ReadNotesDirs() + err = documents.ReadNotesDirs() if err != nil { return } From 9b68499e7e6b5fbd310e0181796bf2f50792caf4 Mon Sep 17 00:00:00 2001 From: Patrick Dewey <57921252+ptdewey@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:42:33 -0500 Subject: [PATCH 6/6] Update go-test.yml --- .github/workflows/go-test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/go-test.yml b/.github/workflows/go-test.yml index 6df5455..e1b8687 100644 --- a/.github/workflows/go-test.yml +++ b/.github/workflows/go-test.yml @@ -1,8 +1,6 @@ name: Go on: - push: - branches: [ "main" ] pull_request: branches: [ "main" ]