diff --git a/Makefile b/Makefile
index 05f8ae5..0600010 100644
--- a/Makefile
+++ b/Makefile
@@ -6,3 +6,8 @@ run:
 
 test:
 	go test ./...
+
+eval:
+	time go run main.go --no-daemon=true
+	xsv sort -NRs weight meaningful-ngrams.csv -o meaningful-ngrams.csv
+	csvlens meaningful-ngrams.csv
diff --git a/README.md b/README.md
index 1364cbd..98b70e1 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Oolong
+# Oolong Backend Daemon
 
 Coming soon...
diff --git a/internal/daemon/watcher.go b/internal/daemon/watcher.go
index d274415..2e09261 100644
--- a/internal/daemon/watcher.go
+++ b/internal/daemon/watcher.go
@@ -68,6 +68,7 @@ func runNotesDirsWatcher(dirs ...string) error {
 
 				// re-read document
 				documents.ReadDocuments(event.Name)
+				// TODO: add a dedup/debounce timer to avoid repeated reads on rapid successive write events
 			}
 
 		case err, ok := <-watcher.Errors:
diff --git a/internal/documents/corpus.go b/internal/documents/corpus.go
index f2acc0e..15007ae 100644
--- a/internal/documents/corpus.go
+++ b/internal/documents/corpus.go
@@ -10,25 +10,34 @@ import (
 	"github.com/oolong-sh/oolong/internal/config"
 )
 
+// UpdateState is dependency-injected by the state package to avoid a circular import
+var UpdateState func([]*Document) error
+
 // DOC: meant to be called with watcher
 // assumes paths should not be ignored (should be safe assumption due to watcher ignores)
 func ReadDocuments(paths ...string) error {
+	if UpdateState == nil {
+		panic("documents.UpdateState was never injected")
+	}
+
 	// read all input files, update state with documents
 	docs := readHandler(paths...)
 
 	// merge ngram maps and calculate weights
-	err := updateState(docs)
+	err := UpdateState(docs)
 	if err != nil {
 		return err
 	}
 
-	// TODO: all weights change, but may not need to be recalculated every time
-
 	return nil
 }
 
 // Read, lex, and extract NGrams for all documents in notes directories specified in config file
 func ReadNotesDirs() error {
+	if UpdateState == nil {
+		panic("documents.UpdateState was never injected")
+	}
+
 	docs := []*Document{}
 	for _, dir := range config.NotesDirPaths() {
 		// extract all note file paths from notes directory
@@ -56,7 +65,7 @@ func ReadNotesDirs() error {
 	}
 
 	// merge maps and calculate weights
-	err := updateState(docs)
+	err := UpdateState(docs)
 	if err != nil {
 		return err
 	}
@@ -66,25 +75,35 @@ func ReadNotesDirs() error {
 
 // DOC:
 func readHandler(paths ...string) []*Document {
-	docs := make([]*Document, len(paths))
 	var wg sync.WaitGroup
+	docChan := make(chan *Document)
 
-	// perform a parallel read of found notes files
-	wg.Add(len(paths))
-	for i, p := range paths {
-		go func(i int, notePath string) {
+	// launch a goroutine for each file path and read in parallel
+	for _, p := range paths {
+		wg.Add(1)
+		go func(notePath string) {
+			defer wg.Done()
 			doc, err := readDocumentByFile(notePath)
 			if err != nil {
 				log.Printf("Failed to read file: '%s' %v", notePath, err)
 				return
 			}
-			// TODO: this could be changed to use channels
-			docs[i] = doc
-			wg.Done()
-		}(i, p)
+			// send the document via channel
+			docChan <- doc
+		}(p)
+	}
+
+	// close the channel once all goroutines are done
+	go func() {
+		wg.Wait()
+		close(docChan)
+	}()
+
+	// collect documents from the channel
+	var docs []*Document
+	for doc := range docChan {
+		docs = append(docs, doc)
 	}
-	wg.Wait()
 
-	// append results to output array
 	return docs
 }
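The readHandler rewrite above swaps indexed writes into a preallocated slice for a fan-out/fan-in pipeline: one goroutine per file, a channel to funnel results, and a closer goroutine that waits on the WaitGroup so the collecting loop terminates. A minimal standalone sketch of the same pattern, with os.ReadFile standing in for readDocumentByFile and invented file names:

```go
package main

import (
	"fmt"
	"os"
	"sync"
)

func main() {
	paths := []string{"a.md", "b.md"} // hypothetical inputs

	var wg sync.WaitGroup
	results := make(chan string)

	// fan out: one reader goroutine per path
	for _, p := range paths {
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			b, err := os.ReadFile(path)
			if err != nil {
				return // skip unreadable files, as readHandler does
			}
			results <- fmt.Sprintf("%s: %d bytes", path, len(b))
		}(p)
	}

	// closer: unblocks the range below once all readers finish
	go func() {
		wg.Wait()
		close(results)
	}()

	// fan in: collect results as they arrive
	for r := range results {
		fmt.Println(r)
	}
}
```

One consequence of this design worth noting: the channel version returns documents in completion order rather than input order, which is fine here since results are keyed by path downstream.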
diff --git a/internal/documents/corpus_test.go b/internal/documents/corpus_test.go
index b80c9b3..cb353b7 100644
--- a/internal/documents/corpus_test.go
+++ b/internal/documents/corpus_test.go
@@ -1,4 +1,4 @@
-package documents
+package documents_test
 
 import (
 	"fmt"
@@ -6,8 +6,9 @@ import (
 	"testing"
 
 	"github.com/oolong-sh/oolong/internal/config"
-	"github.com/oolong-sh/oolong/internal/linking/lexer"
+	"github.com/oolong-sh/oolong/internal/documents"
 	"github.com/oolong-sh/oolong/internal/linking/ngrams"
+	"github.com/oolong-sh/oolong/internal/state"
 )
 
 var cfg = config.OolongConfig{
@@ -20,28 +21,30 @@ var cfg = config.OolongConfig{
 }
 
 func TestReadNotesDirs(t *testing.T) {
+	state.InitState()
 	// TODO: actual tests with an example data directory
-	if err := ReadNotesDirs(); err != nil {
+	if err := documents.ReadNotesDirs(); err != nil {
 		t.Fatalf("Failed to read notes directories: %v\n", err)
 	}
+	s := state.State()
 
 	// write out tokens
-	b := []byte{}
-	for _, d := range state.Documents {
-		for _, t := range d.tokens {
-			if t.Value == lexer.BreakToken {
-				continue
-			}
-			b = append(b, []byte(fmt.Sprintf("%s, %s, %d\n", t.Lemma, t.Value, t.Zone))...)
-		}
-	}
-	if err := os.WriteFile("./tokens.txt", b, 0666); err != nil {
-		t.Fatalf("Failed to write tokens: %v\n", err)
-	}
+	// b := []byte{}
+	// for _, d := range s.Documents {
+	// 	for _, t := range d.tokens {
+	// 		if t.Value == lexer.BreakToken {
+	// 			continue
+	// 		}
+	// 		b = append(b, []byte(fmt.Sprintf("%s, %s, %d\n", t.Lemma, t.Value, t.Zone))...)
+	// 	}
+	// }
+	// if err := os.WriteFile("./tokens.txt", b, 0666); err != nil {
+	// 	t.Fatalf("Failed to write tokens: %v\n", err)
+	// }
 
-	b = append([]byte{}, []byte("ngram,weight,count\n")...)
-	for _, d := range state.Documents {
-		for _, ng := range d.ngrams {
+	b := append([]byte{}, []byte("ngram,weight,count\n")...)
+	for _, d := range s.Documents {
+		for _, ng := range d.NGrams {
 			b = append(b, []byte(fmt.Sprintf("%s, %f, %d\n", ng.Keyword(), ng.Weight(), ng.Count()))...)
 		}
 	}
@@ -50,9 +53,9 @@ func TestReadNotesDirs(t *testing.T) {
 	}
 
 	b = append([]byte{}, []byte("ngram,weight,count,ndocs\n")...)
-	mng := ngrams.FilterMeaningfulNGrams(state.NGrams, 2, int(float64(len(state.Documents))/1.5), 4.0)
-	for _, s := range mng {
-		b = append(b, []byte(fmt.Sprintf("%s,%f,%d,%d\n", s, state.NGrams[s].Weight(), state.NGrams[s].Count(), len(state.NGrams[s].Documents())))...)
+	mng := ngrams.FilterMeaningfulNGrams(s.NGrams, 2, int(float64(len(s.Documents))/1.5), 4.0)
+	for _, k := range mng {
+		b = append(b, []byte(fmt.Sprintf("%s,%f,%d,%d\n", k, s.NGrams[k].Weight(), s.NGrams[k].Count(), len(s.NGrams[k].Documents())))...)
 	}
 	if err := os.WriteFile("./meaningful-ngrams.csv", b, 0666); err != nil {
 		t.Fatalf("Failed to write out meaningful ngrams: %v\n", err)
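Moving the test into the external package documents_test means it can only touch exported identifiers, which also forces it to exercise the real wiring: state.InitState() must run before documents.ReadNotesDirs(), mirroring main.go, or the nil-UpdateState guard panics. A sketch of that minimal ordering (the testdata path is hypothetical, not part of this diff):

```go
package documents_test

import (
	"testing"

	"github.com/oolong-sh/oolong/internal/documents"
	"github.com/oolong-sh/oolong/internal/state"
)

// Ordering sketch: inject the state updater before any reads.
func TestReadDocumentsAfterInit(t *testing.T) {
	state.InitState() // assigns documents.UpdateState

	if err := documents.ReadDocuments("testdata/example.md"); err != nil {
		t.Fatalf("ReadDocuments failed: %v", err)
	}
}
```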
diff --git a/internal/documents/document.go b/internal/documents/document.go
index 0a45379..c9eeb3c 100644
--- a/internal/documents/document.go
+++ b/internal/documents/document.go
@@ -12,17 +12,13 @@ import (
 
 // Document type stores lexical tokens and NGrams for a single document
 type Document struct {
-	path   string
-	ngwgts map[string]float64
+	Path    string
+	Weights map[string]float64
+	NGrams  map[string]*ngrams.NGram
 
-	ngrams map[string]*ngrams.NGram
 	tokens []lexer.Lexeme
 }
 
-// Document implementation of Note interface
-func (d *Document) Path() string                       { return d.path }
-func (d *Document) KeywordWeights() map[string]float64 { return d.ngwgts }
-
 // Read in a single document file, lex, and generate NGrams
 // Wraps readDocument for explicit use with files
 func readDocumentByFile(documentPath string) (*Document, error) {
@@ -36,7 +32,7 @@ func readDocumentByFile(documentPath string) (*Document, error) {
 	if err != nil {
 		return nil, err
 	}
-	d.path = documentPath
+	d.Path = documentPath
 
 	return d, nil
 }
@@ -48,24 +44,16 @@ func readDocument(r io.Reader, documentPath string) (*Document, error) {
 	l.Lex(r)
 
 	doc := &Document{
-		path:   documentPath,
+		Path:   documentPath,
 		tokens: l.Output,
 	}
 
+	// extract ngrams from document
 	log.Printf("Generating NGrams for %s...\n", documentPath)
-	doc.ngrams = ngrams.Generate(doc.tokens, config.NGramRange(), doc.path)
+	doc.NGrams = ngrams.Generate(doc.tokens, config.NGramRange(), doc.Path)
 
-	// FIX: weight setting must occur after document NGram maps are merged
-	doc.setWeightsMap()
+	// initialize weights map here so later writes from state never hit a nil map
+	doc.Weights = make(map[string]float64, len(doc.NGrams))
 
 	return doc, nil
 }
-
-// Generate map of weights for all NGrams found in the document
-func (d *Document) setWeightsMap() {
-	wgts := make(map[string]float64)
-	for k, v := range d.ngrams {
-		wgts[k] = v.Documents()[d.path].DocumentWeight
-	}
-	d.ngwgts = wgts
-}
diff --git a/internal/documents/state.go b/internal/documents/state.go
deleted file mode 100644
index 025a1e7..0000000
--- a/internal/documents/state.go
+++ /dev/null
@@ -1,63 +0,0 @@
-package documents
-
-import (
-	"fmt"
-	"log"
-	"os"
-
-	"github.com/oolong-sh/oolong/internal/linking/ngrams"
-)
-
-// DOC:
-var state = Corpus{
-	Documents: map[string]*Document{},
-	NGrams:    map[string]*ngrams.NGram{},
-}
-
-// DOC:
-type Corpus struct {
-	Documents map[string]*Document
-	NGrams    map[string]*ngrams.NGram
-}
-
-// DOC:
-func State() Corpus { return state }
-
-// DOC:
-func updateState(docs []*Document) error {
-	log.Println("Updating state and recalculating weights...")
-
-	// update state documents
-	for _, doc := range docs {
-		state.Documents[doc.path] = doc
-	}
-
-	// merge resulting ngram maps
-	for _, d := range state.Documents {
-		ngrams.Merge(state.NGrams, d.ngrams)
-	}
-
-	// calculate weights
-	ngrams.CalcWeights(state.NGrams, len(state.Documents))
-	log.Println("Done calculating weights.")
-
-	//
-	// TEST: remove later
-	//
-	state := State()
-	b := append([]byte{}, []byte("ngram,weight,count,ndocs\n")...)
-	mng := ngrams.FilterMeaningfulNGrams(state.NGrams, 2, int(float64(len(state.Documents))/1.5), 4.0)
-	for _, s := range mng {
-		b = append(b, []byte(fmt.Sprintf("%s,%f,%d,%d\n", s, state.NGrams[s].Weight(), state.NGrams[s].Count(), len(state.NGrams[s].Documents())))...)
-	}
-	if err := os.WriteFile("./meaningful-ngrams.csv", b, 0666); err != nil {
-		panic(err)
-	}
-	//
-	// TEST: remove later
-	//
-
-	// TODO: other things? (file writes?)
-
-	return nil
-}
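The "initialize weights map" comment in readDocument above is load-bearing: the new state.UpdateState (later in this diff) writes into each document's Weights map after merging, and assigning to a nil map panics at runtime. A tiny standalone illustration of why the eager make matters:

```go
package main

func main() {
	var w map[string]float64 // nil map: reads are safe, writes panic

	_ = w["ngram"] // reading a missing key just returns the zero value

	// w["ngram"] = 1.0 // would panic: assignment to entry in nil map

	w = make(map[string]float64) // allocate before the first write
	w["ngram"] = 1.0
}
```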
diff --git a/internal/linking/lexer/lexer_utils.go b/internal/linking/lexer/lexer_utils.go
index 2f0063e..dcdf81d 100644
--- a/internal/linking/lexer/lexer_utils.go
+++ b/internal/linking/lexer/lexer_utils.go
@@ -109,8 +109,8 @@ func (l *Lexer) detectZone() {
 	peekBuffer, _ := l.br.Peek(32)
 
 	switch {
-	// FIX: handle remaining cases
-	// TODO: add capture group for code blocks (might just need a boolean flag for them)
+	// TODO: handle remaining cases
+	// - add capture group for code blocks (might just need a boolean flag for them)
 	case h1Pattern.Match(peekBuffer):
 		l.zone = H1
 	case h2Pattern.Match(peekBuffer):
diff --git a/internal/linking/ngrams/frequency.go b/internal/linking/ngrams/frequency.go
index 31fb499..cf03378 100644
--- a/internal/linking/ngrams/frequency.go
+++ b/internal/linking/ngrams/frequency.go
@@ -51,7 +51,6 @@ func tfidf(ngmap map[string]*NGram) {
 // Best Matching 25 -- Alternative matching function that doesn't downweight common terms as much
 // k1: controls saturation of TF (normally between 1.2 and 2)
 // b: controls document length normalization (0 is no normalization)
-// TODO: add bm25f modifications to account for zones -- add zone tracking to lexer (zones affect b, k1, idf)
 func bm25(ngmap map[string]*NGram) {
 	d := make(map[string]float64)
 	totalLength := 0.0
diff --git a/internal/linking/ngrams/ngram.go b/internal/linking/ngrams/ngram.go
index 1955c6a..e6026fc 100644
--- a/internal/linking/ngrams/ngram.go
+++ b/internal/linking/ngrams/ngram.go
@@ -110,6 +110,10 @@ func Merge(maps ...map[string]*NGram) {
 			// ngram key found in map, merge counts and document info
 			// weights should be calculated elsewhere after all merges are completed
 			v0.globalCount += vi.globalCount
+			// lower zones are considered better, take best
+			if v0.zone > vi.zone {
+				v0.zone = vi.zone
+			}
 			for dk, dv := range vi.documents {
 				v0.documents[dk] = dv
 			}
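The new internal/state package below closes the loop opened in corpus.go: documents declares a function variable, state assigns it at startup, and neither package imports the other, which is what breaks the import cycle. A self-contained sketch of the pattern, collapsed into a single package for brevity (names mirror the diff; the bodies are illustrative, not project code):

```go
package main

import "fmt"

// "documents" side: declares the hook it needs without importing "state".
type Document struct{ Path string }

var UpdateState func([]*Document) error // injected at startup

func ReadDocuments(paths ...string) error {
	if UpdateState == nil {
		panic("UpdateState was never injected")
	}
	docs := make([]*Document, 0, len(paths))
	for _, p := range paths {
		docs = append(docs, &Document{Path: p})
	}
	return UpdateState(docs)
}

// "state" side: provides the implementation and wires it in.
func InitState() {
	UpdateState = func(docs []*Document) error {
		for _, d := range docs {
			fmt.Println("indexed", d.Path)
		}
		return nil
	}
}

func main() {
	InitState() // must run before any reads, as main.go does below
	if err := ReadDocuments("a.md", "b.md"); err != nil {
		panic(err)
	}
}
```

The trade-off is that the compiler can no longer prove the hook is set, hence the runtime nil checks in ReadDocuments and ReadNotesDirs.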
diff --git a/internal/state/state.go b/internal/state/state.go
new file mode 100644
index 0000000..698556c
--- /dev/null
+++ b/internal/state/state.go
@@ -0,0 +1,91 @@
+package state
+
+import (
+	"fmt"
+	"log"
+	"os"
+
+	"github.com/oolong-sh/oolong/internal/documents"
+	"github.com/oolong-sh/oolong/internal/linking/ngrams"
+	"github.com/oolong-sh/oolong/pkg/keywords"
+	"github.com/oolong-sh/oolong/pkg/notes"
+)
+
+// application-wide persistent state of documents and ngrams
+var state OolongState
+
+type OolongState struct {
+	Documents map[string]*documents.Document
+	NGrams    map[string]*ngrams.NGram
+}
+
+// State getter
+func State() OolongState { return state }
+
+// Initialize oolong state variables and inject state updater function into documents
+func InitState() {
+	// instantiate persistent state
+	state = OolongState{
+		Documents: map[string]*documents.Document{},
+		NGrams:    map[string]*ngrams.NGram{},
+	}
+
+	// dependency injection of state updater function
+	documents.UpdateState = UpdateState
+}
+
+// Update application state information after file reads are performed
+func UpdateState(docs []*documents.Document) error {
+	log.Println("Updating state and recalculating weights...")
+
+	// update state documents
+	for _, doc := range docs {
+		state.Documents[doc.Path] = doc
+	}
+
+	// merge resulting ngram maps
+	for _, d := range state.Documents {
+		ngrams.Merge(state.NGrams, d.NGrams)
+	}
+
+	// calculate weights
+	ngrams.CalcWeights(state.NGrams, len(state.Documents))
+	log.Println("Done calculating weights.")
+
+	// update document weights after all weights are calculated
+	log.Println("Updating document weights...")
+	for ng, ngram := range state.NGrams {
+		for path, nginfo := range ngram.Documents() {
+			state.Documents[path].Weights[ng] = nginfo.DocumentWeight
+		}
+	}
+	log.Println("Done updating document weights.")
+
+	//
+	// TEST: remove later
+	//
+	st := State()
+	b := append([]byte{}, []byte("ngram,weight,count,ndocs\n")...)
+	mng := ngrams.FilterMeaningfulNGrams(st.NGrams, 2, int(float64(len(st.Documents))/1.5), 4.0)
+	for _, s := range mng {
+		b = append(b, []byte(fmt.Sprintf("%s,%f,%d,%d\n", s, st.NGrams[s].Weight(), st.NGrams[s].Count(), len(st.NGrams[s].Documents())))...)
+	}
+	if err := os.WriteFile("./meaningful-ngrams.csv", b, 0666); err != nil {
+		panic(err)
+	}
+	//
+	// TEST: remove later
+	//
+
+	// TODO: other things? (file writes?)
+
+	// serialize results for graph usage
+	if err := notes.SerializeDocuments(state.Documents); err != nil {
+		panic(err)
+	}
+	if err := keywords.SerializeNGrams(state.NGrams); err != nil {
+		panic(err)
+	}
+
+	return nil
+}
diff --git a/main.go b/main.go
index 3e5a3c6..2aae615 100644
--- a/main.go
+++ b/main.go
@@ -7,11 +7,13 @@ import (
 	"github.com/oolong-sh/oolong/internal/config"
 	"github.com/oolong-sh/oolong/internal/daemon"
 	"github.com/oolong-sh/oolong/internal/documents"
+	"github.com/oolong-sh/oolong/internal/state"
 )
 
 var daemonFlag = flag.Bool("no-daemon", false, "Run Oolong in no-daemon mode (not recommended)")
 
 func main() {
+	// read in config
 	cfg, err := config.Setup("~/.config/oolong.toml")
 	if err != nil {
 		fmt.Println(err)
@@ -19,13 +21,18 @@ func main() {
 	}
 	fmt.Println(cfg.PluginPaths)
 
+	// initialize state
+	state.InitState()
+
+	// read notes directories
 	err = documents.ReadNotesDirs()
 	if err != nil {
-		return
+		panic(err)
 	}
 
 	// go plugins.InitPlugins(&cfg)
 
+	// run daemon if --no-daemon flag is not passed
 	flag.Parse()
 	if !*daemonFlag {
 		daemon.Run()
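A note on the eval target at the top of this diff: Go's flag package only accepts the key=value form when assigning a value to a boolean flag, which is why the Makefile invocation is written as --no-daemon=true. A bare "--no-daemon true" would still set the flag (presence implies true) but leave "true" behind as a positional argument, and "--no-daemon false" would not do what it appears to. A minimal demonstration:

```go
package main

import (
	"flag"
	"fmt"
)

func main() {
	noDaemon := flag.Bool("no-daemon", false, "run without the daemon")
	flag.Parse()

	// go run . --no-daemon=true   -> no-daemon: true   args: []
	// go run . --no-daemon true   -> no-daemon: true   args: [true]
	// go run . --no-daemon false  -> no-daemon: true   args: [false] (!)
	fmt.Println("no-daemon:", *noDaemon, "args:", flag.Args())
}
```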
diff --git a/pkg/documents/document.go b/pkg/documents/document.go
deleted file mode 100644
index 4c53484..0000000
--- a/pkg/documents/document.go
+++ /dev/null
@@ -1,6 +0,0 @@
-package documents
-
-type Document interface {
-	Path() string
-	KeywordWeights() map[string]float64
-}
diff --git a/pkg/keywords/keyword.go b/pkg/keywords/keyword.go
deleted file mode 100644
index 4daf946..0000000
--- a/pkg/keywords/keyword.go
+++ /dev/null
@@ -1,6 +0,0 @@
-package keywords
-
-type Keyword interface {
-	Keyword() string
-	Weight() float64
-}
diff --git a/pkg/keywords/keywords.go b/pkg/keywords/keywords.go
new file mode 100644
index 0000000..9f45824
--- /dev/null
+++ b/pkg/keywords/keywords.go
@@ -0,0 +1,60 @@
+package keywords
+
+import (
+	"encoding/json"
+	"os"
+
+	"github.com/oolong-sh/oolong/internal/linking/ngrams"
+)
+
+var keywordsFile = "./oolong-keywords.json"
+
+type keyword struct {
+	Keyword string  `json:"keyword"`
+	Weight  float64 `json:"weight"`
+}
+
+// DOC:
+func SerializeNGrams(ngmap map[string]*ngrams.NGram) error {
+	keywords := ngramsToKeywords(ngmap)
+
+	err := serializeKeywords(keywords)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func serializeKeywords(keywords []keyword) error {
+	b, err := json.Marshal(keywords)
+	if err != nil {
+		return err
+	}
+
+	err = os.WriteFile(keywordsFile, b, 0644)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// TODO: parameterize filtering threshold (maybe a percentage?)
+func ngramsToKeywords(ngmap map[string]*ngrams.NGram) []keyword {
+	keywords := []keyword{}
+	threshold := 8.0
+
+	for k, v := range ngmap {
+		w := v.Weight()
+
+		if w > threshold {
+			keywords = append(keywords, keyword{
+				Keyword: k,
+				Weight:  w,
+			})
+		}
+	}
+
+	return keywords
+}
diff --git a/pkg/notes/notes.go b/pkg/notes/notes.go
new file mode 100644
index 0000000..2fe0a35
--- /dev/null
+++ b/pkg/notes/notes.go
@@ -0,0 +1,63 @@
+package notes
+
+import (
+	"encoding/json"
+	"os"
+
+	"github.com/oolong-sh/oolong/internal/documents"
+)
+
+var notesFile = "./oolong-notes.json"
+
+type note struct {
+	Path    string             `json:"path"`
+	Weights map[string]float64 `json:"weights"`
+}
+
+// DOC:
+func SerializeDocuments(documents map[string]*documents.Document) error {
+	notes := documentsToNotes(documents)
+
+	err := serializeNotes(notes)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func serializeNotes(notes []note) error {
+	b, err := json.Marshal(notes)
+	if err != nil {
+		return err
+	}
+
+	err = os.WriteFile(notesFile, b, 0644)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// TODO: parameterize filtering threshold (maybe as a percentage?)
+func documentsToNotes(documents map[string]*documents.Document) []note {
+	notes := []note{}
+	threshold := 8.0
+
+	for path, doc := range documents {
+		weights := map[string]float64{}
+		for ng, w := range doc.Weights {
+			if w > threshold {
+				weights[ng] = w
+			}
+		}
+
+		notes = append(notes, note{
+			Path:    path,
+			Weights: weights,
+		})
+	}
+
+	return notes
+}
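Both new pkg packages follow the same shape: filter by a hard-coded weight threshold, then marshal to a flat JSON file for the graph frontend. A minimal sketch of that round trip, with invented weights (the note struct, 8.0 threshold, and JSON tags mirror the diff):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type note struct {
	Path    string             `json:"path"`
	Weights map[string]float64 `json:"weights"`
}

func main() {
	weights := map[string]float64{"oolong": 9.2, "the": 0.3} // invented data
	const threshold = 8.0

	// keep only meaningful keywords, as documentsToNotes does
	kept := map[string]float64{}
	for k, v := range weights {
		if v > threshold {
			kept[k] = v
		}
	}

	b, err := json.Marshal([]note{{Path: "notes/tea.md", Weights: kept}})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // [{"path":"notes/tea.md","weights":{"oolong":9.2}}]
}
```

The TODO about parameterizing the threshold appears in both packages; since the 8.0 cutoff is duplicated in keywords.go and notes.go, a shared config value would keep the two outputs consistent.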