Commit

feat: output json serialization
ptdewey committed Nov 7, 2024
1 parent 851d5e8 commit 1b362f8
Showing 16 changed files with 302 additions and 137 deletions.
5 changes: 5 additions & 0 deletions Makefile
@@ -6,3 +6,8 @@ run:
 
 test:
 	go test ./...
+
+eval:
+	time go run main.go --no-daemon true
+	xsv sort -NRs weight meaningful-ngrams.csv -o meaningful-ngrams.csv
+	csvlens meaningful-ngrams.csv
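
A note on the new eval target (an observation, not part of the commit): it assumes the external xsv and csvlens tools are installed. xsv sort -NRs weight re-sorts the generated CSV on the weight column (-s) numerically (-N) in descending order (-R), writing the file back in place, and csvlens then opens the result in an interactive viewer.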
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
-# Oolong
+# Oolong Backend Daemon
 
 Coming soon...
1 change: 1 addition & 0 deletions internal/daemon/watcher.go
@@ -68,6 +68,7 @@ func runNotesDirsWatcher(dirs ...string) error {
 
 				// re-read document
 				documents.ReadDocuments(event.Name)
+				// TODO: add dedup timer to prevent multi-write calls
 			}
 		case err, ok := <-watcher.Errors:
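
The dedup TODO above points at a common fsnotify pattern: editors often emit several Write events for a single save, so the re-read gets coalesced behind a short timer. A minimal sketch of one way this could look (the debounce helper and all names in it are illustrative, not from the Oolong codebase):

package daemon

import (
	"sync"
	"time"
)

// Hypothetical debounce helper: each path gets a timer that is reset on every
// new event, so a burst of writes triggers only one callback invocation.
var (
	debounceMu sync.Mutex
	timers     = make(map[string]*time.Timer)
)

func debounce(path string, delay time.Duration, fn func()) {
	debounceMu.Lock()
	defer debounceMu.Unlock()
	if t, ok := timers[path]; ok {
		t.Stop() // cancel the pending call from the previous event
	}
	timers[path] = time.AfterFunc(delay, fn)
}

In the watcher loop this would replace the direct call, e.g. debounce(event.Name, 100*time.Millisecond, func() { documents.ReadDocuments(event.Name) }).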
49 changes: 34 additions & 15 deletions internal/documents/corpus.go
@@ -10,25 +10,34 @@ import (
 	"github.com/oolong-sh/oolong/internal/config"
 )
 
+// State updater function is dependency injected from state to avoid circular dependency
+var UpdateState func([]*Document) error
+
 // DOC: meant to be called with watcher
 // assumes paths should not be ignored (should be safe assumption due to watcher ignores)
 func ReadDocuments(paths ...string) error {
+	if UpdateState == nil {
+		panic("UpdateState was never instantiated.")
+	}
+
 	// read all input files, update state with documents
 	docs := readHandler(paths...)
 
 	// merge ngram maps and calculate weights
-	err := updateState(docs)
+	err := UpdateState(docs)
 	if err != nil {
 		return err
 	}
 
 	// TODO: all weights change, but may not need to be recalculated every time
 
 	return nil
 }
 
 // Read, lex, and extract NGrams for all documents in notes directories specified in config file
 func ReadNotesDirs() error {
+	if UpdateState == nil {
+		panic("UpdateState not instantiated.")
+	}
+
 	docs := []*Document{}
 	for _, dir := range config.NotesDirPaths() {
 		// extract all note file paths from notes directory
@@ -56,7 +65,7 @@ func ReadNotesDirs() error {
 	}
 
 	// merge maps and calculate weights
-	err := updateState(docs)
+	err := UpdateState(docs)
 	if err != nil {
 		return err
 	}
@@ -66,25 +75,35 @@ func ReadNotesDirs() error {
 
 // DOC:
 func readHandler(paths ...string) []*Document {
-	docs := make([]*Document, len(paths))
 	var wg sync.WaitGroup
+	docChan := make(chan *Document)
 
-	// perform a parallel read of found notes files
-	wg.Add(len(paths))
-	for i, p := range paths {
-		go func(i int, notePath string) {
+	// launch a goroutine for each file path and read in parallel
+	for _, p := range paths {
+		wg.Add(1)
+		go func(notePath string) {
+			defer wg.Done()
 			doc, err := readDocumentByFile(notePath)
 			if err != nil {
 				log.Printf("Failed to read file: '%s' %v", notePath, err)
 				return
 			}
-			// TODO: this could be changed to use channels
-			docs[i] = doc
-			wg.Done()
-		}(i, p)
+			// send the document via channel
+			docChan <- doc
+		}(p)
 	}
 
-	wg.Wait()
+	// close the channel once all goroutines are done
+	go func() {
+		wg.Wait()
+		close(docChan)
+	}()
+
+	// collect documents from the channel
+	var docs []*Document
+	for doc := range docChan {
+		docs = append(docs, doc)
+	}
 
-	// append results to output array
 	return docs
 }
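
For orientation, this is roughly how the injected UpdateState hook would be wired from the state side: state imports documents and assigns the callback, so documents never imports state and the cycle is broken. A sketch under assumptions (this commit only shows the UpdateState variable, its func([]*Document) error signature, and the state.State() accessor used in the test; the OolongState shape below is invented):

package state

import "github.com/oolong-sh/oolong/internal/documents"

// OolongState is a stand-in for whatever state.State() actually returns.
type OolongState struct {
	Documents map[string]*documents.Document
}

var s = OolongState{Documents: make(map[string]*documents.Document)}

// State returns the global state (accessor name taken from the test's state.State() call).
func State() *OolongState { return &s }

func init() {
	// Inject the updater so documents.ReadDocuments can push results back
	// into state without a documents -> state import cycle.
	documents.UpdateState = func(docs []*documents.Document) error {
		st := State()
		for _, d := range docs {
			st.Documents[d.Path] = d
		}
		// ngram-map merging and weight recalculation would happen here
		return nil
	}
}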
44 changes: 23 additions & 21 deletions internal/documents/corpus_test.go
@@ -1,13 +1,14 @@
-package documents
+package documents_test
 
 import (
 	"fmt"
 	"os"
 	"testing"
 
 	"github.com/oolong-sh/oolong/internal/config"
-	"github.com/oolong-sh/oolong/internal/linking/lexer"
+	"github.com/oolong-sh/oolong/internal/documents"
 	"github.com/oolong-sh/oolong/internal/linking/ngrams"
 	"github.com/oolong-sh/oolong/internal/state"
 )
@@ -20,28 +21,29 @@ var cfg = config.OolongConfig{
 }
 
 func TestReadNotesDirs(t *testing.T) {
+	s := state.State()
 	// TODO: actual tests with an example data directory
-	if err := ReadNotesDirs(); err != nil {
+	if err := documents.ReadNotesDirs(); err != nil {
 		t.Fatalf("Failed to read notes directories: %v\n", err)
 	}
 
-	// write out tokens
-	b := []byte{}
-	for _, d := range state.Documents {
-		for _, t := range d.tokens {
-			if t.Value == lexer.BreakToken {
-				continue
-			}
-			b = append(b, []byte(fmt.Sprintf("%s, %s, %d\n", t.Lemma, t.Value, t.Zone))...)
-		}
-	}
-	if err := os.WriteFile("./tokens.txt", b, 0666); err != nil {
-		t.Fatalf("Failed to write tokens: %v\n", err)
-	}
+	// b := []byte{}
+	// for _, d := range s.Documents {
+	// 	for _, t := range d.tokens {
+	// 		if t.Value == lexer.BreakToken {
+	// 			continue
+	// 		}
+	// 		b = append(b, []byte(fmt.Sprintf("%s, %s, %d\n", t.Lemma, t.Value, t.Zone))...)
+	// 	}
+	// }
+	// if err := os.WriteFile("./tokens.txt", b, 0666); err != nil {
+	// 	t.Fatalf("Failed to write tokens: %v\n", err)
+	// }
 
-	b = append([]byte{}, []byte("ngram,weight,count\n")...)
-	for _, d := range state.Documents {
-		for _, ng := range d.ngrams {
+	b := append([]byte{}, []byte("ngram,weight,count\n")...)
+	for _, d := range s.Documents {
+		for _, ng := range d.NGrams {
 			b = append(b, []byte(fmt.Sprintf("%s, %f, %d\n", ng.Keyword(), ng.Weight(), ng.Count()))...)
 		}
 	}
@@ -50,9 +52,9 @@ func TestReadNotesDirs(t *testing.T) {
 	}
 
 	b = append([]byte{}, []byte("ngram,weight,count,ndocs\n")...)
-	mng := ngrams.FilterMeaningfulNGrams(state.NGrams, 2, int(float64(len(state.Documents))/1.5), 4.0)
-	for _, s := range mng {
-		b = append(b, []byte(fmt.Sprintf("%s,%f,%d,%d\n", s, state.NGrams[s].Weight(), state.NGrams[s].Count(), len(state.NGrams[s].Documents())))...)
+	mng := ngrams.FilterMeaningfulNGrams(s.NGrams, 2, int(float64(len(s.Documents))/1.5), 4.0)
+	for _, k := range mng {
+		b = append(b, []byte(fmt.Sprintf("%s,%f,%d,%d\n", k, s.NGrams[k].Weight(), s.NGrams[k].Count(), len(s.NGrams[k].Documents())))...)
 	}
 	if err := os.WriteFile("./meaningful-ngrams.csv", b, 0666); err != nil {
 		t.Fatalf("Failed to write out meaningful ngrams: %v\n", err)
30 changes: 9 additions & 21 deletions internal/documents/document.go
@@ -12,17 +12,13 @@ import (
 
 // Document type stores lexical tokens and NGrams for a single document
 type Document struct {
-	path   string
-	ngwgts map[string]float64
+	Path    string
+	Weights map[string]float64
+	NGrams  map[string]*ngrams.NGram
 
-	ngrams map[string]*ngrams.NGram
 	tokens []lexer.Lexeme
 }
 
-// Document implementation of Note interface
-func (d *Document) Path() string                       { return d.path }
-func (d *Document) KeywordWeights() map[string]float64 { return d.ngwgts }
-
 // Read in a single document file, lex, and generate NGrams
 // Wraps readDocument for explicit use with files
 func readDocumentByFile(documentPath string) (*Document, error) {
@@ -36,7 +32,7 @@ func readDocumentByFile(documentPath string) (*Document, error) {
 	if err != nil {
 		return nil, err
 	}
-	d.path = documentPath
+	d.Path = documentPath
 
 	return d, nil
 }
@@ -48,24 +44,16 @@ func readDocument(r io.Reader, documentPath string) (*Document, error) {
 	l.Lex(r)
 
 	doc := &Document{
-		path:   documentPath,
+		Path:   documentPath,
 		tokens: l.Output,
 	}
 
 	// extract ngrams from document
 	log.Printf("Generating NGrams for %s...\n", documentPath)
-	doc.ngrams = ngrams.Generate(doc.tokens, config.NGramRange(), doc.path)
+	doc.NGrams = ngrams.Generate(doc.tokens, config.NGramRange(), doc.Path)
 
-	// FIX: weight setting must occur after document NGRam maps are merged
-	doc.setWeightsMap()
+	// initialize weights map to avoid nil pointer issues
+	doc.Weights = make(map[string]float64, len(doc.NGrams))
 
 	return doc, nil
 }
-
-// Generate map of weights for all NGrams found in the document
-func (d *Document) setWeightsMap() {
-	wgts := make(map[string]float64)
-	for k, v := range d.ngrams {
-		wgts[k] = v.Documents()[d.path].DocumentWeight
-	}
-	d.ngwgts = wgts
-}
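
With setWeightsMap gone and Weights now initialized empty, the per-document weights presumably get filled in after the global ngram merge, which is exactly what the removed FIX comment was asking for. A rough sketch of such a post-merge pass, modeled on the deleted helper's v.Documents()[d.path].DocumentWeight lookup (the populateWeights name and the ok-guard are assumptions):

// Hypothetical post-merge pass (sketch, not the commit's code). Runs after
// ngrams.Merge and weight recalculation, mirroring the removed setWeightsMap
// but at a point where the merged weights are final.
func populateWeights(docs []*Document) {
	for _, d := range docs {
		for k, v := range d.NGrams {
			if info, ok := v.Documents()[d.Path]; ok {
				d.Weights[k] = info.DocumentWeight
			}
		}
	}
}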
63 changes: 0 additions & 63 deletions internal/documents/state.go

This file was deleted.

4 changes: 2 additions & 2 deletions internal/linking/lexer/lexer_utils.go
@@ -109,8 +109,8 @@ func (l *Lexer) detectZone() {
 	peekBuffer, _ := l.br.Peek(32)
 
 	switch {
-	// FIX: handle remaining cases
-	// TODO: add capture group for code blocks (might just need a boolean flag for them)
+	// TODO: handle remaining cases
+	// - add capture group for code blocks (might just need a boolean flag for them)
 	case h1Pattern.Match(peekBuffer):
 		l.zone = H1
 	case h2Pattern.Match(peekBuffer):
1 change: 0 additions & 1 deletion internal/linking/ngrams/frequency.go
@@ -51,7 +51,6 @@ func tfidf(ngmap map[string]*NGram) {
 // Best Matching 25 -- Alternative matching function that doesn't downweight common terms as much
 // k1: controls saturation of TF (normally between 1.2 and 2)
 // b: controls document length normalization (0 is no normalization)
-// TODO: add bm25f modifications to account for zones -- add zone tracking to lexer (zones affect b, k1, idf)
 func bm25(ngmap map[string]*NGram) {
 	d := make(map[string]float64)
 	totalLength := 0.0
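
For reference, the k1 and b knobs documented above plug into the standard BM25 per-term score. A sketch of the textbook formula, not the body of Oolong's bm25 (which is truncated in this view):

package ngrams

import "math"

// bm25Score computes the textbook BM25 score of one term in one document.
// tf: term frequency in the document; docLen, avgLen: this document's length
// and the mean document length; n: documents containing the term; N: corpus
// size. k1 saturates tf (typically 1.2-2.0); b scales length normalization
// (0 disables it).
func bm25Score(tf, docLen, avgLen float64, n, N int, k1, b float64) float64 {
	idf := math.Log(1 + (float64(N)-float64(n)+0.5)/(float64(n)+0.5))
	return idf * (tf * (k1 + 1)) / (tf + k1*(1-b+b*docLen/avgLen))
}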
4 changes: 4 additions & 0 deletions internal/linking/ngrams/ngram.go
@@ -110,6 +110,10 @@ func Merge(maps ...map[string]*NGram) {
 			// ngram key found in map, merge counts and document info
 			// weights should be calculated elsewhere after all merges are completed
 			v0.globalCount += vi.globalCount
+			// lower zones are considered better, take best
+			if v0.zone > vi.zone {
+				v0.zone = vi.zone
+			}
 			for dk, dv := range vi.documents {
 				v0.documents[dk] = dv
 			}
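
Design note on the zone merge: since lower zone values rank higher, taking the minimum means an ngram that ever appears in a high-priority zone (an H1 heading, say, in the lexer's zone scheme) keeps that status globally, however often it also occurs in body text.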