Commit

feat: output json serialization
ptdewey committed Nov 7, 2024
1 parent 851d5e8 commit 1b362f8
Showing 16 changed files with 302 additions and 137 deletions.
5 changes: 5 additions & 0 deletions Makefile
@@ -6,3 +6,8 @@ run:
 
 test:
 	go test ./...
+
+eval:
+	time go run main.go --no-daemon true
+	xsv sort -NRs weight meaningful-ngrams.csv -o meaningful-ngrams.csv
+	csvlens meaningful-ngrams.csv
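
A note on the new eval target (an observation, not part of the commit): it assumes the external xsv and csvlens tools are installed. xsv sort -NRs weight re-sorts the generated CSV on the weight column (-s) numerically (-N) in descending order (-R), writing the file back in place, and csvlens then opens the result in an interactive viewer.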
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
-# Oolong
+# Oolong Backend Daemon
 
 Coming soon...
1 change: 1 addition & 0 deletions internal/daemon/watcher.go
@@ -68,6 +68,7 @@ func runNotesDirsWatcher(dirs ...string) error {
 
 				// re-read document
 				documents.ReadDocuments(event.Name)
+				// TODO: add dedup timer to prevent multi-write calls
 			}
 		case err, ok := <-watcher.Errors:
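
The dedup TODO above points at a common fsnotify pattern: editors often emit several Write events for a single save, so the re-read gets coalesced behind a short timer. A minimal sketch of one way this could look (the debounce helper and all names in it are illustrative, not from the Oolong codebase):

package daemon

import (
	"sync"
	"time"
)

// Hypothetical debounce helper: each path gets a timer that is reset on every
// new event, so a burst of writes triggers only one callback invocation.
var (
	debounceMu sync.Mutex
	timers     = make(map[string]*time.Timer)
)

func debounce(path string, delay time.Duration, fn func()) {
	debounceMu.Lock()
	defer debounceMu.Unlock()
	if t, ok := timers[path]; ok {
		t.Stop() // cancel the pending call from the previous event
	}
	timers[path] = time.AfterFunc(delay, fn)
}

In the watcher loop this would replace the direct call, e.g. debounce(event.Name, 100*time.Millisecond, func() { documents.ReadDocuments(event.Name) }).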
49 changes: 34 additions & 15 deletions internal/documents/corpus.go
@@ -10,25 +10,34 @@ import (
 	"github.com/oolong-sh/oolong/internal/config"
 )
 
+// State updater function is dependency injected from state to avoid circular dependency
+var UpdateState func([]*Document) error
+
 // DOC: meant to be called with watcher
 // assumes paths should not be ignored (should be safe assumption due to watcher ignores)
 func ReadDocuments(paths ...string) error {
+	if UpdateState == nil {
+		panic("UpdateState was never instantiated.")
+	}
+
 	// read all input files, update state with documents
 	docs := readHandler(paths...)
 
 	// merge ngram maps and calculate weights
-	err := updateState(docs)
+	err := UpdateState(docs)
 	if err != nil {
 		return err
 	}
 
 	// TODO: all weights change, but may not need to be recalculated every time
 
 	return nil
 }
 
 // Read, lex, and extract NGrams for all documents in notes directories specified in config file
 func ReadNotesDirs() error {
+	if UpdateState == nil {
+		panic("UpdateState not instantiated.")
+	}
+
 	docs := []*Document{}
 	for _, dir := range config.NotesDirPaths() {
 		// extract all note file paths from notes directory
@@ -56,7 +65,7 @@ func ReadNotesDirs() error {
 	}
 
 	// merge maps and calculate weights
-	err := updateState(docs)
+	err := UpdateState(docs)
 	if err != nil {
 		return err
 	}
@@ -66,25 +75,35 @@ func ReadNotesDirs() error {
 
 // DOC:
 func readHandler(paths ...string) []*Document {
-	docs := make([]*Document, len(paths))
 	var wg sync.WaitGroup
+	docChan := make(chan *Document)
 
-	// perform a parallel read of found notes files
-	wg.Add(len(paths))
-	for i, p := range paths {
-		go func(i int, notePath string) {
+	// launch a goroutine for each file path and read in parallel
+	for _, p := range paths {
+		wg.Add(1)
+		go func(notePath string) {
+			defer wg.Done()
 			doc, err := readDocumentByFile(notePath)
 			if err != nil {
 				log.Printf("Failed to read file: '%s' %v", notePath, err)
 				return
 			}
-			// TODO: this could be changed to use channels
-			docs[i] = doc
-			wg.Done()
-		}(i, p)
+			// send the document via channel
+			docChan <- doc
+		}(p)
 	}
 
-	wg.Wait()
+	// close the channel once all goroutines are done
+	go func() {
+		wg.Wait()
+		close(docChan)
+	}()
+
+	// collect documents from the channel
+	var docs []*Document
+	for doc := range docChan {
+		docs = append(docs, doc)
+	}
 
-	// append results to output array
 	return docs
 }
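
For orientation, this is roughly how the injected UpdateState hook would be wired from the state side: state imports documents and assigns the callback, so documents never imports state and the cycle is broken. A sketch under assumptions (this commit only shows the UpdateState variable, its func([]*Document) error signature, and the state.State() accessor used in the test; the OolongState shape below is invented):

package state

import "github.com/oolong-sh/oolong/internal/documents"

// OolongState is a stand-in for whatever state.State() actually returns.
type OolongState struct {
	Documents map[string]*documents.Document
}

var s = OolongState{Documents: make(map[string]*documents.Document)}

// State returns the global state (accessor name taken from the test's state.State() call).
func State() *OolongState { return &s }

func init() {
	// Inject the updater so documents.ReadDocuments can push results back
	// into state without a documents -> state import cycle.
	documents.UpdateState = func(docs []*documents.Document) error {
		st := State()
		for _, d := range docs {
			st.Documents[d.Path] = d
		}
		// ngram-map merging and weight recalculation would happen here
		return nil
	}
}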
44 changes: 23 additions & 21 deletions internal/documents/corpus_test.go
@@ -1,13 +1,14 @@
-package documents
+package documents_test
 
 import (
 	"fmt"
 	"os"
 	"testing"
 
 	"github.com/oolong-sh/oolong/internal/config"
-	"github.com/oolong-sh/oolong/internal/linking/lexer"
+	"github.com/oolong-sh/oolong/internal/documents"
 	"github.com/oolong-sh/oolong/internal/linking/ngrams"
 	"github.com/oolong-sh/oolong/internal/state"
 )
@@ -20,28 +21,29 @@ var cfg = config.OolongConfig{
 }
 
 func TestReadNotesDirs(t *testing.T) {
+	s := state.State()
 	// TODO: actual tests with an example data directory
-	if err := ReadNotesDirs(); err != nil {
+	if err := documents.ReadNotesDirs(); err != nil {
 		t.Fatalf("Failed to read notes directories: %v\n", err)
 	}
 
-	// write out tokens
-	b := []byte{}
-	for _, d := range state.Documents {
-		for _, t := range d.tokens {
-			if t.Value == lexer.BreakToken {
-				continue
-			}
-			b = append(b, []byte(fmt.Sprintf("%s, %s, %d\n", t.Lemma, t.Value, t.Zone))...)
-		}
-	}
-	if err := os.WriteFile("./tokens.txt", b, 0666); err != nil {
-		t.Fatalf("Failed to write tokens: %v\n", err)
-	}
+	// b := []byte{}
+	// for _, d := range s.Documents {
+	// 	for _, t := range d.tokens {
+	// 		if t.Value == lexer.BreakToken {
+	// 			continue
+	// 		}
+	// 		b = append(b, []byte(fmt.Sprintf("%s, %s, %d\n", t.Lemma, t.Value, t.Zone))...)
+	// 	}
+	// }
+	// if err := os.WriteFile("./tokens.txt", b, 0666); err != nil {
+	// 	t.Fatalf("Failed to write tokens: %v\n", err)
+	// }
 
-	b = append([]byte{}, []byte("ngram,weight,count\n")...)
-	for _, d := range state.Documents {
-		for _, ng := range d.ngrams {
+	b := append([]byte{}, []byte("ngram,weight,count\n")...)
+	for _, d := range s.Documents {
+		for _, ng := range d.NGrams {
 			b = append(b, []byte(fmt.Sprintf("%s, %f, %d\n", ng.Keyword(), ng.Weight(), ng.Count()))...)
 		}
 	}
@@ -50,9 +52,9 @@ func TestReadNotesDirs(t *testing.T) {
 	}
 
 	b = append([]byte{}, []byte("ngram,weight,count,ndocs\n")...)
-	mng := ngrams.FilterMeaningfulNGrams(state.NGrams, 2, int(float64(len(state.Documents))/1.5), 4.0)
-	for _, s := range mng {
-		b = append(b, []byte(fmt.Sprintf("%s,%f,%d,%d\n", s, state.NGrams[s].Weight(), state.NGrams[s].Count(), len(state.NGrams[s].Documents())))...)
+	mng := ngrams.FilterMeaningfulNGrams(s.NGrams, 2, int(float64(len(s.Documents))/1.5), 4.0)
+	for _, k := range mng {
+		b = append(b, []byte(fmt.Sprintf("%s,%f,%d,%d\n", k, s.NGrams[k].Weight(), s.NGrams[k].Count(), len(s.NGrams[k].Documents())))...)
 	}
 	if err := os.WriteFile("./meaningful-ngrams.csv", b, 0666); err != nil {
 		t.Fatalf("Failed to write out meaningful ngrams: %v\n", err)
30 changes: 9 additions & 21 deletions internal/documents/document.go
@@ -12,17 +12,13 @@ import (
 
 // Document type stores lexical tokens and NGrams for a single document
 type Document struct {
-	path   string
-	ngwgts map[string]float64
+	Path    string
+	Weights map[string]float64
+	NGrams  map[string]*ngrams.NGram
 
-	ngrams map[string]*ngrams.NGram
 	tokens []lexer.Lexeme
 }
 
-// Document implementation of Note interface
-func (d *Document) Path() string                       { return d.path }
-func (d *Document) KeywordWeights() map[string]float64 { return d.ngwgts }
-
 // Read in a single document file, lex, and generate NGrams
 // Wraps readDocument for explicit use with files
 func readDocumentByFile(documentPath string) (*Document, error) {
@@ -36,7 +32,7 @@ func readDocumentByFile(documentPath string) (*Document, error) {
 	if err != nil {
 		return nil, err
 	}
-	d.path = documentPath
+	d.Path = documentPath
 
 	return d, nil
 }
@@ -48,24 +44,16 @@ func readDocument(r io.Reader, documentPath string) (*Document, error) {
 	l.Lex(r)
 
 	doc := &Document{
-		path:   documentPath,
+		Path:   documentPath,
 		tokens: l.Output,
 	}
 
 	// extract ngrams from document
 	log.Printf("Generating NGrams for %s...\n", documentPath)
-	doc.ngrams = ngrams.Generate(doc.tokens, config.NGramRange(), doc.path)
+	doc.NGrams = ngrams.Generate(doc.tokens, config.NGramRange(), doc.Path)
 
-	// FIX: weight setting must occur after document NGRam maps are merged
-	doc.setWeightsMap()
+	// initialize weights map to avoid nil pointer issues
+	doc.Weights = make(map[string]float64, len(doc.NGrams))
 
 	return doc, nil
 }
-
-// Generate map of weights for all NGrams found in the document
-func (d *Document) setWeightsMap() {
-	wgts := make(map[string]float64)
-	for k, v := range d.ngrams {
-		wgts[k] = v.Documents()[d.path].DocumentWeight
-	}
-	d.ngwgts = wgts
-}
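
With setWeightsMap gone and Weights now initialized empty, the per-document weights presumably get filled in after the global ngram merge, which is exactly what the removed FIX comment was asking for. A rough sketch of such a post-merge pass, modeled on the deleted helper's v.Documents()[d.path].DocumentWeight lookup (the populateWeights name and the ok-guard are assumptions):

// Hypothetical post-merge pass (sketch, not the commit's code). Runs after
// ngrams.Merge and weight recalculation, mirroring the removed setWeightsMap
// but at a point where the merged weights are final.
func populateWeights(docs []*Document) {
	for _, d := range docs {
		for k, v := range d.NGrams {
			if info, ok := v.Documents()[d.Path]; ok {
				d.Weights[k] = info.DocumentWeight
			}
		}
	}
}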
63 changes: 0 additions & 63 deletions internal/documents/state.go

This file was deleted.

4 changes: 2 additions & 2 deletions internal/linking/lexer/lexer_utils.go
@@ -109,8 +109,8 @@ func (l *Lexer) detectZone() {
 	peekBuffer, _ := l.br.Peek(32)
 
 	switch {
-	// FIX: handle remaining cases
-	// TODO: add capture group for code blocks (might just need a boolean flag for them)
+	// TODO: handle remaining cases
+	// - add capture group for code blocks (might just need a boolean flag for them)
 	case h1Pattern.Match(peekBuffer):
 		l.zone = H1
 	case h2Pattern.Match(peekBuffer):
1 change: 0 additions & 1 deletion internal/linking/ngrams/frequency.go
@@ -51,7 +51,6 @@ func tfidf(ngmap map[string]*NGram) {
 // Best Matching 25 -- Alternative matching function that doesn't downweight common terms as much
 // k1: controls saturation of TF (normally between 1.2 and 2)
 // b: controls document length normalization (0 is no normalization)
-// TODO: add bm25f modifications to account for zones -- add zone tracking to lexer (zones affect b, k1, idf)
 func bm25(ngmap map[string]*NGram) {
 	d := make(map[string]float64)
 	totalLength := 0.0
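
For reference, the k1 and b knobs documented above plug into the standard BM25 per-term score. A sketch of the textbook formula, not the body of Oolong's bm25 (which is truncated in this view):

package ngrams

import "math"

// bm25Score computes the textbook BM25 score of one term in one document.
// tf: term frequency in the document; docLen, avgLen: this document's length
// and the mean document length; n: documents containing the term; N: corpus
// size. k1 saturates tf (typically 1.2-2.0); b scales length normalization
// (0 disables it).
func bm25Score(tf, docLen, avgLen float64, n, N int, k1, b float64) float64 {
	idf := math.Log(1 + (float64(N)-float64(n)+0.5)/(float64(n)+0.5))
	return idf * (tf * (k1 + 1)) / (tf + k1*(1-b+b*docLen/avgLen))
}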
4 changes: 4 additions & 0 deletions internal/linking/ngrams/ngram.go
@@ -110,6 +110,10 @@ func Merge(maps ...map[string]*NGram) {
 			// ngram key found in map, merge counts and document info
 			// weights should be calculated elsewhere after all merges are completed
 			v0.globalCount += vi.globalCount
+			// lower zones are considered better, take best
+			if v0.zone > vi.zone {
+				v0.zone = vi.zone
+			}
 			for dk, dv := range vi.documents {
 				v0.documents[dk] = dv
 			}
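
Design note on the zone merge: since lower zone values rank higher, taking the minimum means an ngram that ever appears in a high-priority zone (an H1 heading, say, in the lexer's zone scheme) keeps that status globally, however often it also occurs in body text.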