diff --git a/v3/core/chunk.go b/v3/core/chunk.go
index cb60f9c..b0c369a 100644
--- a/v3/core/chunk.go
+++ b/v3/core/chunk.go
@@ -261,13 +261,21 @@ func (g *GrokkerInternal) similarChunks(embedding []float64, tokenLimit int, fil
 // findChunks returns the most relevant chunks for a query, limited by tokenLimit.
 func (g *GrokkerInternal) findChunks(query string, tokenLimit int, files []string) (chunks []*Chunk, err error) {
 	defer Return(&err)
-	// get the embeddings for the query.
-	embeddings, err := g.createEmbeddings([]string{query})
+	// break the query into chunks.
+	queryChunks, err := g.chunksFromString(nil, query, g.embeddingTokenLimit)
 	Ck(err)
-	queryEmbedding := embeddings[0]
-	if queryEmbedding == nil {
+	// get the embeddings for the chunks.
+	var queryStrings []string
+	for _, chunk := range queryChunks {
+		queryStrings = append(queryStrings, chunk.text)
+	}
+	embeddings, err := g.createEmbeddings(queryStrings)
+	Ck(err)
+	if len(embeddings) == 0 {
 		return
 	}
+	// average the embeddings.
+	queryEmbedding := util.MeanVector(embeddings)
 	// find the most similar chunks.
 	chunks, err = g.similarChunks(queryEmbedding, tokenLimit, files)
 	Ck(err)
diff --git a/v3/core/cli.go b/v3/core/cli.go
index 33ba0f4..7b8a22a 100644
--- a/v3/core/cli.go
+++ b/v3/core/cli.go
@@ -239,6 +239,8 @@ func cmdInSlice(cmd string, cmds []string) bool {
 // We use this function instead of kong.Parse() so that we can pass in
 // the arguments to parse. This allows us to more easily test the
 // cli subcommands, and could later ease e.g. WASM usage.
+//
+// XXX note how gitea/tea does this, also uses urfave instead of kong
 func Cli(args []string, config *CliConfig) (rc int, err error) {
 	defer Return(&err)
 
diff --git a/v3/core/document.go b/v3/core/document.go
index 9202a35..20bc081 100644
--- a/v3/core/document.go
+++ b/v3/core/document.go
@@ -3,6 +3,7 @@ package core
 import (
 	"path/filepath"
 
+	"github.com/stevegt/envi"
 	. "github.com/stevegt/goadapt"
 )
 
@@ -41,6 +42,15 @@ func (g *GrokkerInternal) updateDocument(doc *Document) (updated bool, err error
 	// hash, offset, and length. We'll get embeddings later.
 	var newChunks []*Chunk
 	for _, chunk := range chunks {
+		if envi.Bool("DEBUG", false) {
+			// verify chunk text length
+			txt, err := g.chunkText(chunk, true, false)
+			Ck(err)
+			_, tokens, err := Tokenizer.Encode(txt)
+			Ck(err)
+			tc := len(tokens)
+			Assert(tc < g.embeddingTokenLimit, "chunk tokens %d exceeds limit %d: %v", tc, g.embeddingTokenLimit, chunk)
+		}
 		// setChunk unsets the stale bit if the chunk is already in the
 		// database.
 		// XXX move the stale bit unset to this loop instead, for readability.
@@ -53,6 +63,18 @@ func (g *GrokkerInternal) updateDocument(doc *Document) (updated bool, err error
 	Debug("found %d new chunks", len(newChunks))
 	// orphaned chunks will be garbage collected.
 
+	if envi.Bool("DEBUG", false) {
+		// verify newChunks text length
+		for _, chunk := range newChunks {
+			txt, err := g.chunkText(chunk, true, false)
+			Ck(err)
+			_, tokens, err := Tokenizer.Encode(txt)
+			Ck(err)
+			tc := len(tokens)
+			Assert(tc < g.embeddingTokenLimit, "chunk tokens %d exceeds limit %d: %v", tc, g.embeddingTokenLimit, chunk)
+		}
+	}
+
 	// For each new chunk, generate an embedding using the
 	// openai.Embedding.create() function. Store the embeddings for each
 	// chunk in a data structure such as a list or dictionary.
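
The findChunks change above relies on util.MeanVector to collapse the per-chunk query embeddings into a single query vector, but that helper is not shown in this diff. A minimal sketch, assuming it lives in the util package and simply takes the element-wise mean of equal-length, non-nil []float64 vectors:

package util

// MeanVector returns the element-wise mean of a set of vectors.
// Sketch only: assumes every vector is non-nil and has the same
// length, which holds for embeddings returned by a single model.
func MeanVector(vectors [][]float64) (mean []float64) {
	if len(vectors) == 0 {
		return
	}
	mean = make([]float64, len(vectors[0]))
	for _, v := range vectors {
		for i, x := range v {
			mean[i] += x
		}
	}
	for i := range mean {
		mean[i] /= float64(len(vectors))
	}
	return
}

Averaging lets findChunks handle queries longer than the embedding token limit, at the cost of blurring the query's meaning across its chunks.
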
diff --git a/v3/core/grokker.go b/v3/core/grokker.go
index 47d6582..88c7055 100644
--- a/v3/core/grokker.go
+++ b/v3/core/grokker.go
@@ -50,7 +50,7 @@ import (
 const (
 	// See the "Semantic Versioning" section of the README for
 	// information on API and db stability and versioning.
-	version = "3.0.9"
+	version = "3.0.10"
 )
 
 type GrokkerInternal struct {
diff --git a/v3/core/openai.go b/v3/core/openai.go
index bb33cb2..069dc11 100644
--- a/v3/core/openai.go
+++ b/v3/core/openai.go
@@ -34,8 +34,6 @@ func (g *GrokkerInternal) createEmbeddings(texts []string) (embeddings [][]float
 	// simply call c.CreateEmbeddings() once for each text chunk.
 	for i := 0; i < len(texts); i++ {
 		text := texts[i]
-		// XXX don't exceed max tokens
-
 		// set empty chunk embedding to nil
 		if len(text) == 0 {
 			embeddings = append(embeddings, nil)
@@ -46,7 +44,7 @@ func (g *GrokkerInternal) createEmbeddings(texts []string) (embeddings [][]float
 			Input: inputs,
 			Model: fabius_models.AdaEmbeddingV2,
 		}
-		// Debug("creating embedding for chunk %d of %d ...", i+1, len(texts))
+		Debug("creating embedding for chunk %d of %d ...", i+1, len(texts))
 		// Debug("text: %q", text)
 		// loop with backoff until we get a response
 		var res *openai.EmbeddingResponse
diff --git a/v3/go.mod b/v3/go.mod
index eb6c88c..73cb3c0 100644
--- a/v3/go.mod
+++ b/v3/go.mod
@@ -13,7 +13,10 @@ require (
 	github.com/tiktoken-go/tokenizer v0.1.0
 )
 
-require github.com/sergi/go-diff v1.3.1
+require (
+	github.com/sergi/go-diff v1.3.1
+	github.com/stevegt/envi v0.2.0
+)
 
 require (
 	github.com/dlclark/regexp2 v1.9.0 // indirect
diff --git a/v3/go.sum b/v3/go.sum
index 4bae341..44c8507 100644
--- a/v3/go.sum
+++ b/v3/go.sum
@@ -24,6 +24,9 @@ github.com/sashabaranov/go-openai v1.19.1 h1:lIAtrpgE6Lhc3avbWG7wV4zeRWVi4nymQ7I
 github.com/sashabaranov/go-openai v1.19.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
 github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
 github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
+github.com/stevegt/envi v0.2.0 h1:QIaNQz///ueqH/57ldwGQy0yRgaYr/d7JLBQfjg+RXs=
+github.com/stevegt/envi v0.2.0/go.mod h1:Z8w7bE5V9Ce3H02CWNTEYkW3zWE8ulgfROtEt6ydDoY=
+github.com/stevegt/goadapt v0.0.13/go.mod h1:BWNnTsXdIxaseRo0W/MoVgDeLNf+6L4S4fPhyAsBTi0=
 github.com/stevegt/goadapt v0.7.0 h1:brUmaaA4mr3hqQfglDAQh7/MVSWak52mEAOzfbSoMDg=
 github.com/stevegt/goadapt v0.7.0/go.mod h1:vquRbAl0Ek4iJHCvFUEDxziTsETR2HOT7r64NolhDKs=
 github.com/stevegt/semver v0.0.0-20230512043732-92220054a49f h1:erQJkdWx1bhOImDDPiVoNy+qP8sBoOJ8EsJ7gUiy8S8=
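
The new github.com/stevegt/envi dependency above is what gates the verification blocks added to v3/core/document.go earlier in this diff. Since the same assertion appears twice in updateDocument, a hypothetical helper (checkChunkTokens is not part of this diff) could express the DEBUG-gated check once, using only identifiers already present in the patch:

// checkChunkTokens is a hypothetical consolidation of the two DEBUG
// blocks in updateDocument: when DEBUG is set in the environment, it
// re-encodes a chunk's text and asserts the token count stays under
// the embedding token limit.
func (g *GrokkerInternal) checkChunkTokens(chunk *Chunk) {
	if !envi.Bool("DEBUG", false) {
		return
	}
	txt, err := g.chunkText(chunk, true, false)
	Ck(err)
	_, tokens, err := Tokenizer.Encode(txt)
	Ck(err)
	tc := len(tokens)
	Assert(tc < g.embeddingTokenLimit,
		"chunk tokens %d exceeds limit %d: %v", tc, g.embeddingTokenLimit, chunk)
}

With a helper like this, each DEBUG block collapses to a single g.checkChunkTokens(chunk) call, and the checks stay disabled unless DEBUG is set at runtime.
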