From f45d2ebba5afc916b2561091092bffb9ef0b0985 Mon Sep 17 00:00:00 2001
From: Thejas-bhat
Date: Tue, 3 Dec 2024 16:09:10 +0530
Subject: [PATCH] code comments

---
 index/scorch/introducer.go        |  1 +
 index/scorch/merge.go             |  8 +++++++-
 index/scorch/persister.go         | 18 +++++++++---------
 index/scorch/scorch_test.go       |  1 -
 index/scorch/snapshot_index_vr.go |  1 +
 search/scorer/scorer_term.go      |  1 +
 6 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go
index 0d472ac8b..d7864ddb8 100644
--- a/index/scorch/introducer.go
+++ b/index/scorch/introducer.go
@@ -429,6 +429,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
 	}
 
 	skipped := true
+	// make the newly merged segments part of the newSnapshot being constructed
 	for i, newMergedSegment := range nextMerge.new {
 		// checking if this newly merged segment is worth keeping based on
 		// obsoleted doc count since the merge intro started
diff --git a/index/scorch/merge.go b/index/scorch/merge.go
index be6de3863..1e318237d 100644
--- a/index/scorch/merge.go
+++ b/index/scorch/merge.go
@@ -443,6 +443,9 @@ type mergeTaskIntroStatus struct {
 	skipped bool
 }
 
+// mergedSegmentHistory is important when it comes to introducing multiple
+// merged segments in a single introducer channel push. That way there is a
+// check to ensure that the file count doesn't explode during the index's lifetime.
 type mergedSegmentHistory struct {
 	workerID     uint64
 	oldNewDocIDs []uint64
@@ -501,6 +504,9 @@ func (s *Scorch) mergeSegmentBasesParallel(snapshot *IndexSnapshot, flushableObj
 		newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
 		filename := zapFileName(newSegmentID)
 		path := s.path + string(os.PathSeparator) + filename
+
+		// the newly merged segment is already flushed out to disk; it just
+		// needs to be opened using mmap.
 		newDocNums, _, err :=
 			s.segPlugin.Merge(segsBatch, dropsBatch, path, s.closeCh, s)
 		if err != nil {
@@ -527,7 +533,7 @@ func (s *Scorch) mergeSegmentBasesParallel(snapshot *IndexSnapshot, flushableObj
 		// close the new merged segments
 		_ = closeNewMergedSegments(newMergedSegments)
-		// tbd: need a better way to handle error
+		// tbd: need a better way to consolidate errors
 		return nil, nil, errs[0]
 	}
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index c2d6754af..b48b49711 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -369,11 +369,13 @@ type flushable struct {
 	totDocs uint64
 }
 
-var DefaultNumPersisterWorkers = 1
+// number of workers which perform an in-memory merge of the segments in
+// parallel, followed by a flush operation.
+var DefaultNumPersisterWorkers = 4
 
 // maximum size of data that a single worker is allowed to perform the in-memory
 // merge operation.
-var DefaultMaxSizeInMemoryMerge = 0
+var DefaultMaxSizeInMemoryMerge = 200 * 1024 * 1024
 
 func legacyFlushBehaviour() bool {
 	// DefaultMaxSizeInMemoryMerge = 0 is a special value to preserve the leagcy
@@ -417,6 +419,8 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
 		flushSet = append(flushSet, val)
 	} else {
+		// constructs a flushSet where each flushable object contains a set of
+		// segments to be merged and flushed out to disk.
 		for i, snapshot := range snapshot.segment {
 			if totSize >= DefaultMaxSizeInMemoryMerge {
 				if len(sbs) >= DefaultMinSegmentsForInMemoryMerge {
@@ -480,12 +484,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
 		return false, nil
 	}
 
-	// deploy the workers, have a wait group which waits for the flush set to complete
-	// each worker
-	// 1. merges the segments using mergeSegmentBases()
-	// wait for group to finish
-	//
-	// construct equiv snapshot and do a persistSnapshotDirect()
+	// drains out (after merging in memory) the segments in the flushSet in parallel
 	newSnapshot, newSegmentIDs, err := s.mergeSegmentBasesParallel(snapshot, flushSet)
 	if err != nil {
 		return false, err
@@ -694,7 +693,8 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string,
 			}
 			filenames = append(filenames, filename)
 		case segment.UnpersistedSegment:
-			// need to persist this to disk
+			// need to persist this to disk if it's not part of the exclude list
+			// (which restricts which in-memory segments are persisted to disk)
 			if _, ok := exclude[segmentSnapshot.id]; !ok {
 				filename := zapFileName(segmentSnapshot.id)
 				path := filepath.Join(path, filename)
diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go
index d21156dd9..8165774e7 100644
--- a/index/scorch/scorch_test.go
+++ b/index/scorch/scorch_test.go
@@ -438,7 +438,6 @@ func TestIndexInsertThenDelete(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	fmt.Println("start delete")
 	err = idx.Delete("1")
 	if err != nil {
 		t.Errorf("Error deleting entry from index: %v", err)
 	}
diff --git a/index/scorch/snapshot_index_vr.go b/index/scorch/snapshot_index_vr.go
index 2d226214a..320364bc7 100644
--- a/index/scorch/snapshot_index_vr.go
+++ b/index/scorch/snapshot_index_vr.go
@@ -118,6 +118,7 @@ func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) (
 	nnum := next.Number()
 	rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset)
 	rv.Score = float64(next.Score())
+
 	i.currID = rv.ID
 	i.currPosting = next
diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go
index 883af40ab..ca268648b 100644
--- a/search/scorer/scorer_term.go
+++ b/search/scorer/scorer_term.go
@@ -94,6 +94,7 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
 
 	// update the query weight
 	s.queryWeight = s.queryBoost * s.idf * s.queryNorm
+
 	if s.options.Explain {
 		childrenExplanations := make([]*search.Explanation, 3)
 		childrenExplanations[0] = &search.Explanation{