import { tokenizer, createEmbedding } from './embeddingUtils.js';
import { cosineSimilarity } from './similarityUtils.js';

// -----------------------------------------------------------
// -- Function to create chunks of text based on similarity --
// -----------------------------------------------------------
export function createChunks(sentences, similarities, maxTokenSize, similarityThreshold, logging) {
    let chunks = [];
    let currentChunk = [sentences[0]];

    if (logging) {
        console.log('Initial sentence:', sentences[0]);
    }

    for (let i = 1; i < sentences.length; i++) {
        const nextSentence = sentences[i];

        // For cramit (when similarities is null), only check token size
        if (!similarities) {
            const currentChunkText = currentChunk.join(" ");
            const currentChunkSize = tokenizer(currentChunkText).input_ids.size;
            const nextSentenceTokenCount = tokenizer(nextSentence).input_ids.size;

            if (currentChunkSize + nextSentenceTokenCount <= maxTokenSize) {
                currentChunk.push(nextSentence);
            } else {
                chunks.push(currentChunkText);
                currentChunk = [nextSentence];
            }
            continue;
        }

        // Check similarity first for chunkit
        if (similarities[i - 1] >= similarityThreshold) {
            if (logging) {
                console.log(`Adding sentence ${i} with similarity ${similarities[i - 1]}`);
            }

            // Then check token size
            const currentChunkText = currentChunk.join(" ");
            const currentChunkSize = tokenizer(currentChunkText).input_ids.size;
            const nextSentenceTokenCount = tokenizer(nextSentence).input_ids.size;

            if (currentChunkSize + nextSentenceTokenCount <= maxTokenSize) {
                currentChunk.push(nextSentence);
            } else {
                chunks.push(currentChunkText);
                currentChunk = [nextSentence];
            }
        } else {
            if (logging) {
                console.log(`Starting new chunk at sentence ${i}, similarity was ${similarities[i - 1]}`);
            }
            chunks.push(currentChunk.join(" "));
            currentChunk = [nextSentence];
        }
    }

    if (currentChunk.length > 0) {
        chunks.push(currentChunk.join(" "));
    }

    return chunks;
}
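
// Illustrative usage (a hedged sketch, not part of the library): the shapes below
// are inferred from how this function indexes its arguments, i.e. similarities[i - 1]
// is assumed to score the adjacent pair (sentences[i - 1], sentences[i]), and the
// example values are made up.
//
//   const sentences = [
//       "The cat sat on the mat.",
//       "It purred quietly.",
//       "Quarterly revenue grew by 12%."
//   ];
//   const similarities = [0.82, 0.15]; // one score per adjacent sentence pair (assumed values)
//   const chunks = createChunks(sentences, similarities, 256, 0.5, false);
//   // Expected: two chunks -- the first two sentences merged, the third on its own,
//   // since 0.82 clears the 0.5 threshold and 0.15 does not.
//
//   // Passing null for similarities packs sentences purely by token budget (cramit mode):
//   const packed = createChunks(sentences, null, 256, 0.5, false);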

// --------------------------------------------------------------
// -- Optimize and Rebalance Chunks (optionally use Similarity) --
// --------------------------------------------------------------
export async function optimizeAndRebalanceChunks(combinedChunks, tokenizer, maxTokenSize, combineChunksSimilarityThreshold = 0.5) {
    let optimizedChunks = [];
    let currentChunkText = "";
    let currentChunkTokenCount = 0;
    let currentEmbedding = null;

    for (let index = 0; index < combinedChunks.length; index++) {
        const chunk = combinedChunks[index];
        const chunkTokenCount = tokenizer(chunk).input_ids.size;

        if (currentChunkText && (currentChunkTokenCount + chunkTokenCount <= maxTokenSize)) {
            const nextEmbedding = await createEmbedding(chunk);
            const similarity = currentEmbedding ? cosineSimilarity(currentEmbedding, nextEmbedding) : 0;

            if (similarity >= combineChunksSimilarityThreshold) {
                currentChunkText += " " + chunk;
                currentChunkTokenCount += chunkTokenCount;
                currentEmbedding = nextEmbedding;
                continue;
            }
        }

        if (currentChunkText) optimizedChunks.push(currentChunkText);
        currentChunkText = chunk;
        currentChunkTokenCount = chunkTokenCount;
        currentEmbedding = await createEmbedding(chunk);
    }

    if (currentChunkText) optimizedChunks.push(currentChunkText);

    return optimizedChunks.filter(chunk => chunk);
}
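
// Illustrative usage (a hedged sketch): `tokenizer` is taken as a parameter here
// rather than imported, so any tokenizer whose result exposes `.input_ids.size`
// should work; reusing the one from './embeddingUtils.js' is an assumption.
//
//   import { tokenizer } from './embeddingUtils.js';
//   const rebalanced = await optimizeAndRebalanceChunks(chunks, tokenizer, 512);
//   // Adjacent chunks are merged only while the combined text stays within 512 tokens
//   // and their embeddings score at or above the default 0.5 similarity threshold.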

// ------------------------------------------------
// -- Helper function to apply prefix to a chunk --
// ------------------------------------------------
export function applyPrefixToChunk(chunkPrefix, chunk) {
    if (chunkPrefix && chunkPrefix.trim()) {
        return `${chunkPrefix}: ${chunk}`;
    }
    return chunk;
}
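
// Illustrative usage (a hedged sketch): handy for embedding models that expect a
// task prefix on each passage; the prefix value shown is an assumption.
//
//   applyPrefixToChunk("search_document", "The cat sat on the mat.");
//   // => "search_document: The cat sat on the mat."
//
//   applyPrefixToChunk("  ", "The cat sat on the mat.");
//   // => "The cat sat on the mat." (empty or whitespace-only prefixes are ignored)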