/**
 * Break a single over-long segment into pieces of at most `limit` UTF-16
 * code units.
 *
 * Breaks are placed at the last whitespace character that keeps the piece
 * within the limit. When the window contains no whitespace the segment is cut
 * at the limit itself, backing off by one unit if that cut would land between
 * the two halves of a surrogate pair.
 *
 * @param {string} segment - Trimmed, non-empty text.
 * @param {number} limit - Maximum piece length; an integer >= 1 or Infinity.
 * @returns {string[]} Non-empty pieces, in order.
 */
const splitOversizedSegment = (segment, limit) => {
  const pieces = [];
  let rest = segment;

  while (rest.length > limit) {
    // Look for the right-most whitespace that still yields a piece <= limit.
    let cut = -1;
    for (let i = limit; i >= 1; i -= 1) {
      if (/\s/.test(rest[i])) {
        cut = i;
        break;
      }
    }

    if (cut === -1) {
      // No word boundary available: hard cut, but keep surrogate pairs intact.
      cut = limit;
      const lastUnit = rest.charCodeAt(cut - 1);
      if (cut > 1 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1;
      }
    }

    pieces.push(rest.slice(0, cut).trimEnd());
    rest = rest.slice(cut).trimStart();
  }

  if (rest.length > 0) {
    pieces.push(rest);
  }

  return pieces;
};

/**
 * Split text into chunks of at most `maxChunkSize` characters (UTF-16 code
 * units), packing whole sentences together where they fit.
 *
 * Sentences are detected by whitespace that follows `.`, `!` or `?`, and are
 * re-joined inside a chunk with a single space. The joining space counts
 * towards the limit. A sentence that is longer than the limit on its own is
 * wrapped at word boundaries (or hard-cut when it has none) so that no chunk
 * exceeds the limit.
 *
 * @param {string} text - Text to split. `null`/`undefined` yield `[]`.
 * @param {number} [maxChunkSize=1000] - Maximum chunk length. Values below 1
 *   (or NaN) cannot be enforced by cutting, so sentences are then left whole.
 * @returns {string[]} Trimmed, non-empty chunks in document order.
 */
const chunkText = (text, maxChunkSize = 1000) => {
  if (text == null) {
    return [];
  }

  // Only cut inside sentences when the limit is something we can satisfy.
  const canHardSplit = maxChunkSize >= 1;
  const hardLimit = Math.floor(maxChunkSize);

  const chunks = [];
  let currentChunk = '';

  for (const rawSentence of String(text).split(/(?<=[.!?])\s+/)) {
    const sentence = rawSentence.trim();
    if (sentence.length === 0) {
      continue;
    }

    const pieces = canHardSplit
      ? splitOversizedSegment(sentence, hardLimit)
      : [sentence];

    for (const piece of pieces) {
      // Measure the chunk exactly as it would be stored, separator included.
      const candidate = currentChunk ? `${currentChunk} ${piece}` : piece;

      if (currentChunk && candidate.length > maxChunkSize) {
        chunks.push(currentChunk);
        currentChunk = piece;
      } else {
        currentChunk = candidate;
      }
    }
  }

  if (currentChunk) {
    chunks.push(currentChunk);
  }

  return chunks;
};