diff --git a/src/utils/knowledge/embedding.ts b/src/utils/knowledge/embedding.ts index b0cc0da..d3e806f 100644 --- a/src/utils/knowledge/embedding.ts +++ b/src/utils/knowledge/embedding.ts @@ -27,6 +27,7 @@ export const generateChunks = async ( // Need to do this synchronously to avoid timeout on the embedding model API for (const chunk of documentChunks) { + console.log(chunk.pageContent); const embedding_vector = await embed(chunk.pageContent); result.push({ content: chunk.pageContent, @@ -43,8 +44,29 @@ const getTextSplitter = ( chunkOverlap: number, ): MarkdownTextSplitter | RecursiveCharacterTextSplitter => { switch (fileType) { - case 'text/markdown': - return new MarkdownTextSplitter(); + case 'markdown': + case 'html': + case 'cpp': + case 'go': + case 'java': + case 'js': + case 'php': + case 'proto': + case 'rst': + case 'scala': + case 'swift': + case 'sol': + return RecursiveCharacterTextSplitter.fromLanguage(fileType); + case 'ts': + return RecursiveCharacterTextSplitter.fromLanguage('js'); + case 'py': + return RecursiveCharacterTextSplitter.fromLanguage('python'); + case 'rb': + return RecursiveCharacterTextSplitter.fromLanguage('ruby'); + case 'rs': + return RecursiveCharacterTextSplitter.fromLanguage('rust'); + case 'md': + return RecursiveCharacterTextSplitter.fromLanguage('markdown'); default: return new RecursiveCharacterTextSplitter({ chunkSize, diff --git a/src/utils/knowledge/parsing.ts b/src/utils/knowledge/parsing.ts index fc7279e..e2f9636 100644 --- a/src/utils/knowledge/parsing.ts +++ b/src/utils/knowledge/parsing.ts @@ -1,8 +1,27 @@ -import mime from 'mime'; import * as pdfjs from 'pdfjs-dist'; import { TextItem } from 'pdfjs-dist/types/src/display/api'; -export const supportedInputFiles = ['.txt', '.md', '.pdf'].join(','); +export const supportedInputFiles = [ + '.txt', + '.md', + '.markdown', + '.pdf', + '.html', + '.cpp', + '.go', + '.java', + '.js', + '.ts', + '.php', + '.proto', + '.py', + '.rst', + '.rb', + '.rs', + '.scala', + '.swift', + '.sol', +].join(','); const extractTextFromPdfFile = async (file: File): Promise => { const pdfUrl = URL.createObjectURL(file); @@ -27,15 +46,31 @@ const extractTextFromPdfFile = async (file: File): Promise => { export const extractFileContent = async (file: File): Promise<{ type: string; content: string }> => { let extractedText = ''; - const fileType = mime.getType(file.name) ?? file.type; + const fileType = file.name.split('.').pop(); try { switch (fileType) { - case 'application/pdf': + case 'pdf': extractedText = await extractTextFromPdfFile(file); break; - case 'text/markdown': - case 'text/plain': + case 'markdown': + case 'md': + case 'txt': + case 'html': + case 'cpp': + case 'go': + case 'java': + case 'js': + case 'ts': + case 'php': + case 'proto': + case 'py': + case 'rst': + case 'rb': + case 'rs': + case 'scala': + case 'swift': + case 'sol': extractedText = await new Promise((resolve, reject) => { const reader = new FileReader(); reader.onload = (event) => resolve(event.target!.result as string);