From 5c4baa74e5b869ee607a54b8c88be5cb3fedf60b Mon Sep 17 00:00:00 2001 From: Reza Rahemtola Date: Wed, 11 Sep 2024 16:34:54 +0900 Subject: [PATCH] feat(kb): Support programming languages --- package-lock.json | 14 ---------- package.json | 1 - src/utils/knowledge/embedding.ts | 25 +++++++++++++++-- src/utils/knowledge/parsing.ts | 47 ++++++++++++++++++++++++++++---- 4 files changed, 64 insertions(+), 23 deletions(-) diff --git a/package-lock.json b/package-lock.json index 52aa93e..f5c633d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -27,7 +27,6 @@ "localforage": "^1.10.0", "marked": "^14.0.0", "marked-highlight": "^2.1.4", - "mime": "^4.0.4", "ml-distance": "^4.0.1", "pdfjs-dist": "^4.5.136", "pinia": "^2.2.1", @@ -13538,19 +13537,6 @@ "miller-rabin": "bin/miller-rabin" } }, - "node_modules/mime": { - "version": "4.0.4", - "funding": [ - "https://github.com/sponsors/broofa" - ], - "license": "MIT", - "bin": { - "mime": "bin/cli.js" - }, - "engines": { - "node": ">=16" - } - }, "node_modules/mime-db": { "version": "1.52.0", "license": "MIT", diff --git a/package.json b/package.json index 7f5eee9..fb6004e 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,6 @@ "localforage": "^1.10.0", "marked": "^14.0.0", "marked-highlight": "^2.1.4", - "mime": "^4.0.4", "ml-distance": "^4.0.1", "pdfjs-dist": "^4.5.136", "pinia": "^2.2.1", diff --git a/src/utils/knowledge/embedding.ts b/src/utils/knowledge/embedding.ts index b0cc0da..42e37d6 100644 --- a/src/utils/knowledge/embedding.ts +++ b/src/utils/knowledge/embedding.ts @@ -43,8 +43,29 @@ const getTextSplitter = ( chunkOverlap: number, ): MarkdownTextSplitter | RecursiveCharacterTextSplitter => { switch (fileType) { - case 'text/markdown': - return new MarkdownTextSplitter(); + case 'markdown': + case 'html': + case 'cpp': + case 'go': + case 'java': + case 'js': + case 'php': + case 'proto': + case 'rst': + case 'scala': + case 'swift': + case 'sol': + return 
RecursiveCharacterTextSplitter.fromLanguage(fileType); + case 'ts': + return RecursiveCharacterTextSplitter.fromLanguage('js'); + case 'py': + return RecursiveCharacterTextSplitter.fromLanguage('python'); + case 'rb': + return RecursiveCharacterTextSplitter.fromLanguage('ruby'); + case 'rs': + return RecursiveCharacterTextSplitter.fromLanguage('rust'); + case 'md': + return RecursiveCharacterTextSplitter.fromLanguage('markdown'); default: return new RecursiveCharacterTextSplitter({ chunkSize, diff --git a/src/utils/knowledge/parsing.ts b/src/utils/knowledge/parsing.ts index fc7279e..e2f9636 100644 --- a/src/utils/knowledge/parsing.ts +++ b/src/utils/knowledge/parsing.ts @@ -1,8 +1,27 @@ -import mime from 'mime'; import * as pdfjs from 'pdfjs-dist'; import { TextItem } from 'pdfjs-dist/types/src/display/api'; -export const supportedInputFiles = ['.txt', '.md', '.pdf'].join(','); +export const supportedInputFiles = [ + '.txt', + '.md', + '.markdown', + '.pdf', + '.html', + '.cpp', + '.go', + '.java', + '.js', + '.ts', + '.php', + '.proto', + '.py', + '.rst', + '.rb', + '.rs', + '.scala', + '.swift', + '.sol', +].join(','); const extractTextFromPdfFile = async (file: File): Promise<string> => { const pdfUrl = URL.createObjectURL(file); @@ -27,15 +46,31 @@ const extractTextFromPdfFile = async (file: File): Promise<string> => { export const extractFileContent = async (file: File): Promise<{ type: string; content: string }> => { let extractedText = ''; - const fileType = mime.getType(file.name) ?? 
file.type; + const fileType = file.name.split('.').pop(); try { switch (fileType) { - case 'application/pdf': + case 'pdf': extractedText = await extractTextFromPdfFile(file); break; - case 'text/markdown': - case 'text/plain': + case 'markdown': + case 'md': + case 'txt': + case 'html': + case 'cpp': + case 'go': + case 'java': + case 'js': + case 'ts': + case 'php': + case 'proto': + case 'py': + case 'rst': + case 'rb': + case 'rs': + case 'scala': + case 'swift': + case 'sol': extractedText = await new Promise((resolve, reject) => { const reader = new FileReader(); reader.onload = (event) => resolve(event.target!.result as string);