From 70f46064539ce984a3a3345fcfd1039b35b813e0 Mon Sep 17 00:00:00 2001
From: Reza Rahemtola
Date: Wed, 11 Sep 2024 16:34:54 +0900
Subject: [PATCH] feat(kb): Support programming languages

---
 package-lock.json                | 14 ------------
 package.json                     |  1 -
 src/utils/knowledge/embedding.ts | 21 +++++++++++++++++
 src/utils/knowledge/parsing.ts   | 39 +++++++++++++++++++++++++++++++-
 4 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 4d96c02..470a9d1 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -27,7 +27,6 @@
         "localforage": "^1.10.0",
         "marked": "^14.0.0",
         "marked-highlight": "^2.1.4",
-        "mime": "^4.0.4",
         "ml-distance": "^4.0.1",
         "pdfjs-dist": "^4.5.136",
         "pinia": "^2.2.1",
@@ -13537,19 +13536,6 @@
         "miller-rabin": "bin/miller-rabin"
       }
     },
-    "node_modules/mime": {
-      "version": "4.0.4",
-      "funding": [
-        "https://github.com/sponsors/broofa"
-      ],
-      "license": "MIT",
-      "bin": {
-        "mime": "bin/cli.js"
-      },
-      "engines": {
-        "node": ">=16"
-      }
-    },
     "node_modules/mime-db": {
       "version": "1.52.0",
       "license": "MIT",
diff --git a/package.json b/package.json
index 98faed2..93b205a 100644
--- a/package.json
+++ b/package.json
@@ -42,7 +42,6 @@
     "localforage": "^1.10.0",
     "marked": "^14.0.0",
     "marked-highlight": "^2.1.4",
-    "mime": "^4.0.4",
     "ml-distance": "^4.0.1",
     "pdfjs-dist": "^4.5.136",
     "pinia": "^2.2.1",
diff --git a/src/utils/knowledge/embedding.ts b/src/utils/knowledge/embedding.ts
index eb865fa..0373040 100644
--- a/src/utils/knowledge/embedding.ts
+++ b/src/utils/knowledge/embedding.ts
@@ -44,9 +44,30 @@ const getTextSplitter = (
 ): MarkdownTextSplitter | RecursiveCharacterTextSplitter => {
   switch (fileType) {
     case 'markdown':
+    case 'html':
+    case 'cpp':
+    case 'go':
+    case 'java':
+    case 'js':
+    case 'php':
+    case 'proto':
+    case 'rst':
+    case 'scala':
+    case 'swift':
+    case 'sol':
       return RecursiveCharacterTextSplitter.fromLanguage(fileType);
+    case 'ts':
+      return RecursiveCharacterTextSplitter.fromLanguage('js');
+    case 'py':
+      return RecursiveCharacterTextSplitter.fromLanguage('python');
+    case 'rb':
+      return RecursiveCharacterTextSplitter.fromLanguage('ruby');
+    case 'rs':
+      return RecursiveCharacterTextSplitter.fromLanguage('rust');
     case 'md':
       return RecursiveCharacterTextSplitter.fromLanguage('markdown');
+    case 'htm':
+      return RecursiveCharacterTextSplitter.fromLanguage('html');
     default:
       return new RecursiveCharacterTextSplitter({
         chunkSize,
diff --git a/src/utils/knowledge/parsing.ts b/src/utils/knowledge/parsing.ts
index aef9b01..d72e2d7 100644
--- a/src/utils/knowledge/parsing.ts
+++ b/src/utils/knowledge/parsing.ts
@@ -1,7 +1,28 @@
 import * as pdfjs from 'pdfjs-dist';
 import { TextItem } from 'pdfjs-dist/types/src/display/api';
 
-export const supportedInputFiles = ['.txt', '.md', '.markdown', '.pdf'].join(',');
+export const supportedInputFiles = [
+  '.txt',
+  '.md',
+  '.markdown',
+  '.pdf',
+  '.html',
+  '.htm',
+  '.cpp',
+  '.go',
+  '.java',
+  '.js',
+  '.ts',
+  '.php',
+  '.proto',
+  '.py',
+  '.rst',
+  '.rb',
+  '.rs',
+  '.scala',
+  '.swift',
+  '.sol',
+].join(',');
 
 const extractTextFromPdfFile = async (file: File): Promise => {
   const pdfUrl = URL.createObjectURL(file);
@@ -36,6 +57,22 @@ export const extractFileContent = async (file: File): Promise<{ type: string; co
     case 'markdown':
     case 'md':
     case 'txt':
+    case 'html':
+    case 'htm':
+    case 'cpp':
+    case 'go':
+    case 'java':
+    case 'js':
+    case 'ts':
+    case 'php':
+    case 'proto':
+    case 'py':
+    case 'rst':
+    case 'rb':
+    case 'rs':
+    case 'scala':
+    case 'swift':
+    case 'sol':
       extractedText = await new Promise((resolve, reject) => {
         const reader = new FileReader();
         reader.onload = (event) => resolve(event.target!.result as string);