Skip to content

Commit

Permalink
feat(kb): Support programming languages
Browse files (browse the repository at this point in the history)
  • Loading branch information
RezaRahemtola committed Sep 11, 2024
1 parent 23f6a23 commit 70f4606
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 16 deletions.
14 changes: 0 additions & 14 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion package.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -42,7 +42,6 @@
"localforage": "^1.10.0",
"marked": "^14.0.0",
"marked-highlight": "^2.1.4",
"mime": "^4.0.4",
"ml-distance": "^4.0.1",
"pdfjs-dist": "^4.5.136",
"pinia": "^2.2.1",
Expand Down
21 changes: 21 additions & 0 deletions src/utils/knowledge/embedding.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -44,9 +44,30 @@ const getTextSplitter = (
): MarkdownTextSplitter | RecursiveCharacterTextSplitter => {
switch (fileType) {
case 'markdown':
case 'html':
case 'cpp':
case 'go':
case 'java':
case 'js':
case 'php':
case 'proto':
case 'rst':
case 'scala':
case 'swift':
case 'sol':
return RecursiveCharacterTextSplitter.fromLanguage(fileType);
case 'ts':
return RecursiveCharacterTextSplitter.fromLanguage('js');
case 'py':
return RecursiveCharacterTextSplitter.fromLanguage('python');
case 'rb':
return RecursiveCharacterTextSplitter.fromLanguage('ruby');
case 'rs':
return RecursiveCharacterTextSplitter.fromLanguage('rust');
case 'md':
return RecursiveCharacterTextSplitter.fromLanguage('markdown');
case 'htm':
return RecursiveCharacterTextSplitter.fromLanguage('html');
default:
return new RecursiveCharacterTextSplitter({
chunkSize,
Expand Down
39 changes: 38 additions & 1 deletion src/utils/knowledge/parsing.ts
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,28 @@
import * as pdfjs from 'pdfjs-dist';
import { TextItem } from 'pdfjs-dist/types/src/display/api';

export const supportedInputFiles = ['.txt', '.md', '.markdown', '.pdf'].join(',');
// Dot-prefixed file extensions, in the order they appear in the joined string below.
const supportedFileExtensions = [
  '.txt',
  '.md',
  '.markdown',
  '.pdf',
  '.html',
  '.htm',
  '.cpp',
  '.go',
  '.java',
  '.js',
  '.ts',
  '.php',
  '.proto',
  '.py',
  '.rst',
  '.rb',
  '.rs',
  '.scala',
  '.swift',
  '.sol',
] as const;

/**
 * Comma-separated string of every supported file extension (e.g. `.txt,.md,...`).
 *
 * NOTE(review): presumably consumed as the `accept` value of a file input —
 * confirm against the caller.
 */
export const supportedInputFiles = supportedFileExtensions.join(',');

const extractTextFromPdfFile = async (file: File): Promise<string> => {
const pdfUrl = URL.createObjectURL(file);
Expand Down Expand Up @@ -36,6 +57,22 @@ export const extractFileContent = async (file: File): Promise<{ type: string; co
case 'markdown':
case 'md':
case 'txt':
case 'html':
case 'htm':
case 'cpp':
case 'go':
case 'java':
case 'js':
case 'ts':
case 'php':
case 'proto':
case 'py':
case 'rst':
case 'rb':
case 'rs':
case 'scala':
case 'swift':
case 'sol':
extractedText = await new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => resolve(event.target!.result as string);
Expand Down

0 comments on commit 70f4606

Please sign in to comment.