Skip to content

Commit

Permalink
feat(kb): Support programming languages
Browse files: browse the repository at this point in the history
  • Loading branch information
RezaRahemtola committed Sep 11, 2024
1 parent 9cf69f2 commit 5c4baa7
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 23 deletions.
14 changes: 0 additions & 14 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
"localforage": "^1.10.0",
"marked": "^14.0.0",
"marked-highlight": "^2.1.4",
"mime": "^4.0.4",
"ml-distance": "^4.0.1",
"pdfjs-dist": "^4.5.136",
"pinia": "^2.2.1",
Expand Down
25 changes: 23 additions & 2 deletions src/utils/knowledge/embedding.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,29 @@ const getTextSplitter = (
chunkOverlap: number,
): MarkdownTextSplitter | RecursiveCharacterTextSplitter => {
switch (fileType) {
case 'text/markdown':
return new MarkdownTextSplitter();
case 'markdown':
case 'html':
case 'cpp':
case 'go':
case 'java':
case 'js':
case 'php':
case 'proto':
case 'rst':
case 'scala':
case 'swift':
case 'sol':
return RecursiveCharacterTextSplitter.fromLanguage(fileType);
case 'ts':
return RecursiveCharacterTextSplitter.fromLanguage('js');
case 'py':
return RecursiveCharacterTextSplitter.fromLanguage('python');
case 'rb':
return RecursiveCharacterTextSplitter.fromLanguage('ruby');
case 'rs':
return RecursiveCharacterTextSplitter.fromLanguage('rust');
case 'md':
return RecursiveCharacterTextSplitter.fromLanguage('markdown');
default:
return new RecursiveCharacterTextSplitter({
chunkSize,
Expand Down
47 changes: 41 additions & 6 deletions src/utils/knowledge/parsing.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,27 @@
import mime from 'mime';
import * as pdfjs from 'pdfjs-dist';
import { TextItem } from 'pdfjs-dist/types/src/display/api';

export const supportedInputFiles = ['.txt', '.md', '.pdf'].join(',');
/**
 * Comma-separated list of dot-prefixed file extensions: plain text, Markdown,
 * PDF, HTML, and a set of source-code extensions.
 *
 * NOTE(review): presumably used as the `accept` value of a file input — confirm
 * against callers.
 */
export const supportedInputFiles = [
  // Documents
  'txt',
  'md',
  'markdown',
  'pdf',
  'html',
  // Source code
  'cpp',
  'go',
  'java',
  'js',
  'ts',
  'php',
  'proto',
  'py',
  'rst',
  'rb',
  'rs',
  'scala',
  'swift',
  'sol',
]
  .map((extension) => `.${extension}`)
  .join(',');

const extractTextFromPdfFile = async (file: File): Promise<string> => {
const pdfUrl = URL.createObjectURL(file);
Expand All @@ -27,15 +46,31 @@ const extractTextFromPdfFile = async (file: File): Promise<string> => {

export const extractFileContent = async (file: File): Promise<{ type: string; content: string }> => {
let extractedText = '';
const fileType = mime.getType(file.name) ?? file.type;
const fileType = file.name.split('.').pop();

try {
switch (fileType) {
case 'application/pdf':
case 'pdf':
extractedText = await extractTextFromPdfFile(file);
break;
case 'text/markdown':
case 'text/plain':
case 'markdown':
case 'md':
case 'txt':
case 'html':
case 'cpp':
case 'go':
case 'java':
case 'js':
case 'ts':
case 'php':
case 'proto':
case 'py':
case 'rst':
case 'rb':
case 'rs':
case 'scala':
case 'swift':
case 'sol':
extractedText = await new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => resolve(event.target!.result as string);
Expand Down

0 comments on commit 5c4baa7

Please sign in to comment.