Skip to content

Commit

Permalink
feat(kb): Support programming languages
Browse files (browse the repository at this point in the history)
  • Loading branch information
RezaRahemtola committed Sep 11, 2024
1 parent 9cf69f2 commit 1648bcc
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 8 deletions.
26 changes: 24 additions & 2 deletions src/utils/knowledge/embedding.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ export const generateChunks = async (

// Need to do this synchronously to avoid timeout on the embedding model API
for (const chunk of documentChunks) {
console.log(chunk.pageContent);
const embedding_vector = await embed(chunk.pageContent);
result.push({
content: chunk.pageContent,
Expand All @@ -43,8 +44,29 @@ const getTextSplitter = (
chunkOverlap: number,
): MarkdownTextSplitter | RecursiveCharacterTextSplitter => {
switch (fileType) {
case 'text/markdown':
return new MarkdownTextSplitter();
case 'markdown':
case 'html':
case 'cpp':
case 'go':
case 'java':
case 'js':
case 'php':
case 'proto':
case 'rst':
case 'scala':
case 'swift':
case 'sol':
return RecursiveCharacterTextSplitter.fromLanguage(fileType);
case 'ts':
return RecursiveCharacterTextSplitter.fromLanguage('js');
case 'py':
return RecursiveCharacterTextSplitter.fromLanguage('python');
case 'rb':
return RecursiveCharacterTextSplitter.fromLanguage('ruby');
case 'rs':
return RecursiveCharacterTextSplitter.fromLanguage('rust');
case 'md':
return RecursiveCharacterTextSplitter.fromLanguage('markdown');
default:
return new RecursiveCharacterTextSplitter({
chunkSize,
Expand Down
47 changes: 41 additions & 6 deletions src/utils/knowledge/parsing.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,27 @@
import mime from 'mime';
import * as pdfjs from 'pdfjs-dist';
import { TextItem } from 'pdfjs-dist/types/src/display/api';

export const supportedInputFiles = ['.txt', '.md', '.pdf'].join(',');
/**
 * Comma-separated list of the dot-prefixed file extensions accepted as
 * knowledge-base input (plain text, Markdown, PDF, HTML and a set of
 * programming-language source files).
 *
 * NOTE(review): presumably consumed as a file-picker `accept` value — confirm
 * at the call site.
 */
export const supportedInputFiles = [
  // Documents
  'txt',
  'md',
  'markdown',
  'pdf',
  'html',
  // Source code
  'cpp',
  'go',
  'java',
  'js',
  'ts',
  'php',
  'proto',
  'py',
  'rst',
  'rb',
  'rs',
  'scala',
  'swift',
  'sol',
]
  .map((extension) => `.${extension}`)
  .join(',');

const extractTextFromPdfFile = async (file: File): Promise<string> => {
const pdfUrl = URL.createObjectURL(file);
Expand All @@ -27,15 +46,31 @@ const extractTextFromPdfFile = async (file: File): Promise<string> => {

export const extractFileContent = async (file: File): Promise<{ type: string; content: string }> => {
let extractedText = '';
const fileType = mime.getType(file.name) ?? file.type;
const fileType = file.name.split('.').pop();

try {
switch (fileType) {
case 'application/pdf':
case 'pdf':
extractedText = await extractTextFromPdfFile(file);
break;
case 'text/markdown':
case 'text/plain':
case 'markdown':
case 'md':
case 'txt':
case 'html':
case 'cpp':
case 'go':
case 'java':
case 'js':
case 'ts':
case 'php':
case 'proto':
case 'py':
case 'rst':
case 'rb':
case 'rs':
case 'scala':
case 'swift':
case 'sol':
extractedText = await new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => resolve(event.target!.result as string);
Expand Down

0 comments on commit 1648bcc

Please sign in to comment.