Skip to content

Commit

Permalink
Can now upload files in batches with upsert
Browse files Browse the repository at this point in the history
  • Loading branch information
overmode committed Jan 8, 2025
1 parent b5e2577 commit 14386a5
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 71 deletions.
97 changes: 28 additions & 69 deletions front/components/data_source/MultipleDocumentsUpload.tsx
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import { Dialog, useSendNotification } from "@dust-tt/sparkle";
import { Dialog } from "@dust-tt/sparkle";
import type {
DataSourceViewType,
LightWorkspaceType,
PlanType,
} from "@dust-tt/types";
import { Err, getSupportedNonImageFileExtensions } from "@dust-tt/types";
import { concurrentExecutor } from "@dust-tt/types";
import { getSupportedNonImageFileExtensions } from "@dust-tt/types";
import type { ChangeEvent } from "react";
import { useCallback, useEffect, useRef, useState } from "react";

Expand All @@ -14,8 +15,7 @@ import type {
FileBlobWithFileId,
} from "@app/hooks/useFileUploaderService";
import { useFileUploaderService } from "@app/hooks/useFileUploaderService";
import { useCreateDataSourceViewDocument } from "@app/lib/swr/data_source_view_documents";
import { getFileProcessedUrl } from "@app/lib/swr/file";
import { useUpsertFileAsDatasourceEntry } from "@app/lib/swr/file";

type MultipleDocumentsUploadProps = {
dataSourceView: DataSourceViewType;
Expand All @@ -34,7 +34,6 @@ export const MultipleDocumentsUpload = ({
totalNodesCount,
plan,
}: MultipleDocumentsUploadProps) => {
const sendNotification = useSendNotification();
const fileInputRef = useRef<HTMLInputElement>(null);
const [isLimitPopupOpen, setIsLimitPopupOpen] = useState(false);
const [wasOpened, setWasOpened] = useState(isOpen);
Expand All @@ -50,31 +49,16 @@ export const MultipleDocumentsUpload = ({
[onClose]
);

const getFileProcessedContent = useCallback(
async (fileId: string) => {
const url = getFileProcessedUrl(owner, fileId);
const res = await fetch(url);
if (!res.ok) {
return new Err(`Error reading the file content: ${res.status}`);
}
const content = await res.text();
if (content === null || content === "") {
return new Err("Empty file content");
}
return content;
},
[owner]
);

// Used for creating files, with text extraction post-processing
const fileUploaderService = useFileUploaderService({
owner,
useCase: "folder_document",
});

// Mutation for creating documents, throw error on partial failure
const doCreate = useCreateDataSourceViewDocument(owner, dataSourceView);

const doUpsertFileAsDataSourceEntry = useUpsertFileAsDatasourceEntry(
owner,
dataSourceView
);
const [isBulkFilesUploading, setIsBulkFilesUploading] = useState<null | {
total: number;
completed: number;
Expand Down Expand Up @@ -105,7 +89,7 @@ export const MultipleDocumentsUpload = ({
completed: 0,
});

// upload Files and get FileBlobs (only keep successfull uploads)
// upload Files and get FileBlobs (only keep successful uploads)
// Each individual error triggers a notification
const fileBlobs = (await fileUploaderService.handleFileChange(e))?.filter(
(fileBlob: FileBlob): fileBlob is FileBlobWithFileId =>
Expand All @@ -119,62 +103,37 @@ export const MultipleDocumentsUpload = ({
}

// upsert the file as Data Source Documents
// Done 1 by 1 for simplicity
let i = 0;
for (const blob of fileBlobs) {
setIsBulkFilesUploading({
total: fileBlobs.length,
completed: i++,
});
// TODO : use an upsert endpoint here that will handle the upsert of the file

// get processed text
const content = await getFileProcessedContent(blob.fileId);
if (content instanceof Err) {
sendNotification({
type: "error",
title: `Error processing document ${blob.filename}`,
description: content.error,
await concurrentExecutor(
fileBlobs,
async (blob: { fileId: string; filename: string }) => {
// This also notifies in case of error
await doUpsertFileAsDataSourceEntry({
fileId: blob.fileId,
upsertArgs: {
title: blob.filename,
name: blob.filename,
},
});
continue;
}

// Create the document
const body = {
name: blob.filename,
title: blob.filename,
mime_type: blob.contentType ?? undefined,
timestamp: null,
parent_id: null,
parents: [blob.filename],
section: {
prefix: null,
content: content,
sections: [],
},
text: null,
source_url: undefined,
tags: [],
light_document_output: true,
upsert_context: null,
async: false,
};
await doCreate(body);
}

setIsBulkFilesUploading((prev) => ({
total: fileBlobs.length,
completed: prev ? prev.completed + 1 : 1,
}));
},
{ concurrency: 8 }
);

// Reset the upload state
setIsBulkFilesUploading(null);
fileUploaderService.resetUpload();
close(true);
},
[
doCreate,
fileUploaderService,
getFileProcessedContent,
close,
plan.limits.dataSources.documents.count,
sendNotification,
totalNodesCount,
doUpsertFileAsDataSourceEntry,
]
);

Expand Down
6 changes: 5 additions & 1 deletion front/lib/api/files/upsert.ts
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,11 @@ const getProcessingFunction = ({
case "application/json":
case "application/xml":
case "application/x-sh":
if (useCase === "conversation" || useCase === "tool_output") {
if (
useCase === "conversation" ||
useCase === "tool_output" ||
useCase === "folder_document"
) {
return upsertDocumentToDatasource;
}
break;
Expand Down
3 changes: 2 additions & 1 deletion front/pages/api/w/[wId]/data_sources/[dsId]/files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import { apiError } from "@app/logger/withlogging";

export interface UpsertFileToDataSourceRequestBody {
fileId: string;
upsertArgs:
upsertArgs?:
| Pick<UpsertDocumentArgs, "name" | "title" | "tags">
| Pick<
UpsertTableArgs,
Expand Down Expand Up @@ -102,6 +102,7 @@ async function handler(
{ file, upsertArgs: upsertArgs }
);
if (rUpsert.isErr()) {
console.log("ERROR UPSERTING FILE", rUpsert);
return apiError(req, res, {
status_code: 500,
api_error: {
Expand Down

0 comments on commit 14386a5

Please sign in to comment.