diff --git a/backend/database/data_profile_manager.py b/backend/database/data_profile_manager.py index 88c56d6..499989e 100644 --- a/backend/database/data_profile_manager.py +++ b/backend/database/data_profile_manager.py @@ -5,9 +5,14 @@ class DataProfileManager: def __init__(self, session): self.session = session - def get_dataprofile_by_name(self, name): + def get_dataprofile_by_name_and_org(self, name, org_id) -> DataProfile: """Retrieve a DataProfile by its name.""" - return self.session.query(DataProfile).filter(DataProfile.name == name).first() + return ( + self.session.query(DataProfile) + .filter(DataProfile.name == name) + .filter(DataProfile.organization_id == org_id) + .first() + ) def get_all_data_profiles(self): """Retrieve all DataProfiles.""" diff --git a/backend/envs/dev/initialization/setup_dev_environment.py b/backend/envs/dev/initialization/setup_dev_environment.py index f82b97b..74c8d03 100644 --- a/backend/envs/dev/initialization/setup_dev_environment.py +++ b/backend/envs/dev/initialization/setup_dev_environment.py @@ -178,7 +178,9 @@ def create_sample_dataprofile(): # Using DatabaseManager to manage the database session with DatabaseManager() as session: profile_manager = DataProfileManager(session) - existing_profile = profile_manager.get_dataprofile_by_name(sample_profile.name) + existing_profile = profile_manager.get_dataprofile_by_name_and_org( + sample_profile.name, 1 + ) if not existing_profile: profile_manager.create_dataprofile(sample_profile) logger.debug("Sample data profile created.") diff --git a/backend/llms/gpt.py b/backend/llms/gpt.py index 0d0a2e7..a2bfc1b 100644 --- a/backend/llms/gpt.py +++ b/backend/llms/gpt.py @@ -7,6 +7,7 @@ from database.database_manager import DatabaseManager from llms.prompt_manager import PromptManager from llms.system_message_manager import SystemMessageManager +from models.data_profile import DataProfile from models.user import User from openai import ChatCompletion from settings import OPENAI_API_KEY @@ -402,11 +403,12 @@ def generate_text(self, input_text): return assistant_message_content async def extract_data_from_jpgs( - self, instructions: str, jpg_presigned_urls: List[str] + self, data_profile: DataProfile, jpg_presigned_urls: List[str] ): self._add_system_message(assistant_type="jpg_data_extraction") self._set_model(model_type="img") + instructions = data_profile.description prompt = self.prompt_manager.jpg_data_extraction_prompt(instructions) assistant_message_content = await self._send_and_receive_message( diff --git a/backend/llms/prompt_manager.py b/backend/llms/prompt_manager.py index 4d49ea0..61a1336 100644 --- a/backend/llms/prompt_manager.py +++ b/backend/llms/prompt_manager.py @@ -102,6 +102,7 @@ def jpg_data_extraction_prompt(self, instructions: str): {instructions} Provide output in a JSON string using the requested information as keys. + The JSON string should be flat, not nested. Example output: {{ diff --git a/backend/object_storage/digitalocean_space_manager.py b/backend/object_storage/digitalocean_space_manager.py index 2863dfc..d9f3524 100644 --- a/backend/object_storage/digitalocean_space_manager.py +++ b/backend/object_storage/digitalocean_space_manager.py @@ -11,6 +11,7 @@ from typing import List import boto3 +from fastapi import UploadFile from settings import ( SPACES_ACCESS_KEY, SPACES_BUCKET_NAME, @@ -21,7 +22,12 @@ class DigitalOceanSpaceManager: - def __init__(self, organization_name: str = "", file_paths: List[str] = []): + def __init__( + self, + organization_name: str = "", + files: List[UploadFile] = [], + file_paths: List[str] = [], + ): session = boto3.session.Session() self.client = session.client( "s3", @@ -34,6 +40,7 @@ def __init__(self, organization_name: str = "", file_paths: List[str] = []): self.organization_name = organization_name.replace(" ", "_") + self.files = files self.file_paths = file_paths self.file_names = [os.path.basename(file_path) for file_path in file_paths] self.object_names: List[str] = [] @@ -52,6 +59,27 @@ def upload_files(self): """ all_uploaded = True + # Upload the files + for file in self.files: + # Prepend the organization_name to the object_name + object_name = f"{self.organization_name}/{file.filename}" + try: + file.file.seek(0) # Ensure we're at the start of the file + self.client.upload_fileobj(file.file, self.bucket_name, object_name) + self.object_names.append(object_name) + except Exception as e: + print(e) + all_uploaded = False + + return all_uploaded + + def upload_files_by_paths(self): + """Upload multiple files using their file paths to an S3 bucket + + :return: True if files were uploaded, else False + """ + all_uploaded = True + # Upload the files for file_path, file_name in zip(self.file_paths, self.file_names): # Prepend the organization_name to the object_name diff --git a/backend/routes/data_profile_routes.py b/backend/routes/data_profile_routes.py index a83a290..f00b515 100644 --- a/backend/routes/data_profile_routes.py +++ b/backend/routes/data_profile_routes.py @@ -1,9 +1,11 @@ +import os import tempfile from typing import List from database.data_profile_manager import DataProfileManager from database.database_manager import DatabaseManager from database.organization_manager import OrganizationManager +from database.table_manager import TableManager from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile from llms.gpt import GPTLLM from models.data_profile import ( @@ -43,12 +45,15 @@ async def save_data_profiles( ) -> DataProfileCreateResponse: with DatabaseManager() as session: data_profile_manager = DataProfileManager(session) - if data_profile_manager.get_dataprofile_by_name(request.name): + if data_profile_manager.get_dataprofile_by_name_and_org( + request.name, current_user.organization_id + ): raise HTTPException(status_code=400, detail="Data Profile already exists") new_data_profile = DataProfile( name=request.name, description=request.description, + organization_id=current_user.organization_id, ) created_data_profile = data_profile_manager.create_dataprofile(new_data_profile) @@ -104,11 +109,99 @@ async def preview_data_profile( with DigitalOceanSpaceManager( organization_name=organization_name, file_paths=jpg_file_paths ) as space_manager: - space_manager.upload_files() + space_manager.upload_files_by_paths() jpg_presigned_urls = space_manager.create_presigned_urls() gpt = GPTLLM(chat_id=1, user=current_user) extracted_data = await gpt.extract_data_from_jpgs( instructions, jpg_presigned_urls ) - return extracted_data + # Delete the temporary files + for path in temp_file_paths: + os.remove(path) + + return extracted_data + + +@data_profile_router.post("/data-profiles/{data_profile_name}/preview/") +async def preview_data_profile_upload( + data_profile_name: str, + files: List[UploadFile] = File(...), + current_user: User = Depends(get_current_user), +): + temp_file_paths = [] + for file in files: + if file.filename: + suffix = file.filename.split(".")[-1] + + # Save the uploaded file temporarily + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) + temp_file.write(await file.read()) + temp_file.close() + temp_file_paths.append(temp_file.name) + + # Get the organization name + with DatabaseManager() as session: + org_manager = OrganizationManager(session) + organization_name = org_manager.get_organization( + current_user.organization_id + ).name + + data_profile_manager = DataProfileManager(session) + data_profile = data_profile_manager.get_dataprofile_by_name_and_org( + data_profile_name, current_user.organization_id + ) + + # Use the ImageConversionManager context manager to convert the PDF to JPG + with ImageConversionManager(temp_file_paths) as manager: + jpg_file_paths = manager.convert_to_jpgs() + + # Upload the JPG file to DigitalOcean Spaces, automatically deleting it when done + with DigitalOceanSpaceManager( + organization_name=organization_name, file_paths=jpg_file_paths + ) as space_manager: + space_manager.upload_files_by_paths() + jpg_presigned_urls = space_manager.create_presigned_urls() + gpt = GPTLLM(chat_id=1, user=current_user) + extracted_data = await gpt.extract_data_from_jpgs( + data_profile, jpg_presigned_urls + ) + + # Delete the temporary files + for path in temp_file_paths: + os.remove(path) + + return extracted_data + + +@data_profile_router.post("/data-profiles/{data_profile_name}/extracted-data/") +async def save_extracted_data( + data_profile_name: str, + extracted_data: dict, + files: List[UploadFile] = File(...), + current_user: User = Depends(get_current_user), +): + # Get the organization name + with DatabaseManager() as session: + org_manager = OrganizationManager(session) + organization_name = org_manager.get_organization( + current_user.organization_id + ).name + + data_profile_manager = DataProfileManager(session) + data_profile: DataProfile = ( + data_profile_manager.get_dataprofile_by_name_and_org( + data_profile_name, current_user.organization_id + ) + ) + + table_manager = TableManager(session) + print(data_profile, table_manager) # TODO: To be further implemented + + # Upload the JPG file to DigitalOcean Spaces, automatically deleting it when done + with DigitalOceanSpaceManager( + organization_name=organization_name, files=files + ) as space_manager: + space_manager.upload_files() + + return {"message": "Extracted data saved successfully"} diff --git a/backend/utils/image_conversion_manager.py b/backend/utils/image_conversion_manager.py index 7a36483..f323ff0 100644 --- a/backend/utils/image_conversion_manager.py +++ b/backend/utils/image_conversion_manager.py @@ -20,12 +20,19 @@ def __exit__(self, exc_type, exc_value, traceback): os.unlink(converted_file_path) # Delete the file def convert_to_jpgs(self): - if all(file_path.endswith(".pdf") for file_path in self.file_paths): + if all(file_path.lower().endswith(".pdf") for file_path in self.file_paths): return self._convert_pdfs_to_jpgs(self.file_paths) - elif all(file_path.endswith(".png") for file_path in self.file_paths): + elif all(file_path.lower().endswith(".png") for file_path in self.file_paths): return self._convert_pngs_to_jpgs(self.file_paths) + elif all( + file_path.lower().endswith((".jpg", ".jpeg")) + for file_path in self.file_paths + ): + return self.file_paths else: - print("All files must be of the same type (either all .pdf or all .png)") + print( + "All files must be of the same type (either all .pdf, all .png, or all .jpg/.jpeg)" + ) return [] def _convert_pdfs_to_jpgs(self, file_paths: List[str]): diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index 4820081..f202404 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -40,10 +40,6 @@ function AppWrapper() { function App() { const { isLoading } = useAuth(); - if (isLoading) { - return