Skip to content

Commit

Permalink
upload page and data profiling implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
liberty-rising committed Jan 20, 2024
1 parent 2897d1e commit f83f75a
Show file tree
Hide file tree
Showing 12 changed files with 232 additions and 61 deletions.
9 changes: 7 additions & 2 deletions backend/database/data_profile_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@ class DataProfileManager:
def __init__(self, session):
    """Keep a reference to the database session that this manager's queries run on."""
    self.session = session

def get_dataprofile_by_name(self, name):
def get_dataprofile_by_name_and_org(self, name, org_id) -> DataProfile:
"""Retrieve the first DataProfile matching both the given name and organization ID."""
return self.session.query(DataProfile).filter(DataProfile.name == name).first()
return (
self.session.query(DataProfile)
.filter(DataProfile.name == name)
.filter(DataProfile.organization_id == org_id)
.first()
)

def get_all_data_profiles(self):
"""Retrieve all DataProfiles."""
Expand Down
4 changes: 3 additions & 1 deletion backend/envs/dev/initialization/setup_dev_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,9 @@ def create_sample_dataprofile():
# Using DatabaseManager to manage the database session
with DatabaseManager() as session:
profile_manager = DataProfileManager(session)
existing_profile = profile_manager.get_dataprofile_by_name(sample_profile.name)
existing_profile = profile_manager.get_dataprofile_by_name_and_org(
sample_profile.name, 1
)
if not existing_profile:
profile_manager.create_dataprofile(sample_profile)
logger.debug("Sample data profile created.")
Expand Down
4 changes: 3 additions & 1 deletion backend/llms/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from database.database_manager import DatabaseManager
from llms.prompt_manager import PromptManager
from llms.system_message_manager import SystemMessageManager
from models.data_profile import DataProfile
from models.user import User
from openai import ChatCompletion
from settings import OPENAI_API_KEY
Expand Down Expand Up @@ -402,11 +403,12 @@ def generate_text(self, input_text):
return assistant_message_content

async def extract_data_from_jpgs(
self, instructions: str, jpg_presigned_urls: List[str]
self, data_profile: DataProfile, jpg_presigned_urls: List[str]
):
self._add_system_message(assistant_type="jpg_data_extraction")
self._set_model(model_type="img")

instructions = data_profile.description
prompt = self.prompt_manager.jpg_data_extraction_prompt(instructions)

assistant_message_content = await self._send_and_receive_message(
Expand Down
1 change: 1 addition & 0 deletions backend/llms/prompt_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def jpg_data_extraction_prompt(self, instructions: str):
{instructions}
Provide output in a JSON string using the requested information as keys.
The JSON string should be flat, not nested.
Example output:
{{
Expand Down
30 changes: 29 additions & 1 deletion backend/object_storage/digitalocean_space_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import List

import boto3
from fastapi import UploadFile
from settings import (
SPACES_ACCESS_KEY,
SPACES_BUCKET_NAME,
Expand All @@ -21,7 +22,12 @@


class DigitalOceanSpaceManager:
def __init__(self, organization_name: str = "", file_paths: List[str] = []):
def __init__(
self,
organization_name: str = "",
files: List[UploadFile] = [],
file_paths: List[str] = [],
):
session = boto3.session.Session()
self.client = session.client(
"s3",
Expand All @@ -34,6 +40,7 @@ def __init__(self, organization_name: str = "", file_paths: List[str] = []):

self.organization_name = organization_name.replace(" ", "_")

self.files = files
self.file_paths = file_paths
self.file_names = [os.path.basename(file_path) for file_path in file_paths]
self.object_names: List[str] = []
Expand All @@ -52,6 +59,27 @@ def upload_files(self):
"""
all_uploaded = True

# Upload the files
for file in self.files:
# Prepend the organization_name to the object_name
object_name = f"{self.organization_name}/{file.filename}"
try:
file.file.seek(0) # Ensure we're at the start of the file
self.client.upload_fileobj(file.file, self.bucket_name, object_name)
self.object_names.append(object_name)
except Exception as e:
print(e)
all_uploaded = False

return all_uploaded

def upload_files_by_paths(self):
"""Upload multiple files using their file paths to an S3 bucket
:return: True if files were uploaded, else False
"""
all_uploaded = True

# Upload the files
for file_path, file_name in zip(self.file_paths, self.file_names):
# Prepend the organization_name to the object_name
Expand Down
99 changes: 96 additions & 3 deletions backend/routes/data_profile_routes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import os
import tempfile
from typing import List

from database.data_profile_manager import DataProfileManager
from database.database_manager import DatabaseManager
from database.organization_manager import OrganizationManager
from database.table_manager import TableManager
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
from llms.gpt import GPTLLM
from models.data_profile import (
Expand Down Expand Up @@ -43,12 +45,15 @@ async def save_data_profiles(
) -> DataProfileCreateResponse:
with DatabaseManager() as session:
data_profile_manager = DataProfileManager(session)
if data_profile_manager.get_dataprofile_by_name(request.name):
if data_profile_manager.get_dataprofile_by_name_and_org(
request.name, current_user.organization_id
):
raise HTTPException(status_code=400, detail="Data Profile already exists")

new_data_profile = DataProfile(
name=request.name,
description=request.description,
organization_id=current_user.organization_id,
)
created_data_profile = data_profile_manager.create_dataprofile(new_data_profile)

Expand Down Expand Up @@ -104,11 +109,99 @@ async def preview_data_profile(
with DigitalOceanSpaceManager(
organization_name=organization_name, file_paths=jpg_file_paths
) as space_manager:
space_manager.upload_files()
space_manager.upload_files_by_paths()
jpg_presigned_urls = space_manager.create_presigned_urls()
gpt = GPTLLM(chat_id=1, user=current_user)
extracted_data = await gpt.extract_data_from_jpgs(
instructions, jpg_presigned_urls
)

return extracted_data
# Delete the temporary files
for path in temp_file_paths:
os.remove(path)

return extracted_data


@data_profile_router.post("/data-profiles/{data_profile_name}/preview/")
async def preview_data_profile_upload(
    data_profile_name: str,
    files: List[UploadFile] = File(...),
    current_user: User = Depends(get_current_user),
):
    """Preview the data an existing data profile extracts from uploaded files.

    The uploads are written to temporary files, converted to JPGs, uploaded to
    DigitalOcean Spaces, and the resulting presigned URLs are handed to the LLM
    together with the data profile.

    Args:
        data_profile_name: Name of the data profile, looked up within the
            current user's organization.
        files: Uploaded files; entries without a filename are ignored.
        current_user: The authenticated user, injected by FastAPI.

    Returns:
        Whatever ``GPTLLM.extract_data_from_jpgs`` returns for the uploads.

    Raises:
        HTTPException: 404 if no data profile with that name exists for the
            user's organization; 400 if the uploads yield no JPGs to process.
    """
    temp_file_paths: List[str] = []
    try:
        for file in files:
            if not file.filename:
                continue

            # splitext keeps the leading dot and yields "" for names without
            # an extension, instead of treating the whole name as a suffix.
            suffix = os.path.splitext(file.filename)[1]

            # Save the uploaded file temporarily. The path is recorded before
            # writing so the cleanup below also covers a failed write.
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                temp_file_paths.append(temp_file.name)
                temp_file.write(await file.read())

        # Get the organization name and the data profile
        with DatabaseManager() as session:
            org_manager = OrganizationManager(session)
            organization_name = org_manager.get_organization(
                current_user.organization_id
            ).name

            data_profile_manager = DataProfileManager(session)
            data_profile = data_profile_manager.get_dataprofile_by_name_and_org(
                data_profile_name, current_user.organization_id
            )

        # The lookup uses .first(), so an unknown name comes back as None.
        if data_profile is None:
            raise HTTPException(status_code=404, detail="Data Profile not found")

        # Use the ImageConversionManager context manager to convert the files to JPG
        with ImageConversionManager(temp_file_paths) as manager:
            jpg_file_paths = manager.convert_to_jpgs()

            # convert_to_jpgs returns an empty list for unsupported or mixed
            # file types; there is nothing to send to the model in that case.
            if not jpg_file_paths:
                raise HTTPException(
                    status_code=400,
                    detail="No supported files to process (use all .pdf, all .png, or all .jpg/.jpeg)",
                )

            # Upload the JPG files to DigitalOcean Spaces, automatically deleting them when done
            with DigitalOceanSpaceManager(
                organization_name=organization_name, file_paths=jpg_file_paths
            ) as space_manager:
                space_manager.upload_files_by_paths()
                jpg_presigned_urls = space_manager.create_presigned_urls()
                gpt = GPTLLM(chat_id=1, user=current_user)
                extracted_data = await gpt.extract_data_from_jpgs(
                    data_profile, jpg_presigned_urls
                )
    finally:
        # Delete the temporary files even when a step above failed.
        for path in temp_file_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                # NOTE(review): for JPG uploads convert_to_jpgs returns the
                # temp paths themselves, so the conversion manager may already
                # have removed them on exit - confirm.
                pass

    return extracted_data


@data_profile_router.post("/data-profiles/{data_profile_name}/extracted-data/")
async def save_extracted_data(
    data_profile_name: str,
    extracted_data: dict,
    files: List[UploadFile] = File(...),
    current_user: User = Depends(get_current_user),
):
    """Store the uploaded source files for a data profile's extracted data.

    Persisting ``extracted_data`` itself is not implemented yet (see the TODO
    below); at present the route only looks up the data profile and uploads
    the files to DigitalOcean Spaces.

    Args:
        data_profile_name: Name of the data profile, looked up within the
            current user's organization.
        extracted_data: The extracted values to save (currently unused).
        files: The uploaded files to store.
        current_user: The authenticated user, injected by FastAPI.

    Returns:
        A dict with a success ``message``.

    Raises:
        HTTPException: 404 if no data profile with that name exists for the
            user's organization; 500 if one or more files failed to upload.
    """
    # NOTE(review): FastAPI documents that JSON body parameters cannot be
    # combined with File/Form parameters in one request; `extracted_data: dict`
    # next to `File(...)` may need to become a Form field - confirm.

    # Get the organization name and the data profile
    with DatabaseManager() as session:
        org_manager = OrganizationManager(session)
        organization_name = org_manager.get_organization(
            current_user.organization_id
        ).name

        data_profile_manager = DataProfileManager(session)
        data_profile: DataProfile = (
            data_profile_manager.get_dataprofile_by_name_and_org(
                data_profile_name, current_user.organization_id
            )
        )

        # The lookup uses .first(), so an unknown name comes back as None.
        if data_profile is None:
            raise HTTPException(status_code=404, detail="Data Profile not found")

        table_manager = TableManager(session)
        print(data_profile, table_manager)  # TODO: To be further implemented

    # Upload the files to DigitalOcean Spaces.
    # NOTE(review): the original comment here said the objects are deleted
    # automatically when the context exits; if so, the uploads would not
    # persist after this request - confirm the intended lifetime.
    with DigitalOceanSpaceManager(
        organization_name=organization_name, files=files
    ) as space_manager:
        all_uploaded = space_manager.upload_files()

    # upload_files reports per-file failures through its boolean result rather
    # than raising, so it must be checked before reporting success.
    if not all_uploaded:
        raise HTTPException(
            status_code=500, detail="Failed to upload one or more files"
        )

    return {"message": "Extracted data saved successfully"}
13 changes: 10 additions & 3 deletions backend/utils/image_conversion_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,19 @@ def __exit__(self, exc_type, exc_value, traceback):
os.unlink(converted_file_path) # Delete the file

def convert_to_jpgs(self):
if all(file_path.endswith(".pdf") for file_path in self.file_paths):
if all(file_path.lower().endswith(".pdf") for file_path in self.file_paths):
return self._convert_pdfs_to_jpgs(self.file_paths)
elif all(file_path.endswith(".png") for file_path in self.file_paths):
elif all(file_path.lower().endswith(".png") for file_path in self.file_paths):
return self._convert_pngs_to_jpgs(self.file_paths)
elif all(
file_path.lower().endswith((".jpg", ".jpeg"))
for file_path in self.file_paths
):
return self.file_paths
else:
print("All files must be of the same type (either all .pdf or all .png)")
print(
"All files must be of the same type (either all .pdf, all .png, or all .jpg/.jpeg)"
)
return []

def _convert_pdfs_to_jpgs(self, file_paths: List[str]):
Expand Down
8 changes: 4 additions & 4 deletions frontend/src/App.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,6 @@ function AppWrapper() {
function App() {
const { isLoading } = useAuth();

if (isLoading) {
return <div>Loading...</div>; // Or any other loading indicator
}

useEffect(() => {
if (APP_ENV === "dev") {
document.title = "DocShow AI - Dev";
Expand All @@ -52,6 +48,10 @@ function App() {
}
}, []);

if (isLoading) {
return <div>Loading...</div>; // Or any other loading indicator
}

return (
<Routes>
<Route
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/pages/analytics/AIAssistant.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import {
import axios from "axios";
import { API_URL } from "../../utils/constants";

const AIAssistant = ({ table }) => {
const AIAssistant = () => {
const [userInput, setUserInput] = useState("");
const [chatHistory, setChatHistory] = useState([]);
const chatEndRef = useRef(null);
Expand Down
28 changes: 2 additions & 26 deletions frontend/src/pages/analytics/AnalyticsPage.jsx
Original file line number Diff line number Diff line change
@@ -1,41 +1,17 @@
// AnalyticsPage.jsx
import React, { useEffect, useState } from "react";
import React from "react";
import { Box, Typography, Grid } from "@mui/material";
import AIAssistant from "./AIAssistant";
import TableSelectDropdown from "../../components/tables/selects/TableSelectDropdown";
import { fetchOrganizationTables } from "../../api/organizationTables";

function AnalyticsPage() {
const [tables, setTables] = useState([]);
const [selectedTable, setSelectedTable] = useState("");

useEffect(() => {
const getOrganizationTables = async () => {
const data = await fetchOrganizationTables();
setTables(data);
};

getOrganizationTables();
}, []);

const handleTableSelect = (table) => {
setSelectedTable(table);
};

useEffect(() => {
if (selectedTable) {
handleTableSelect(selectedTable);
}
}, [selectedTable]);

return (
<Box>
<Typography variant="h4" gutterBottom>
📊 AI Analyst
</Typography>
<Grid container spacing={2}>
<Grid item xs={12}>
<AIAssistant table={selectedTable} />
<AIAssistant />
</Grid>
</Grid>
</Box>
Expand Down
4 changes: 2 additions & 2 deletions frontend/src/pages/upload/CreateDataProfilePage.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ function CreateDataProfilePage({ open, onClose, onCreate }) {

const handleSubmit = (event) => {
event.preventDefault();
onCreate({ name, extractInstructions });
onCreate(name, extractInstructions);
};

const handlePreview = () => {
Expand Down Expand Up @@ -101,7 +101,7 @@ function CreateDataProfilePage({ open, onClose, onCreate }) {
type="submit"
color="primary"
variant="contained"
disabled={!isPreviewTableOpen}
disabled={!isPreviewTableOpen || !name || !extractInstructions}
>
Create
</Button>
Expand Down
Loading

0 comments on commit f83f75a

Please sign in to comment.