Merge pull request #180 from DocShow-AI/image_data_extraction_llm

Image data extraction llm
liberty-rising · Jan 6, 2024 · b94dc35 · b94dc35
2 parents d5c5f86 + 775dd77
commit b94dc35
Show file tree

Hide file tree

Showing 8 changed files with 87 additions and 69 deletions.
diff --git a/backend/llms/gpt.py b/backend/llms/gpt.py
@@ -1,5 +1,5 @@
 from openai import ChatCompletion
-from typing import List, Optional
+from typing import Optional
 
 import json
 import openai
@@ -124,9 +124,22 @@ def _get_system_message_content(self, assistant_type: str = "generic") -> str:
         )
         return system_message_content
 
-    def _create_message(self, role: str, prompt: str):
+    def _create_message(self, role: str, prompt: str, image_url: str = ""):
         """Create either a user, system, or assistant message."""
-        return {"role": f"{role}", "content": f"{prompt}"}
+        if image_url:
+            # Vision input is a list of typed content parts; the image part is
+            # typed "image_url", and the message itself takes no image key.
+            return {
+                "role": f"{role}",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
+                    },
+                ],
+            }
+        return {"role": f"{role}", "content": f"{prompt}"}
 
     def _add_system_message(self, assistant_type: str) -> None:
         """
@@ -151,8 +164,8 @@ def _add_system_message(self, assistant_type: str) -> None:
 
         self.llm_type = assistant_type
 
-    async def _send_and_receive_message(self, prompt: str) -> str:
-        user_message = self._create_message("user", prompt)
+    async def _send_and_receive_message(self, prompt: str, image_url: str = "") -> str:
+        user_message = self._create_message("user", prompt, image_url)
         self.history.append(user_message)
 
         # Check token limit and truncate history if needed
@@ -330,5 +343,22 @@ def generate_text(self, input_text):
 
         return assistant_message_content
 
-    def generate_analytics_text(self, input_text: str, table_names: List[str]):
-        self._add_system_message(assistant_type="analytics_chat")
+    async def extract_data_from_jpg(self, instructions: str, jpg_file: str):
+        """Ask the model to extract the requested data from a JPG file.
+
+        :param instructions: What the user wants extracted from the image
+        :param jpg_file: Path to the JPG file on disk
+        :return: The assistant's reply to the extraction prompt
+        """
+        import base64  # local import: tiktoken is a tokenizer and has no image helper
+
+        self._add_system_message(assistant_type="jpg_data_extraction")
+
+        with open(jpg_file, "rb") as image_file:
+            base64_image = base64.b64encode(image_file.read()).decode("utf-8")
+        image_url = f"data:image/jpeg;base64,{base64_image}"
+
+        prompt = self.prompt_manager.jpg_data_extraction_prompt(instructions)
+
+        # _send_and_receive_message is a coroutine function; await it so callers get the reply.
+        return await self._send_and_receive_message(prompt, image_url)
diff --git a/backend/llms/prompt_manager.py b/backend/llms/prompt_manager.py
@@ -93,3 +93,22 @@ def create_table_desc_prompt(
         if extra_desc:
             prompt += f"\n\nAdditional information about the sample data: {extra_desc}"
         return prompt
+
+    def jpg_data_extraction_prompt(self, instructions: str) -> str:
+        """Build the user prompt asking for the requested fields from a JPG as JSON."""
+        return f"""
+            Extract the following data from the given JPG file:
+
+            User request:
+            {instructions}
+
+            Provide output in JSON format using the requested information as keys.
+
+            Example output:
+            {{
+                "client_name":"John Doe",
+                "invoice_amount":"1000",
+                "date":"01-01-2021"
+            }}
+            In this example, the requested information would have been client name, invoice amount, and date.
+            """
diff --git a/backend/llms/system_message_manager.py b/backend/llms/system_message_manager.py
@@ -46,6 +46,10 @@ def __init__(self):
                 You are a table categorization assistant. Your task is to analyze sample data and existing table metadata to identify the most suitable
                 table for appending the sample data. Return only the name of the table.
             """,
+            """jpg_data_extraction""": """
+                You are a JPG data extraction assistant. Your task is to extract specific data in the order specifed from a JPG file and return it in a json format.
+                Return only the extracted data.
+            """,
             "generic": "You are a generic assistant.",
         }
 

diff --git a/backend/object_storage/__init__.py b/backend/object_storage/__init__.py
diff --git a/backend/object_storage/digitalocean_space_manager.py b/backend/object_storage/digitalocean_space_manager.py
@@ -81,3 +81,22 @@ def create_presigned_url(
 
         # The response contains the presigned URL
         return response
+
+    def delete_file(self, organization_name, object_name):
+        """Delete a file from an S3 bucket
+
+        :param organization_name: Name of the organization the file belongs to
+        :param object_name: S3 object name
+        :return: True if the delete call completed without raising, otherwise False
+        """
+        import logging  # local import; the module's import block is outside this hunk
+
+        # Prepend the organization name to form the full object key.
+        key = f"{organization_name}/{object_name}"
+        try:
+            self.client.delete_object(Bucket=self.bucket_name, Key=key)
+        except Exception:
+            # Best-effort delete: record the traceback, report failure to the caller.
+            logging.getLogger(__name__).exception("Failed to delete object %s", key)
+            return False
+        return True
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -20,4 +20,6 @@ langchain==0.0.351
 pytest==6.2.5
 pytest-asyncio==0.15.1
 sendgrid==6.11.0
-boto3==1.34.10
+boto3==1.34.10
+pillow==10.1.0
+pdf2image==1.16.3
diff --git a/backend/routes/data_profile_routes.py b/backend/routes/data_profile_routes.py
@@ -1,11 +1,10 @@
 from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form
 
-# from starlette.responses import JSONResponse
 import tempfile
-import os
 
 from database.database_manager import DatabaseManager
 from database.data_profile_manager import DataProfileManager
+from llms.gpt import GPTLLM
 from models.data_profile import (
     DataProfile,
     DataProfileCreateRequest,
@@ -69,57 +68,13 @@ async def preview_data_profile(
 ):
     suffix = file.filename.split(".")[-1]
     # Save the uploaded file temporarily
-    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
+    with tempfile.NamedTemporaryFile(delete=True, suffix=suffix) as temp_file:
         temp_file.write(await file.read())
         temp_file_path = temp_file.name
 
-    # Use the ImageConversionManager context manager to convert the PDF to JPG
-    jpg_files = []
-    with ImageConversionManager(temp_file_path, "/change-me/") as manager:
-        jpg_files = manager.convert_to_jpg(temp_file_path)
-
-    # Clean up the uploaded temp file
-    os.unlink(temp_file_path)
-
-    # Assuming you have a function to send the JPGs to the LLM and get a response
-    # Send the JPG files to the LLM using the API
-    # You need to define how you'll handle multiple JPGs - this is just a placeholder
-    # if jpg_files:
-    #     # Here you would typically prepare and send your request to the LLM API.
-    #     # This will vary greatly depending on the LLM's API specifics.
-    #     # For now, this is a placeholder for how you might make the request.
-    #     # Replace with your actual API endpoint and key
-    #     llm_api_endpoint = "https://api.example.com/llm"
-    #     api_key = "your_api_key"
-    #     response = requests.post(
-    #         llm_api_endpoint,
-    #         headers={"Authorization": f"Bearer {api_key}"},
-    #         files={"file": open(jpg_files[0], "rb")},
-    #     )
-
-    #     # Handle the response
-    #     if response.status_code == 200:
-    #         llm_response = response.json()
-    #     else:
-    #         raise HTTPException(status_code=500, detail="LLM API request failed")
-    # else:
-    #     raise HTTPException(status_code=500, detail="Failed to convert file")
-
-    # Clean up the created JPG files
-    for jpg_file in jpg_files:
-        os.unlink(jpg_file)
-
-    # Return the LLM's response as JSON
-    # return JSONResponse(content=llm_response)
-
-
-# Now you would include this router in your FastAPI application instance.
-# from fastapi import FastAPI
-# app = FastAPI()
-# app.include_router(data_profile_router)
-
-
-# the response has to be a json
-
-# file -- > convert to jpg --> |
-# data-profile             --> | --> llm --> response
+        # Use the ImageConversionManager context manager to convert the PDF to JPG
+        with ImageConversionManager(temp_file_path, "/tmp/") as manager:
+            jpg_file = manager.convert_to_jpg(temp_file_path)
+            gpt = GPTLLM()
+            data = gpt.extract_data_from_jpg(instructions, jpg_file)
+        return data
diff --git a/backend/utils/image_conversion_manager.py b/backend/utils/image_conversion_manager.py
@@ -57,14 +57,21 @@ def convert_to_jpg(self, file_path):
         # Add more conditions for other file types if needed
 
     def _convert_pdf_to_jpg(self, file_path):
-        images = convert_from_path(file_path)
-        jpg_files = []
-        for i, image in enumerate(images):
-            jpg_filename = f"output_page_{i}.jpg"
-            jpg_file_path = os.path.join(self.output_folder, jpg_filename)
-            image.save(jpg_file_path, "JPEG")
-            jpg_files.append(jpg_file_path)
-        return jpg_files
+        """Render the first page of a PDF to a JPG in the output folder.
+
+        :param file_path: Path to the PDF file
+        :return: Path of the JPG file that was written
+        :raises ValueError: If the PDF yields no pages
+        """
+        # convert_from_path returns a list of page images; one path is returned, so keep page 1.
+        pages = convert_from_path(file_path, first_page=1, last_page=1)
+        if not pages:
+            raise ValueError(f"No pages could be rendered from {file_path}")
+        # splitext swaps only the real extension, whatever its case.
+        jpg_filename = os.path.splitext(os.path.basename(file_path))[0] + ".jpg"
+        jpg_file_path = os.path.join(self.output_folder, jpg_filename)
+        pages[0].save(jpg_file_path, "JPEG")
+        return jpg_file_path
 
     def _convert_png_to_jpg(self, file_path):
         rgb_im = Image.open(file_path).convert("RGB")