From 95a0cd021c5a0cc43a03b8d90c650add4ce21024 Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Mon, 16 Dec 2024 10:00:29 -0500 Subject: [PATCH] =?UTF-8?q?=F0=9F=9B=A0=EF=B8=8F=20fix:=20Enhance=20Error?= =?UTF-8?q?=20Logging,=20Update=20Dependencies,=20and=20Optimize=20NLTK=20?= =?UTF-8?q?Setup=20(#106)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: remove version specification from docker-compose.yaml * chore: add better error logging with traceback across the board * chore: bump unstructured and langchain core packages * chore: add NLTK data download and disable Unstructured analytics in Dockerfiles --- Dockerfile | 7 ++++ Dockerfile.lite | 7 ++++ docker-compose.yaml | 2 - main.py | 88 +++++++++++++++++++++++++++++++++++++++---- requirements.lite.txt | 8 ++-- requirements.txt | 8 ++-- 6 files changed, 102 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index c6281b60..03d1ae09 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,13 @@ RUN apt-get update \ COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt +# Download standard NLTK data, to prevent unstructured from downloading packages at runtime +RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger +ENV NLTK_DATA=/app/nltk_data + +# Disable Unstructured analytics +ENV SCARF_NO_ANALYTICS=true + COPY . . CMD ["python", "main.py"] diff --git a/Dockerfile.lite b/Dockerfile.lite index 2b558791..001eef93 100644 --- a/Dockerfile.lite +++ b/Dockerfile.lite @@ -14,6 +14,13 @@ RUN apt-get update \ COPY requirements.lite.txt . RUN pip install --no-cache-dir -r requirements.lite.txt +# Download standard NLTK data, to prevent unstructured from downloading packages at runtime +RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger +ENV NLTK_DATA=/app/nltk_data + +# Disable Unstructured analytics +ENV SCARF_NO_ANALYTICS=true + COPY . . CMD ["python", "main.py"] diff --git a/docker-compose.yaml b/docker-compose.yaml index 7d064b69..225299ef 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,5 +1,3 @@ -version: '3.8' - services: db: image: ankane/pgvector:latest diff --git a/main.py b/main.py index c82b550a..b2cbfa7f 100644 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ import aiofiles.os from typing import Iterable, List from shutil import copyfileobj +import traceback import uvicorn from langchain.schema import Document @@ -112,6 +113,11 @@ async def get_all_ids(): return list(set(ids)) except Exception as e: + logger.error( + "Failed to get all IDs | Error: %s | Traceback: %s", + str(e), + traceback.format_exc(), + ) raise HTTPException(status_code=500, detail=str(e)) @@ -126,10 +132,19 @@ def isHealthOK(): @app.get("/health") async def health_check(): - if await isHealthOK(): - return {"status": "UP"} - else: - return {"status": "DOWN"}, 503 + try: + if await isHealthOK(): + return {"status": "UP"} + else: + logger.error("Health check failed") + return {"status": "DOWN"}, 503 + except Exception as e: + logger.error( + "Error during health check | Error: %s | Traceback: %s", + str(e), + traceback.format_exc(), + ) + return {"status": "DOWN", "error": str(e)}, 503 @app.get("/documents", response_model=list[DocumentResponse]) @@ -154,8 +169,19 @@ async def get_documents_by_ids(ids: list[str] = Query(...)): return documents except HTTPException as http_exc: + logger.error( + "HTTP Exception in get_documents_by_ids | Status: %d | Detail: %s", + http_exc.status_code, + http_exc.detail, + ) raise http_exc except Exception as e: + logger.error( + "Error getting documents by IDs | IDs: %s | Error: %s | Traceback: %s", + ids, + str(e), + traceback.format_exc(), + ) raise HTTPException(status_code=500, detail=str(e)) @@ -177,6 +203,12 @@ async def delete_documents(document_ids: List[str] = Body(...)): "message": f"Documents for {file_count} file{'s' if file_count > 1 else ''} deleted successfully" } except Exception as e: + logger.error( + "Failed to delete documents | IDs: %s | Error: %s | Traceback: %s", + document_ids, + str(e), + traceback.format_exc(), + ) raise HTTPException(status_code=500, detail=str(e)) @@ -220,7 +252,13 @@ async def query_embeddings_by_file_id(body: QueryRequestBody, request: Request): return authorized_documents except Exception as e: - logger.error(e) + logger.error( + "Error in query embeddings | File ID: %s | Query: %s | Error: %s | Traceback: %s", + body.file_id, + body.query, + str(e), + traceback.format_exc(), + ) raise HTTPException(status_code=500, detail=str(e)) @@ -270,7 +308,13 @@ async def store_data_in_vector_db( return {"message": "Documents added successfully", "ids": ids} except Exception as e: - logger.error(e) + logger.error( + "Failed to store data in vector DB | File ID: %s | User ID: %s | Error: %s | Traceback: %s", + file_id, + user_id, + str(e), + traceback.format_exc(), + ) return {"message": "An error occurred while adding documents.", "error": str(e)} @@ -386,6 +430,12 @@ async def embed_file( while content := await file.read(chunk_size): await temp_file.write(content) except Exception as e: + logger.error( + "Failed to save uploaded file | Path: %s | Error: %s | Traceback: %s", + temp_file_path, + str(e), + traceback.format_exc(), + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save the uploaded file. Error: {str(e)}", @@ -420,6 +470,11 @@ async def embed_file( except Exception as e: response_status = False response_message = f"Error during file processing: {str(e)}" + logger.error( + "Error during file processing: %s\nTraceback: %s", + str(e), + traceback.format_exc(), + ) raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=f"Error during file processing: {str(e)}", @@ -428,7 +483,12 @@ async def embed_file( try: await aiofiles.os.remove(temp_file_path) except Exception as e: - logger.info(f"Failed to remove temporary file: {str(e)}") + logger.error( + "Failed to remove temporary file | Path: %s | Error: %s | Traceback: %s", + temp_file_path, + str(e), + traceback.format_exc(), + ) return { "status": response_status, @@ -464,7 +524,12 @@ async def load_document_context(id: str): return process_documents(documents) except Exception as e: - logger.error(e) + logger.error( + "Error loading document context | Document ID: %s | Error: %s | Traceback: %s", + id, + str(e), + traceback.format_exc(), + ) raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=ERROR_MESSAGES.DEFAULT(e), @@ -549,6 +614,13 @@ async def query_embeddings_by_file_ids(body: QueryMultipleBody): return documents except Exception as e: + logger.error( + "Error in query multiple embeddings | File IDs: %s | Query: %s | Error: %s | Traceback: %s", + body.file_ids, + body.query, + str(e), + traceback.format_exc(), + ) raise HTTPException(status_code=500, detail=str(e)) diff --git a/requirements.lite.txt b/requirements.lite.txt index 69ef1a18..4e9f4d34 100644 --- a/requirements.lite.txt +++ b/requirements.lite.txt @@ -1,7 +1,7 @@ -langchain==0.3.9 -langchain-community==0.3.9 +langchain==0.3.12 +langchain-community==0.3.12 langchain-openai==0.2.11 -langchain-core==0.3.21 +langchain-core==0.3.25 sqlalchemy==2.0.28 python-dotenv==1.0.1 fastapi==0.110.0 @@ -9,7 +9,7 @@ psycopg2-binary==2.9.9 pgvector==0.2.5 uvicorn==0.28.0 pypdf==4.1.0 -unstructured==0.15.13 +unstructured==0.16.11 markdown==3.6 networkx==3.2.1 pandas==2.2.1 diff --git a/requirements.txt b/requirements.txt index 4cdda488..65522728 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -langchain==0.3.9 -langchain-community==0.3.9 +langchain==0.3.12 +langchain-community==0.3.12 langchain-openai==0.2.11 -langchain-core==0.3.21 +langchain-core==0.3.25 langchain-aws==0.2.1 boto3==1.34.144 sqlalchemy==2.0.28 @@ -11,7 +11,7 @@ psycopg2-binary==2.9.9 pgvector==0.2.5 uvicorn==0.28.0 pypdf==4.1.0 -unstructured==0.15.13 +unstructured==0.16.11 markdown==3.6 networkx==3.2.1 pandas==2.2.1