šŸ› ļø fix: Enhance Error Logging, Update Dependencies, and Optimize NLTKā€¦
Browse files Browse the repository at this point in the history
ā€¦ Setup (#106)

* chore: remove version specification from docker-compose.yaml

* chore: add better error logging with traceback across the board

* chore: bump unstructured and langchain core packages

* chore: add NLTK data download and disable Unstructured analytics in Dockerfiles
  • Loading branch information
danny-avila authored Dec 16, 2024
1 parent 662b057 commit 95a0cd0
Showing 6 changed files with 102 additions and 18 deletions.
7 changes: 7 additions & 0 deletions Dockerfile
@@ -14,6 +14,13 @@ RUN apt-get update \
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
+# Download standard NLTK data, to prevent unstructured from downloading packages at runtime
+RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger
+ENV NLTK_DATA=/app/nltk_data
+
+# Disable Unstructured analytics
+ENV SCARF_NO_ANALYTICS=true
+
 COPY . .
 
 CMD ["python", "main.py"]
7 changes: 7 additions & 0 deletions Dockerfile.lite
@@ -14,6 +14,13 @@ RUN apt-get update \
 COPY requirements.lite.txt .
 RUN pip install --no-cache-dir -r requirements.lite.txt
 
+# Download standard NLTK data, to prevent unstructured from downloading packages at runtime
+RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger
+ENV NLTK_DATA=/app/nltk_data
+
+# Disable Unstructured analytics
+ENV SCARF_NO_ANALYTICS=true
+
 COPY . .
 
 CMD ["python", "main.py"]
2 changes: 0 additions & 2 deletions docker-compose.yaml
@@ -1,5 +1,3 @@
-version: '3.8'
-
 services:
   db:
     image: ankane/pgvector:latest
88 changes: 80 additions & 8 deletions main.py
@@ -4,6 +4,7 @@
 import aiofiles.os
 from typing import Iterable, List
 from shutil import copyfileobj
+import traceback
 
 import uvicorn
 from langchain.schema import Document
@@ -112,6 +113,11 @@ async def get_all_ids():
 
         return list(set(ids))
     except Exception as e:
+        logger.error(
+            "Failed to get all IDs | Error: %s | Traceback: %s",
+            str(e),
+            traceback.format_exc(),
+        )
         raise HTTPException(status_code=500, detail=str(e))
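
This message-plus-traceback.format_exc() pattern repeats throughout the diff. For comparison, the standard library can capture the same traceback implicitly; a minimal sketch of the alternative (not what this commit uses):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def risky_operation():
        raise RuntimeError("simulated failure")  # stand-in for the failing call

    try:
        risky_operation()
    except Exception:
        # logger.exception logs at ERROR level and appends the active
        # traceback automatically (equivalent to error(..., exc_info=True)).
        logger.exception("Failed to get all IDs")

The explicit format_exc() form used in this commit keeps the traceback inside the single formatted message, which is convenient when each log line is shipped to an aggregator as one record.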


@@ -126,10 +132,19 @@ def isHealthOK():
 
 @app.get("/health")
 async def health_check():
-    if await isHealthOK():
-        return {"status": "UP"}
-    else:
-        return {"status": "DOWN"}, 503
+    try:
+        if await isHealthOK():
+            return {"status": "UP"}
+        else:
+            logger.error("Health check failed")
+            return {"status": "DOWN"}, 503
+    except Exception as e:
+        logger.error(
+            "Error during health check | Error: %s | Traceback: %s",
+            str(e),
+            traceback.format_exc(),
+        )
+        return {"status": "DOWN", "error": str(e)}, 503


@app.get("/documents", response_model=list[DocumentResponse])
@@ -154,8 +169,19 @@ async def get_documents_by_ids(ids: list[str] = Query(...)):
 
         return documents
     except HTTPException as http_exc:
+        logger.error(
+            "HTTP Exception in get_documents_by_ids | Status: %d | Detail: %s",
+            http_exc.status_code,
+            http_exc.detail,
+        )
         raise http_exc
     except Exception as e:
+        logger.error(
+            "Error getting documents by IDs | IDs: %s | Error: %s | Traceback: %s",
+            ids,
+            str(e),
+            traceback.format_exc(),
+        )
         raise HTTPException(status_code=500, detail=str(e))


@@ -177,6 +203,12 @@ async def delete_documents(document_ids: List[str] = Body(...)):
             "message": f"Documents for {file_count} file{'s' if file_count > 1 else ''} deleted successfully"
         }
     except Exception as e:
+        logger.error(
+            "Failed to delete documents | IDs: %s | Error: %s | Traceback: %s",
+            document_ids,
+            str(e),
+            traceback.format_exc(),
+        )
         raise HTTPException(status_code=500, detail=str(e))


@@ -220,7 +252,13 @@ async def query_embeddings_by_file_id(body: QueryRequestBody, request: Request):
         return authorized_documents
 
     except Exception as e:
-        logger.error(e)
+        logger.error(
+            "Error in query embeddings | File ID: %s | Query: %s | Error: %s | Traceback: %s",
+            body.file_id,
+            body.query,
+            str(e),
+            traceback.format_exc(),
+        )
         raise HTTPException(status_code=500, detail=str(e))
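
Note the %s placeholders with separate arguments rather than f-strings: the logging module defers interpolation until a handler actually emits the record, so a suppressed level costs almost nothing. A minimal illustration with hypothetical values:

    import logging

    logger = logging.getLogger(__name__)
    file_id, query = "abc123", "what changed?"  # hypothetical values

    # The format string is only interpolated if the record passes the
    # level and filter checks -- an f-string would always be built first.
    logger.error("Error in query embeddings | File ID: %s | Query: %s", file_id, query)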


@@ -270,7 +308,13 @@ async def store_data_in_vector_db(
         return {"message": "Documents added successfully", "ids": ids}
 
     except Exception as e:
-        logger.error(e)
+        logger.error(
+            "Failed to store data in vector DB | File ID: %s | User ID: %s | Error: %s | Traceback: %s",
+            file_id,
+            user_id,
+            str(e),
+            traceback.format_exc(),
+        )
         return {"message": "An error occurred while adding documents.", "error": str(e)}


@@ -386,6 +430,12 @@ async def embed_file(
             while content := await file.read(chunk_size):
                 await temp_file.write(content)
     except Exception as e:
+        logger.error(
+            "Failed to save uploaded file | Path: %s | Error: %s | Traceback: %s",
+            temp_file_path,
+            str(e),
+            traceback.format_exc(),
+        )
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Failed to save the uploaded file. Error: {str(e)}",
@@ -420,6 +470,11 @@ async def embed_file(
     except Exception as e:
         response_status = False
         response_message = f"Error during file processing: {str(e)}"
+        logger.error(
+            "Error during file processing: %s\nTraceback: %s",
+            str(e),
+            traceback.format_exc(),
+        )
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail=f"Error during file processing: {str(e)}",
@@ -428,7 +483,12 @@
     try:
         await aiofiles.os.remove(temp_file_path)
     except Exception as e:
-        logger.info(f"Failed to remove temporary file: {str(e)}")
+        logger.error(
+            "Failed to remove temporary file | Path: %s | Error: %s | Traceback: %s",
+            temp_file_path,
+            str(e),
+            traceback.format_exc(),
+        )
 
     return {
         "status": response_status,
@@ -464,7 +524,12 @@ async def load_document_context(id: str):
 
         return process_documents(documents)
     except Exception as e:
-        logger.error(e)
+        logger.error(
+            "Error loading document context | Document ID: %s | Error: %s | Traceback: %s",
+            id,
+            str(e),
+            traceback.format_exc(),
+        )
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail=ERROR_MESSAGES.DEFAULT(e),
@@ -549,6 +614,13 @@ async def query_embeddings_by_file_ids(body: QueryMultipleBody):
 
         return documents
     except Exception as e:
+        logger.error(
+            "Error in query multiple embeddings | File IDs: %s | Query: %s | Error: %s | Traceback: %s",
+            body.file_ids,
+            body.query,
+            str(e),
+            traceback.format_exc(),
+        )
         raise HTTPException(status_code=500, detail=str(e))


8 changes: 4 additions & 4 deletions requirements.lite.txt
@@ -1,15 +1,15 @@
-langchain==0.3.9
-langchain-community==0.3.9
+langchain==0.3.12
+langchain-community==0.3.12
 langchain-openai==0.2.11
-langchain-core==0.3.21
+langchain-core==0.3.25
 sqlalchemy==2.0.28
 python-dotenv==1.0.1
 fastapi==0.110.0
 psycopg2-binary==2.9.9
 pgvector==0.2.5
 uvicorn==0.28.0
 pypdf==4.1.0
-unstructured==0.15.13
+unstructured==0.16.11
 markdown==3.6
 networkx==3.2.1
 pandas==2.2.1
8 changes: 4 additions & 4 deletions requirements.txt
@@ -1,7 +1,7 @@
-langchain==0.3.9
-langchain-community==0.3.9
+langchain==0.3.12
+langchain-community==0.3.12
 langchain-openai==0.2.11
-langchain-core==0.3.21
+langchain-core==0.3.25
 langchain-aws==0.2.1
 boto3==1.34.144
 sqlalchemy==2.0.28
@@ -11,7 +11,7 @@ psycopg2-binary==2.9.9
 pgvector==0.2.5
 uvicorn==0.28.0
 pypdf==4.1.0
-unstructured==0.15.13
+unstructured==0.16.11
 markdown==3.6
 networkx==3.2.1
 pandas==2.2.1
