Skip to content

Commit

Permalink
Calculate total cache size in bytes instead of length
Browse files Browse the repository at this point in the history
  • Loading branch information
tianjing-li committed Dec 11, 2023
1 parent 12151c3 commit 1e8415f
Showing 1 changed file with 22 additions and 9 deletions.
31 changes: 22 additions & 9 deletions dropbox/provider/unstructured.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,45 @@
import sys
import asyncio
import aiohttp
import logging
import functools
from collections import OrderedDict
from flask import current_app as app

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Count-based cache limit (number of entries).
CACHE_SIZE = 256
# Size-based cache limit: ceiling for the summed size of cached values.
CACHE_LIMIT_BYTES = 20 * 1024 * 1024  # 20 MB expressed in bytes
# Total timeout, in seconds, passed to aiohttp.ClientTimeout for the session.
TIMEOUT_SECONDS = 20

# Module-wide client instance; None until get_unstructured_client() creates it.
unstructured = None


class UnstructuredRequestSession:
def __init__(self, unstructured_base_url, api_key):
    """Set up the endpoint URL, credentials, cache and HTTP session.

    Args:
        unstructured_base_url: Base URL of the Unstructured API; the
            ``/general/v0/general`` path is appended to it.
        api_key: Value used for the ``unstructured-api-key`` header.
    """
    # functools.lru_cache does not support async methods, so results are
    # cached by hand in an insertion-ordered mapping.
    self.cache = OrderedDict()
    self.api_key = api_key
    self.headers = {"unstructured-api-key": api_key}
    self.get_content_url = f"{unstructured_base_url}/general/v0/general"
    # Creates the event loop and the aiohttp session.
    self.start_session()

def start_session(self):
    """Create a fresh event loop and one aiohttp session bound to it.

    The loop is stored on ``self.loop`` and the session on ``self.session``.
    """
    self.loop = asyncio.new_event_loop()
    # A ClientTimeout set on the session applies TIMEOUT_SECONDS to every
    # request made through it.
    client_timeout = aiohttp.ClientTimeout(total=TIMEOUT_SECONDS)
    # Build the session exactly once, with the timeout attached; creating
    # a second session here would leave the first one unclosed.
    self.session = aiohttp.ClientSession(loop=self.loop, timeout=client_timeout)

def close_loop(self):
self.loop.stop()
self.loop.close()

def cache_size(self):
# Calculate the total size of values in bytes
total_size_bytes = functools.reduce(
lambda a, b: a + b, map(lambda v: sys.getsizeof(v), self.cache.values()), 0
)

return total_size_bytes

def cache_get(self, key):
self.cache.move_to_end(key)

Expand All @@ -35,7 +48,7 @@ def cache_get(self, key):
def cache_put(self, key, item):
    """Store ``item`` under ``key``, evicting old entries if over budget.

    After the insert, entries are removed from the front of the ordered
    cache (least recently used first) until ``cache_size()`` no longer
    exceeds ``CACHE_LIMIT_BYTES``. If ``item`` alone is larger than the
    limit, the loop ends with an empty cache.

    Args:
        key: Cache key.
        item: Value to cache.
    """
    self.cache[key] = item
    # Assigning to an existing key keeps its old position, so explicitly
    # mark the entry as the most recently used one.
    self.cache.move_to_end(key)

    # popitem() defaults to last=True, which would discard the entry that
    # was just stored; last=False removes the oldest entry instead.
    while self.cache_size() > CACHE_LIMIT_BYTES:
        self.cache.popitem(last=False)

async def close_session(self):
Expand All @@ -53,9 +66,12 @@ async def get_unstructured_content(self, file):
data = aiohttp.FormData()
data.add_field("files", file_data, filename=file_name)

# API key optional if self-hosted
headers = {} if self.api_key is None else {"unstructured-api-key": self.api_key}

async with self.session.post(
self.get_content_url,
headers=self.headers,
headers=headers,
data=data,
) as response:
content = await response.json()
Expand Down Expand Up @@ -91,13 +107,10 @@ def get_unstructured_client():
if unstructured is not None:
return unstructured

# Fetch environment variables
assert (
unstructured_base_url := app.config.get("UNSTRUCTURED_BASE_URL")
), "DROPBOX_UNSTRUCTURED_BASE_URL must be set"
assert (
api_key := app.config.get("UNSTRUCTURED_API_KEY")
), "DROPBOX_UNSTRUCTURED_API_KEY must be set"
api_key = app.config.get("UNSTRUCTURED_API_KEY", None)

unstructured = UnstructuredRequestSession(unstructured_base_url, api_key)

Expand Down

0 comments on commit 1e8415f

Please sign in to comment.