Disable archive.org queries after timeout #347

Merged · 3 commits · Oct 20, 2024
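In brief: the timeout flag moves out of InternetArchiveBatchedQuery. Previously get_result caught ConnectTimeout itself and set a connect_timed_out flag that short-circuited every later call; now the exception propagates, and DownloadCounter.get_counts responds to the first timeout by setting its ia_query reference to None, so every later archive.org download is skipped without issuing further requests. Below is a minimal standalone sketch of that pattern; BatchedQuery, fetch_counts, BATCH_SIZE, and the example URL are illustrative stand-ins, not this repo's actual API.

import logging
from typing import Dict, List, Optional

import requests
from requests.exceptions import ConnectTimeout

BATCH_SIZE = 100  # illustrative bound, not the repo's real constant


class BatchedQuery:
    """Accumulates identifiers, then fetches all their counts in one request."""

    def __init__(self) -> None:
        self.ids: List[str] = []

    def add(self, ident: str) -> None:
        self.ids.append(ident)

    def full(self) -> bool:
        return len(self.ids) >= BATCH_SIZE

    def empty(self) -> bool:
        return not self.ids

    def get_result(self, counts: Dict[str, int]) -> None:
        # Hypothetical endpoint assumed to return {ident: count};
        # a ConnectTimeout raised here propagates to the caller
        resp = requests.get('https://example.org/api?ids=' + ','.join(self.ids),
                            timeout=60)
        counts.update(resp.json())


def fetch_counts(idents: List[str]) -> Dict[str, int]:
    counts: Dict[str, int] = {}
    query: Optional[BatchedQuery] = BatchedQuery()
    for ident in idents:
        if query:  # skipped entirely once the service has timed out
            query.add(ident)
            if query.full():
                try:
                    query.get_result(counts)
                    query = BatchedQuery()  # start a fresh batch on success
                except ConnectTimeout as exc:
                    # Cleanly turn off counting while the downtime continues
                    logging.error('Timed out, disabling further queries',
                                  exc_info=exc)
                    query = None
    if query and not query.empty():
        query.get_result(counts)  # final partial batch, as in the diff below
    return counts

The upside over the old flag is that the querier stays a plain batch container while the caller owns the disable policy, and the Optional annotation makes the disabled state visible to mypy.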
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 33 additions & 34 deletions netkan/netkan/download_counter.py
@@ -16,7 +16,9 @@
 from .metadata import Ckan
 
 
-class GraphQLQuery:
+class GitHubBatchedQuery:
+
+    PATH_PATTERN = re.compile(r'^/([^/]+)/([^/]+)')
 
     # The URL that handles GitHub GraphQL requests
     GITHUB_API = 'https://api.github.com/graphql'
@@ -137,6 +139,8 @@ def sum_graphql_result(self, apidata: Dict[str, Any]) -> int:
 
 class SpaceDockBatchedQuery:
 
+    PATH_PATTERN = re.compile(r'^/mod/([^/]+)')
+
     SPACEDOCK_API = 'https://spacedock.info/api/download_counts'
 
     def __init__(self) -> None:
@@ -184,7 +188,6 @@ class InternetArchiveBatchedQuery:
 
     def __init__(self) -> None:
         self.ids: Dict[str, str] = {}
-        self.connect_timed_out = False
 
     def empty(self) -> bool:
         return len(self.ids) == 0
@@ -198,28 +201,21 @@ def add(self, ckan: Ckan) -> None:
     def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
         if counts is None:
             counts = {}
-        if self.connect_timed_out:
-            return counts
-        try:
-            result = requests.get(self.IARCHIVE_API + ','.join(self.ids.values()),
-                                  timeout=60).json()
-            for ckan_ident, ia_ident in self.ids.items():
-                try:
-                    counts[ckan_ident] = counts.get(ckan_ident, 0) + result[ia_ident]['all_time']
-                except KeyError as exc:
-                    logging.error('InternetArchive id not found in downloads result: %s',
-                                  ia_ident, exc_info=exc)
-            return counts
-        except ConnectTimeout as exc:
-            # Cleanly turn off archive.org counting while the downtime continues
-            logging.error('Failed to get counts from archive.org',
-                          exc_info=exc)
-            self.connect_timed_out = True
-            return counts
+        result = requests.get(self.IARCHIVE_API + ','.join(self.ids.values()),
+                              timeout=60).json()
+        for ckan_ident, ia_ident in self.ids.items():
+            try:
+                counts[ckan_ident] = counts.get(ckan_ident, 0) + result[ia_ident]['all_time']
+            except KeyError as exc:
+                logging.error('InternetArchive id not found in downloads result: %s',
+                              ia_ident, exc_info=exc)
+        return counts
 
 
 class SourceForgeQuerier:
 
+    PATH_PATTERN = re.compile(r'^/project/([^/]+)')
+
     # https://sourceforge.net/p/forge/documentation/Download%20Stats%20API/
     API_TEMPLATE = Template('https://sourceforge.net/projects/${proj_id}/files/stats/json'
                             '?start_date=2010-01-01&end_date=${today}'
@@ -245,10 +241,6 @@ def get_result(cls, ident: str, proj_id: str,
 
 class DownloadCounter:
 
-    GITHUB_PATH_PATTERN = re.compile(r'^/([^/]+)/([^/]+)')
-    SPACEDOCK_PATH_PATTERN = re.compile(r'^/mod/([^/]+)')
-    SOURCEFORGE_PATH_PATTERN = re.compile(r'^/project/([^/]+)')
-
     def __init__(self, game_id: str, ckm_repo: CkanMetaRepo, github_token: str) -> None:
         self.game_id = game_id
         self.ckm_repo = ckm_repo
@@ -260,17 +252,17 @@ def __init__(self, game_id: str, ckm_repo: CkanMetaRepo, github_token: str) -> None:
         )
 
     def get_counts(self) -> None:
-        graph_query = GraphQLQuery(self.github_token)
+        graph_query = GitHubBatchedQuery(self.github_token)
         sd_query = SpaceDockBatchedQuery()
-        ia_query = InternetArchiveBatchedQuery()
+        ia_query: Optional[InternetArchiveBatchedQuery] = InternetArchiveBatchedQuery()
         for ckan in self.ckm_repo.all_latest_modules():  # pylint: disable=too-many-nested-blocks
             if ckan.kind == 'dlc':
                 continue
             for download in ckan.downloads:
                 try:
                     url_parse = urllib.parse.urlparse(download)
                     if url_parse.netloc == 'github.com':
-                        match = self.GITHUB_PATH_PATTERN.match(url_parse.path)
+                        match = GitHubBatchedQuery.PATH_PATTERN.match(url_parse.path)
                         if match:
                             # Process GitHub modules together in big batches
                             graph_query.add(ckan.identifier, *match.groups())
@@ -280,20 +272,27 @@ def get_counts(self) -> None:
                                 # Clear request list
                                 graph_query.clear()
                     elif url_parse.netloc == 'spacedock.info':
-                        match = self.SPACEDOCK_PATH_PATTERN.match(url_parse.path)
+                        match = SpaceDockBatchedQuery.PATH_PATTERN.match(url_parse.path)
                         if match:
                             # Process SpaceDock modules together in one huge batch
                             sd_query.add(ckan.identifier, int(match.group(1)))
                         else:
                             logging.error('Failed to parse SD URL for %s: %s',
                                           ckan.identifier, download)
                     elif url_parse.netloc == 'archive.org':
-                        ia_query.add(ckan)
-                        if ia_query.full():
-                            ia_query.get_result(self.counts)
-                            ia_query = InternetArchiveBatchedQuery()
+                        if ia_query:
+                            ia_query.add(ckan)
+                            if ia_query.full():
+                                try:
+                                    ia_query.get_result(self.counts)
+                                    ia_query = InternetArchiveBatchedQuery()
+                                except ConnectTimeout as exc:
+                                    # Cleanly turn off archive.org counting while the downtime continues
+                                    logging.error('Failed to get counts from archive.org',
+                                                  exc_info=exc)
+                                    ia_query = None
                     elif url_parse.netloc.endswith('.sourceforge.net'):
-                        match = self.SOURCEFORGE_PATH_PATTERN.match(url_parse.path)
+                        match = SourceForgeQuerier.PATH_PATTERN.match(url_parse.path)
                         if match:
                             SourceForgeQuerier.get_result(ckan.identifier, match.group(1),
                                                           self.counts)
@@ -310,7 +309,7 @@ def get_counts(self) -> None:
         if not graph_query.empty():
             # Final pass doesn't overflow the bound
             graph_query.get_result(self.counts)
-        if not ia_query.empty():
+        if ia_query and not ia_query.empty():
             ia_query.get_result(self.counts)
 
     def write_json(self) -> None:
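The rest of the diff moves each service's path regex from DownloadCounter class attributes onto the querier class that owns it, so the dispatch in get_counts reads the pattern from the class it is about to feed. A hedged sketch of that routing, with the regexes copied from the diff and a hypothetical route() helper standing in for the real dispatch loop:

import re
import urllib.parse
from typing import Optional, Tuple


class GitHubBatchedQuery:
    PATH_PATTERN = re.compile(r'^/([^/]+)/([^/]+)')


class SpaceDockBatchedQuery:
    PATH_PATTERN = re.compile(r'^/mod/([^/]+)')


class SourceForgeQuerier:
    PATH_PATTERN = re.compile(r'^/project/([^/]+)')


def route(download: str) -> Optional[Tuple[str, Tuple[str, ...]]]:
    """Return (service, captured groups) for a download URL, else None."""
    url_parse = urllib.parse.urlparse(download)
    if url_parse.netloc == 'github.com':
        match = GitHubBatchedQuery.PATH_PATTERN.match(url_parse.path)
        if match:
            return ('github', match.groups())
    elif url_parse.netloc == 'spacedock.info':
        match = SpaceDockBatchedQuery.PATH_PATTERN.match(url_parse.path)
        if match:
            return ('spacedock', match.groups())
    elif url_parse.netloc.endswith('.sourceforge.net'):
        match = SourceForgeQuerier.PATH_PATTERN.match(url_parse.path)
        if match:
            return ('sourceforge', match.groups())
    return None


# e.g. route('https://github.com/KSP-CKAN/CKAN/releases/download/v1.0/x.zip')
# -> ('github', ('KSP-CKAN', 'CKAN'))

Keeping each pattern next to the class that consumes its captures means a new hosting service only touches one querier class plus one elif branch in get_counts.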