diff --git a/main.py b/main.py
index a80e56c..c6b826c 100755
--- a/main.py
+++ b/main.py
@@ -2,25 +2,25 @@
 """Open issues in repos for errors in publiccode.yml."""
 
 import argparse
-import os
 import hashlib
-import re
-import requests
 import logging
-
-import jinja2
-import github
-
+import os
+import re
+import sys
+import time
 from collections import namedtuple
-from datetime import datetime, timezone, timedelta
+from datetime import datetime, timedelta, timezone
 from urllib.parse import urlparse
 
-API_BASEURL = os.getenv('API_BASEURL', 'https://api.developers.italia.it/v1')
-GITHUB_USERNAME = os.getenv('GITHUB_USERNAME', 'publiccode-validator-bot')
+import github
+import jinja2
+import requests
+
+API_BASEURL = os.getenv("API_BASEURL", "https://api.developers.italia.it/v1")
+GITHUB_USERNAME = os.getenv("GITHUB_USERNAME", "publiccode-validator-bot")
 
 SoftwareLog = namedtuple("SoftwareLog", "log_url software formatted_error_output")
 
-# logging.basicConfig(level=logging.DEBUG)
 
 def to_markdown(error: str, repo_url: str) -> str:
     out = "| |Message|\n|-|-|\n"
@@ -30,21 +30,48 @@ def to_markdown(error: str, repo_url: str) -> str:
     }
 
     for line in error.splitlines():
-        m = re.match(r"^publiccode.yml:(?P<line_num>\d+):\d+: (?P<level>.*?): (?P<msg>.*)", line)
+        m = re.match(
+            r"^publiccode.yml:(?P<line_num>\d+):\d+: (?P<level>.*?): (?P<msg>.*)", line
+        )
 
         if m is None:
             out += f"|❌|`{line}`|\n"
             continue
 
-        line_num = m.group('line_num')
-        level = m.group('level')
-        msg = m.group('msg')
+        line_num = m.group("line_num")
+        level = m.group("level")
+        msg = m.group("msg")
 
-        repo_url = repo_url.removesuffix('.git')
+        repo_url = repo_url.removesuffix(".git")
         out += f"|{icons.get(level, '')} [`publiccode.yml:{line_num}`]({repo_url}/blob/HEAD/publiccode.yml#L{line_num})| `{msg}`|\n"
 
     return out
 
+
+def software():
+    software = []
+
+    page = True
+    page_after = ""
+
+    while page:
+        res = requests.get(
+            f"{API_BASEURL}/software{page_after}",
+            params={
+                "page[size]": 100,
+            },
+        )
+        res.raise_for_status()
+
+        body = res.json()
+        software += body["data"]
+
+        page_after = body["links"]["next"]
+        page = bool(page_after)
+
+    return software
+
+
 def software_logs(days):
     logs = []
 
@@ -55,36 +82,45 @@ def software_logs(days):
 
     while page:
         res = requests.get(
-            f'{API_BASEURL}/logs{page_after}',
+            f"{API_BASEURL}/logs{page_after}",
             params={
-                'from': begin.strftime('%Y-%m-%dT%H:%M:%SZ'),
-                'page[size]': 100,
-            }
+                "from": begin.strftime("%Y-%m-%dT%H:%M:%SZ"),
+                "page[size]": 100,
+            },
        )
         res.raise_for_status()
 
         body = res.json()
-        logs += body['data']
+        logs += body["data"]
 
-        page_after = body['links']['next']
+        page_after = body["links"]["next"]
        page = bool(page_after)
 
     # XXX: get last for repo
     for log in logs:
-        match = re.search(r"^\[(?P<prefix>.+?)\] BAD publiccode.yml: (?P<parser_output>.*)", log['message'], re.MULTILINE|re.DOTALL)
+        match = re.search(
+            r"^\[(?P<prefix>.+?)\] BAD publiccode.yml: (?P<parser_output>.*)",
+            log["message"],
+            re.MULTILINE | re.DOTALL,
+        )
         if match:
-            entity = log.get('entity')
+            entity = log.get("entity")
             if not entity or entity == "//":
                 continue
 
-            res = requests.get(f'{API_BASEURL}{entity}')
+            res = requests.get(f"{API_BASEURL}{entity}")
             res.raise_for_status()
             software = res.json()
 
-            yield SoftwareLog(f'{API_BASEURL}/logs/{log["id"]}', software, to_markdown(match.group('parser_output'), software['url']))
+            yield SoftwareLog(
+                f'{API_BASEURL}/logs/{log["id"]}',
+                software,
+                to_markdown(match.group("parser_output"), software["url"]),
+            )
+
 
 def issues(gh: github.Github, repo: str) -> list[github.Issue.Issue]:
     issues = gh.search_issues(
@@ -92,19 +128,29 @@ def issues(gh: github.Github, repo: str) -> list[github.Issue.Issue]:
         "updated",
         "desc",
         repo=repo,
-        author=GITHUB_USERNAME
+        author=GITHUB_USERNAME,
     )
 
     return [issue for issue in issues]
 
+
+def issues_by_state(
+    issues: list[github.Issue.Issue], state: str
+) -> list[github.Issue.Issue]:
+    return list(filter(lambda issue: issue.state == state, issues))
+
+
 def has_issue(issues: list[github.Issue.Issue]) -> github.Issue.Issue | None:
     return next(iter(issues), False)
 
+
 def has_open_issue(issues: list[github.Issue.Issue]) -> github.Issue.Issue | None:
-    return next(filter(lambda issue: issue.state == 'open', issues), None)
+    return next(filter(lambda issue: issue.state == "open", issues), None)
+
 
 def has_closed_issue(issues: list[github.Issue.Issue]) -> github.Issue.Issue | None:
-    return next(filter(lambda issue: issue.state == 'closed', issues), None)
+    return next(filter(lambda issue: issue.state == "closed", issues), None)
+
 
 def should_update_issue(sha1sum: str, issue: github.Issue.Issue) -> bool:
     compact = list(filter(None, issue.body.splitlines()))
@@ -115,62 +161,128 @@ def should_update_issue(sha1sum: str, issue: github.Issue.Issue) -> bool:
         print(f"WARNING: can't find hidden_fields in issue {issue.html_url}")
         return False
 
-    data = re.split("[,=]", m.group('data'))
+    data = re.split("[,=]", m.group("data"))
     hidden_fields = dict(zip(data[::2], data[1::2]))
 
-    return issue.state == 'open' or hidden_fields['sha1sum'] != sha1sum
+    return issue.state == "open" or hidden_fields["sha1sum"] != sha1sum
 
 
-def main():
-    parser = argparse.ArgumentParser(
-        description="Open issues in repos for errors in publiccode.yml from the logs in Developers Italia API.",
-    )
-    parser.add_argument(
-        '--since', action="store", dest="since", type=int,
-        default=1, help="Number of days to go back for the analyzed logs (default: 1)"
-    )
-    args = parser.parse_args()
-    gh = github.Github(os.getenv("BOT_GITHUB_TOKEN"))
-    # github.enable_console_debug_logging()
+def status(gh):
+    issues = gh.search_issues(f"author:{GITHUB_USERNAME} type:issue sort:updated")
+
+    for issue in issues:
+        print(f"{issue.state}\t{issue.updated_at.isoformat()}\t{issue.html_url}")
+
 
+def run(gh, since, dry_run):
     environment = jinja2.Environment(loader=jinja2.FileSystemLoader("templates/"))
     template = environment.get_template("github/issue.it.tpl.txt")
 
-    for log in software_logs(args.since):
-        if not log.software['url'].startswith("https://github.com/"):
-            print(f"{log.software['url']} is not a GitHub repo. Only GitHub is supported for now.")
+    issues_created = 0
+    issues_updated = 0
+    issues_untouched = 0
+
+    for log in software_logs(since):
+        url = log.software["url"].lower()
+        if not url.startswith("https://github.com/"):
+            print(f"🚫 {url} is not a GitHub repo. Only GitHub is supported for now.")
             continue
 
-        if log.software['url'] != 'https://github.com/teamdigitale/dati-semantic-lode.git' and \
-           log.software['url'] != 'https://github.com/KnowageLabs/Knowage-Server.git':
-            print(f"Skipping {log.software['url']}")
+        if "codiceIPA is " in log.formatted_error_output:
+            print(f"skipping {url} (codiceIPA is...)")
             continue
 
-        debug = urlparse(log.software['url']).path
-        sha1sum = hashlib.sha1(log.formatted_error_output.encode('utf-8')).hexdigest()
+        debug = urlparse(url).path
+        sha1sum = hashlib.sha1(log.formatted_error_output.encode("utf-8")).hexdigest()
 
         content = template.render(
-            {"formatted_error_output": log.formatted_error_output,
-             "debug": debug,
-             "api_log_url": log.log_url,
-             "hidden_fields": {
-                 "sha1sum": sha1sum
-             }},
+            {
+                "formatted_error_output": log.formatted_error_output,
+                "debug": debug,
+                "api_log_url": log.log_url,
+                "hidden_fields": {"sha1sum": sha1sum},
+            },
         )
 
-        repo_path = urlparse(log.software['url']).path[1:].removesuffix('.git')
-        iss = issues(gh, repo_path)
-
-        issue = has_issue(iss)
-        if not issue:
-            repo = gh.get_repo(f"{repo_path}", lazy=True)
-            print(f"Creating issue for {log.software['url']}...")
-            repo.create_issue(title="Errors in publiccode.yml file", body=content)
-        elif should_update_issue(sha1sum, issue):
-            print(f"Updating issue for {log.software['url']}...")
-            issue.edit(body=content)
-        else:
-            print("Doing nothing")
+        repo_path = urlparse(url).path[1:].removesuffix(".git")
+
+        try:
+            iss = issues(gh, repo_path)
+
+            issue = has_issue(iss)
+            if not issue:
+                repo = gh.get_repo(f"{repo_path}", lazy=True)
+                print(f"➕ Creating issue for {url}...")
+                if not dry_run:
+                    repo.create_issue(
+                        title="Errors in publiccode.yml file", body=content
+                    )
+
+                issues_created += 1
+            elif should_update_issue(sha1sum, issue):
+                print(f"🔄 Updating issue for {url}...")
+                if not dry_run:
+                    issue.edit(body=content)
+
+                issues_updated += 1
+            else:
+                print(f"== Issue is open and unchanged, doing nothing ({url})")
+                issues_untouched += 1
+        except github.GithubException as e:
+            if isinstance(e, github.RateLimitExceededException):
+                reset_timestamp = int(e.headers.get("x-ratelimit-reset", 0))
+                sleep_duration = max(0, reset_timestamp - int(time.time()) + 10)
+
+                print(f"Rate limit exceeded. Sleeping for {sleep_duration} seconds...", end="")
+                sys.stdout.flush()
+                time.sleep(sleep_duration)
+                print("done")
+            else:
+                print(f"Error in the GitHub request, repo={url}: {e}")
+
+            continue
+
+    print("==== Summary ====")
+    print(f"➕ Created issues:\t{issues_created}")
+    print(f"🔄 Updated issues:\t{issues_updated}")
+    print(f"== Untouched issues:\t{issues_untouched}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Open issues in repos for errors in publiccode.yml from the logs in Developers Italia API.",
+    )
+    parser.add_argument(
+        "--since",
+        action="store",
+        dest="since",
+        type=int,
+        default=1,
+        help="Number of days to go back for the analyzed logs (default: 1)",
+    )
+    parser.add_argument(
+        "-n",
+        "--dry-run",
+        action="store_true",
+        dest="dry_run",
+        default=False,
+        help="Don't actually create or update issues, just print",
+    )
+
+    subparsers = parser.add_subparsers(dest="command")
+    subparsers.add_parser("status")
+
+    args = parser.parse_args()
+
+    gh = github.Github(os.getenv("BOT_GITHUB_TOKEN"))
+
+    # logging.basicConfig(level=logging.DEBUG)
+    # github.enable_console_debug_logging()
+
+    if args.command == "status":
+        status(gh)
+    else:
+        run(gh, args.since, args.dry_run)
+
 
 if __name__ == "__main__":
     main()