diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3d763d4..23509b3 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -14,6 +14,12 @@ Change Log
 Unreleased
 ~~~~~~~~~~
 
+[3.4.0] - 2024-02-01
+~~~~~~~~~~~~~~~~~~~~
+Changed
+_______
+* GitHub Actions downloader script now pulls all run attempts and records them in JSON. The CLI has changed.
+
 [3.3.0] - 2024-01-23
 ~~~~~~~~~~~~~~~~~~~~
 Changed
diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py
index 2bf1c4b..37183df 100644
--- a/edx_arch_experiments/__init__.py
+++ b/edx_arch_experiments/__init__.py
@@ -2,4 +2,4 @@
 A plugin to include applications under development by the architecture team at 2U.
 """
 
-__version__ = '3.3.0'
+__version__ = '3.4.0'
diff --git a/edx_arch_experiments/scripts/get_action_errors.py b/edx_arch_experiments/scripts/get_action_errors.py
index df23791..bda479c 100755
--- a/edx_arch_experiments/scripts/get_action_errors.py
+++ b/edx_arch_experiments/scripts/get_action_errors.py
@@ -1,138 +1,262 @@
 """
-Script to get the annotations from all failed checks in edx-platform after a given date
+Script to get the annotations from all checks in a repo in a given date range.
 
-Gets all the commits to master after the date, then for each commit gets each check suite, then for each failed check
-suite gets each run. Collects the annotations for all the failed runs. The annotations will sometimes contain useful
-error messages, sometimes just the exit code. Getting the full logs requires admin permissions to edx-platform so it's
-not included in this script.
-Example output row:
-commit_date,run_started_at,run_completed_at,commit_hash,name,message
-2023-07-26T20:59:01Z,2023-07-27T06:56:23Z,2023-07-27T07:01:58Z,06e738e64a3485ecec037a9b8a36cf4ae145ea8a,
-upgrade-one-python-dependency-workflow,Process completed with exit code 2.
-
-
-This script takes a pretty long time to run (15m for 2 months) and there is a risk if you look too far back you will hit
-your API limit.
+Run with --help (or see docstring of `run`) for more details.
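+
+Example invocation (values illustrative; any owner/repo pair the token can
+read should work the same way):
+
+    GITHUB_TOKEN=ghp_XXXX python get_action_errors.py \
+        --owner openedx --repo edx-platform \
+        --start-date 2024-01-01 --end-date 2024-01-31 \
+        --output-dir action-data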
 """
 
-from csv import DictWriter
+import json
+import os
+import time
 from datetime import datetime
+from os import path
 
 import click
 import requests
 
 
-@click.command()
-@click.option('--token', envvar='GITHUB_TOKEN')
-@click.option('--start_date', type=click.DateTime(formats=["%Y-%m-%d"]), help="Date of earliest commit")
-@click.option('--filename', help="Where to write the data")
-def get_errors_from_date(token, start_date, filename):
+def _ensure_dir(base, *more):
     """
-    Creates a csv documenting the annotations from all failed runs for commits to edx-platform after the given date
+    Join the given path segments, create the directory if it doesn't already exist, and return the path.
+    """
+    subdir = path.join(base, *more)
+    os.makedirs(subdir, exist_ok=True)
+    return subdir
 
-    Parameters:
-        token (string): The GitHub API token. Retrieved from the env GITHUB_TOKEN variable
-        start_date (date): The earliest date to look for
-        filename (string): Where to write the csv
-    """
-    headers = {'Authorization': f"Bearer {token}"}
-    all_commits_after_date = get_commits_after_date(start_date, headers=headers)
-    all_check_suites = []
-    all_rows = []
-    for commit in all_commits_after_date:
-        # gather all the check suite data from each commit into a single list
-        add_commit_check_suites(commit, all_check_suites, headers)
-    for check_suite in all_check_suites:
-        # only record annotations for failed runs
-        if check_suite['conclusion'] == 'failure':
-            check_runs = requests.get(check_suite['check_runs_url'], headers=headers).json()
-            for run in check_runs['check_runs']:
-                if run['conclusion'] == 'failure' and run['output']['annotations_count'] > 0:
-                    annotations = requests.get(run['output']['annotations_url'], headers=headers).json()
-                    for annotation in annotations:
-                        all_rows.append({
-                            'commit_hash': run['head_sha'],
-                            'name': run['name'],
-                            'message': annotation['message'],
-                            'run_started_at': run['started_at'],
-                            'run_completed_at': run['completed_at'],
-                            'commit_date': check_suite['commit_date']
-                        })
-
-    with open(filename, 'w') as f:
-        writes = DictWriter(f, fieldnames=['commit_date', 'run_started_at', 'run_completed_at', 'commit_hash', 'name',
-                                           'message'])
-        writes.writeheader()
-        writes.writerows(all_rows)
-
-
-def get_commits_after_date(cut_off_date, headers):
-    """
-    Get API data for all commits to edx-platform/master after the given date
+class ActionsDownloader:
+    """
+    Downloads workflow run attempts, check runs, and annotations for a repo.
+    """
 
-    Parameters:
-        cut_off_date (date): Earliest date to look
-        headers (dict): Authentication headers for GH requests
+    def __init__(self, *, output_dir, token):
+        self.output_dir = output_dir
+        self.api_headers = {
+            'Accept': 'application/vnd.github+json',
+            'Authorization': f"Bearer {token}",
+            'X-GitHub-Api-Version': '2022-11-28',
+        }
 
-    Returns:
-        A list of all the API responses for each commit after the date
-    """
-    base_url = "https://api.github.com/repos/openedx/edx-platform/commits?sha=master&per_page=100"
-    # will keep track of whether we've hit our start_date. the API automatically returns commits ordered
-    # by date, descending
-    found_last = False
-    all_commits_after_date = []
-    page = 1
-    while not found_last:
-        page_url = f"{base_url}&page={page}"
-        print(f"Fetching page {page_url}")
-        response = requests.get(page_url, headers=headers)
-        if response.status_code >= 400:
-            print(response)
-            break
-        response_json = response.json()
-        if len(response_json) == 0:
-            break
-        for single_commit in response_json:
-            # if present, take off the "Z" at the end of the date to make it proper ISO format
-            commit_date = datetime.fromisoformat(single_commit['commit']['committer']['date'].replace("Z", ""))
-            if commit_date < cut_off_date:
-                found_last = True
-                break
-            all_commits_after_date.append(single_commit)
-        page += 1
-    return all_commits_after_date
-
-
-def add_commit_check_suites(current_commit, current_suites, headers):
+        # Unix epoch timestamp in seconds when GitHub will allow us to
+        # resume making requests (or None if not rate-limited)
+        self.github_sleep_until_s = None
+
+        # The actual contents don't really matter, but empty files might
+        # be confusing and maybe the fetch date will be useful.
+        self.download_marker_data = {
+            'fetch_run_timestamp': datetime.utcnow().isoformat(),
+        }
+        self.workflow_fetch_params = {
+            # We don't want the response to include all the PRs that include this
+            # commit. That list can be large for active repos and it doesn't help us.
+            'exclude_pull_requests': 'true',
+        }
+
+    def _github_get(self, url, *, params=None):
+        """
+        GET the url with GitHub auth token and return a `requests` response.
+
+        Performs both proactive backoff based on response header hints and
+        reactive backoff based on error responses.
+
+        Docs:
+        - https://docs.github.com/en/rest/using-the-rest-api/best-practices-for-using-the-rest-api?apiVersion=2022-11-28
+        - https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28
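+
+        The headers consulted here look roughly like this (values illustrative):
+
+            x-ratelimit-remaining: 0
+            x-ratelimit-reset: 1706745600
+
+        where `x-ratelimit-reset` is a Unix epoch timestamp in seconds.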
+        """
+        backoff_s = None  # exponential backoff in seconds, or None if not in effect
+        while True:
+            # If GitHub has told us how long to wait, use that instead of the
+            # current exponential backoff value.
+            if self.github_sleep_until_s is not None:
+                # Add slop to prevent tight loop on expiry
+                sleep_s = self.github_sleep_until_s + 5.0 - time.time()
+            else:
+                sleep_s = backoff_s
+
+            if sleep_s and sleep_s > 0:
+                time.sleep(sleep_s)
+
+            response = requests.get(url, params=params, headers=self.api_headers, timeout=20.0)
+
+            # Update rate-limiting data for next call
+            if out_of_requests := (response.headers.get('x-ratelimit-remaining') == '0'):
+                self.github_sleep_until_s = int(response.headers.get('x-ratelimit-reset'))
+                print(
+                    "Reached rate limit. "
+                    f"Will wait {int(self.github_sleep_until_s - time.time())} seconds "
+                    "before next request."
+                )
+            else:
+                self.github_sleep_until_s = None
+                backoff_s = None
+
+            if response.status_code == 200:
+                # We're good to go!
+                return response
+            elif out_of_requests:
+                # We got an error and have been informed we're out of requests.
+                # (Should be a 429 or 403 according to GitHub's docs.) We'll try
+                # again.
+                continue
+            elif response.status_code == 429:
+                # It's possible that GitHub might give us a 429 without the
+                # expected rate-limiting headers.
+                print("Rate-limited without timing hint; performing exponential backoff.")
+                backoff_s = 2 * backoff_s if backoff_s else 4
+                continue
+            else:
+                # Generic error case
+                response.raise_for_status()
+
+    def _write_json(self, data, *path_parts):
+        """
+        Write data as pretty-printed JSON to the given path (joining as needed).
+        """
+        with open(path.join(*path_parts), 'w') as f:
+            json.dump(data, f, sort_keys=True, indent=2)
+
+    def _log_attempt(self, attempt, workflow_dir):
+        """
+        Write out this attempt and all of its checks and annotations.
+        """
+        attempt_dir = _ensure_dir(workflow_dir, f"attempt_{attempt['run_attempt']}")
+
+        attempt_file = path.join(attempt_dir, "attempt.json")
+        if path.isfile(attempt_file):
+            print("Attempt already fully downloaded; skipping.")
+            return
+
+        # Get the checks associated with this workflow run -- this
+        # includes output title, summary, and text.
+        for check_run in self._github_get(attempt['check_suite_url'] + '/check-runs').json()['check_runs']:
+            self._write_json(check_run, attempt_dir, f"check_run_{check_run['id']}.json")
+
+            annotations = self._github_get(check_run['output']['annotations_url']).json()
+            self._write_json(annotations, attempt_dir, f"annotations_{check_run['id']}.json")
+
+        # Do this last, indicating that the attempt was completely
+        # downloaded. This allows us to skip it next time.
+        self._write_json(attempt, attempt_file)
+
+    def _list_all_attempts(self, run):
+        """
+        Yield the given workflow run attempt and all previous attempts of that run.
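+
+        For example, if a run has three attempts, GitHub returns the document
+        for attempt 3; following `previous_attempt_url` twice yields attempts
+        2 and 1, so this generator produces three documents, newest first.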
+        """
+        yield run
+        while next_url := run.get('previous_attempt_url'):
+            resp = self._github_get(next_url, params=self.workflow_fetch_params)
+            run = resp.json()
+            yield run
+
+    def _download_workflow_run(self, run, repo_dir):
+        print(f"Downloading workflow run id={run['id']}")
+
+        workflow_dir = _ensure_dir(repo_dir, f"run_{run['id']}")
+
+        download_marker = path.join(workflow_dir, "download-marker.json")
+        if path.isfile(download_marker):
+            print("Workflow already fully downloaded; skipping.")
+            return
+
+        # We're getting the *most recent attempt* of a run. Spool
+        # out the whole list of attempts and write them out.
+        for attempt in self._list_all_attempts(run):
+            self._log_attempt(attempt, workflow_dir)
+
+        # Once all attempts have been logged, write out a marker file
+        # that indicates this workflow has been completely downloaded
+        # and can be skipped in the future.
+        self._write_json(self.download_marker_data, download_marker)
+
+    def _list_completed_runs(self, owner, repo, start_date, end_date):
+        """
+        Yield all completed workflow runs in the given date range.
+        """
+        # https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28#list-workflow-runs-for-a-repository
+        runs_url = f"https://api.github.com/repos/{owner}/{repo}/actions/runs"
+        while runs_url is not None:
+            params = {
+                **self.workflow_fetch_params,
+                # Filter on status=completed to get all workflows that have finished
+                # running. The API docs allow you to use a status *or* a conclusion
+                # here, but don't explain their relationship. The check run API docs
+                # seem to cover basically the same values, though:
+                # https://docs.github.com/en/rest/guides/using-the-rest-api-to-interact-with-checks?apiVersion=2022-11-28#about-check-runs
+                'status': 'completed',
+                # https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax#query-for-dates
+                'created': f'{start_date}..{end_date}',
+                'per_page': 100,
+            }
+            print(f"Requesting {runs_url} with {params=!r}")
+            resp = self._github_get(runs_url, params=params)
+            yield from resp.json()['workflow_runs']
+            runs_url = resp.links.get('next', {}).get('url')
+
+    def download(self, owner, repo, start_date, end_date):
+        """
+        Download data for all completed workflow runs in the date range.
+        """
+        repo_dir = _ensure_dir(self.output_dir, owner, repo)
+
+        for workflow_run in self._list_completed_runs(owner, repo, start_date, end_date):
+            self._download_workflow_run(workflow_run, repo_dir)
+
+
+@click.command()
+@click.option(
+    '--token', envvar='GITHUB_TOKEN',
+    required=True,
+    help="A GitHub access token that has access to the repository.",
+)
+@click.option(
+    '--output-dir', type=click.Path(file_okay=False, dir_okay=True, writable=True),
+    required=True,
+    help="A directory (or path where one can be created) where the output will be written.",
+)
+@click.option(
+    '--owner', type=str, required=True,
+    help="Owning user or organization of the repo, e.g. openedx.",
+)
+@click.option(
+    '--repo', type=str, required=True,
+    help="Repo shortname, e.g. edx-platform.",
+)
+@click.option(
+    '--start-date', type=click.DateTime(formats=["%Y-%m-%d"]), required=True,
+    help="Only fetch workflow runs created on or after this date.",
+)
+@click.option(
+    '--end-date', type=click.DateTime(formats=["%Y-%m-%d"]), required=True,
+    help="Only fetch workflow runs created up through this date.",
+)
+def run(*, token, output_dir, owner, repo, start_date, end_date):
     """
-    Add API information from all check suites performed for a given commit to the given list
+    Fetch information about workflows and check outcomes for a repository
+    within some date range, writing the output to a directory.
+
+    This script will fetch:
+
+    \b
+    - Workflow runs, including prior attempts
+    - Check runs associated with the attempts
+    - Annotations produced by the checks
+
+    The output directory will contain subdirectories of the form
+    `{OWNER}/{REPO}/run_#/attempt_#/` for each attempt of a workflow run.
+    (run_# is numbered by workflow run ID, and attempt_# by attempt index.)
+    The run_# directory will also contain a `download-marker.json` which
+    indicates that all information about the workflow run was successfully
+    downloaded. (If missing, this indicates a partial download.)
+
+    Each attempt directory will contain:
 
-    Parameters:
-        current_commit (str): the SHA of the commit to check
-        current_suites (list): list to be extended
-        headers (dict): Authentication headers for connecting to GitHub
+    \b
+    - attempt.json: Information about the workflow run. The documents for each
+      attempt of a workflow run will be largely the same.
+    - check_run_#.json: One of the checks associated with the attempt, numbered
+      by check-run ID.
+    - annotations_#.json: Annotations associated with the check run of that ID.
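+
+    Example layout (IDs are illustrative):
+
+    \b
+        openedx/edx-platform/run_7654321098/
+            download-marker.json
+            attempt_1/
+                attempt.json
+                check_run_21098765432.json
+                annotations_21098765432.json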
     """
-    sha = current_commit['sha']
-    check_url = f"https://api.github.com/repos/openedx/edx-platform/commits/{sha}/check-suites?per_page=100"
-    page = 1
-    while True:
-        # Keep going until we get an empty check_suites list or an error. An empty list means we've hit the last page.
-        paginated_url = f"{check_url}&page={page}"
-        print(f"Fetching page {paginated_url}")
-        response = requests.get(paginated_url, headers=headers).json()
-        if 'check_suites' not in response.keys():
-            print(response)
-            break
-        check_suites = response['check_suites']
-        if len(check_suites) == 0:
-            break
-        # silly line to pass the date of the commit along to eventually write in the spreadsheet
-        current_suites.extend([{**s, 'commit_date': current_commit['commit']['committer']['date']}
-                               for s in check_suites])
-        page += 1
+    dl = ActionsDownloader(output_dir=output_dir, token=token)
+    dl.download(
+        owner, repo,
+        # We just want the Y-m-d part
+        start_date.date().isoformat(), end_date.date().isoformat(),
+    )
 
 
 if __name__ == '__main__':
-    get_errors_from_date()
+    run()