From 82c1625ec3cb2c54f8a40cb00ca03eade3206e1b Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 13 May 2024 17:27:44 +0200 Subject: [PATCH] Save other CI jobs' result (torch/tf pipeline, example, deepspeed etc) (#30699) * update * update * update * update * update * update * update * update * Update utils/notification_service.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: ydshieh Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .github/workflows/slack-report.yml | 15 +++++--- utils/notification_service.py | 41 +++++++++++++++------- utils/notification_service_quantization.py | 7 ++++ 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml index 77cfdc8c140241..0d1197a05d122a 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -60,12 +60,10 @@ jobs: # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. - name: Failure table artifacts - # Only the model testing job is concerned for this step - if: ${{ inputs.job == 'run_models_gpu' }} uses: actions/upload-artifact@v4 with: - name: ci_results - path: ci_results + name: ci_results_${{ inputs.job }} + path: ci_results_${{ inputs.job }} - uses: actions/checkout@v4 - uses: actions/download-artifact@v4 @@ -77,6 +75,7 @@ jobs: SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }} CI_EVENT: scheduled CI_SHA: ${{ github.sha }} + CI_TEST_JOB: ${{ inputs.job }} SETUP_STATUS: ${{ inputs.setup_status }} # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`. @@ -85,3 +84,11 @@ jobs: pip install slack_sdk pip show slack_sdk python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" + + # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. + - name: Failure table artifacts + if: ${{ inputs.job == 'run_quantization_torch_gpu' }} + uses: actions/upload-artifact@v4 + with: + name: ci_results_${{ inputs.job }} + path: ci_results_${{ inputs.job }} \ No newline at end of file diff --git a/utils/notification_service.py b/utils/notification_service.py index 0598278368cb4b..cf126cd68a3385 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -416,7 +416,7 @@ def per_model_sum(model_category_dict): reports=sorted_model_reports, to_truncate=False, ) - file_path = os.path.join(os.getcwd(), "ci_results/model_failures_report.txt") + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/model_failures_report.txt") with open(file_path, "w", encoding="UTF-8") as fp: fp.write(model_failures_report) @@ -426,18 +426,18 @@ def per_model_sum(model_category_dict): reports=sorted_module_reports, to_truncate=False, ) - file_path = os.path.join(os.getcwd(), "ci_results/module_failures_report.txt") + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/module_failures_report.txt") with open(file_path, "w", encoding="UTF-8") as fp: fp.write(module_failures_report) if self.prev_ci_artifacts is not None: - # if the last run produces artifact named `ci_results` + # if the last run produces artifact named `ci_results_{job_name}` if ( - "ci_results" in self.prev_ci_artifacts - and "model_failures_report.txt" in self.prev_ci_artifacts["ci_results"] + f"ci_results_{job_name}" in self.prev_ci_artifacts + and "model_failures_report.txt" in self.prev_ci_artifacts[f"ci_results_{job_name}"] ): # Compute the difference of the previous/current (model failure) table - prev_model_failures = self.prev_ci_artifacts["ci_results"]["model_failures_report.txt"] + prev_model_failures = self.prev_ci_artifacts[f"ci_results_{job_name}"]["model_failures_report.txt"] entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures) if len(entries_changed) > 0: # Save the complete difference @@ -447,7 +447,7 @@ def per_model_sum(model_category_dict): reports=entries_changed, to_truncate=False, ) - file_path = os.path.join(os.getcwd(), "ci_results/changed_model_failures_report.txt") + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/changed_model_failures_report.txt") with open(file_path, "w", encoding="UTF-8") as fp: fp.write(diff_report) @@ -643,8 +643,11 @@ def get_new_model_failure_blocks(self, with_header=True): sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0]) prev_model_results = {} - if "ci_results" in self.prev_ci_artifacts and "model_results.json" in self.prev_ci_artifacts["ci_results"]: - prev_model_results = json.loads(self.prev_ci_artifacts["ci_results"]["model_results.json"]) + if ( + f"ci_results_{job_name}" in self.prev_ci_artifacts + and "model_results.json" in self.prev_ci_artifacts[f"ci_results_{job_name}"] + ): + prev_model_results = json.loads(self.prev_ci_artifacts[f"ci_results_{job_name}"]["model_results.json"]) all_failure_lines = {} for job, job_result in sorted_dict: @@ -1139,20 +1142,32 @@ def prepare_reports(title, header, reports, to_truncate=True): with open(os.path.join(directory, "selected_warnings.json")) as fp: selected_warnings = json.load(fp) - if not os.path.isdir(os.path.join(os.getcwd(), "ci_results")): - os.makedirs(os.path.join(os.getcwd(), "ci_results")) + if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")): + os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}")) # Only the model testing job is concerned: this condition is to avoid other jobs to upload the empty list as # results. if job_name == "run_models_gpu": - with open("ci_results/model_results.json", "w", encoding="UTF-8") as fp: + with open(f"ci_results_{job_name}/model_results.json", "w", encoding="UTF-8") as fp: json.dump(model_results, fp, indent=4, ensure_ascii=False) + # Must have the same keys as in `additional_results`. + # The values are used as the file names where to save the corresponding CI job results. + test_to_result_name = { + "PyTorch pipelines": "torch_pipeline", + "TensorFlow pipelines": "tf_pipeline", + "Examples directory": "example", + "Torch CUDA extension tests": "deepspeed", + } + for job, job_result in additional_results.items(): + with open(f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", "w", encoding="UTF-8") as fp: + json.dump(job_result, fp, indent=4, ensure_ascii=False) + prev_ci_artifacts = None target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main" if os.environ.get("CI_WORKFLOW_REF") == target_workflow: # Get the last previously completed CI's failure tables - artifact_names = ["ci_results"] + artifact_names = [f"ci_results_{job_name}"] output_dir = os.path.join(os.getcwd(), "previous_reports") os.makedirs(output_dir, exist_ok=True) prev_ci_artifacts = get_last_daily_ci_reports( diff --git a/utils/notification_service_quantization.py b/utils/notification_service_quantization.py index 1687eeaa25f32f..6d026bc0d053dc 100644 --- a/utils/notification_service_quantization.py +++ b/utils/notification_service_quantization.py @@ -242,6 +242,13 @@ def post_reply(self): {"line": line, "trace": stacktraces.pop(0)} ) + job_name = os.getenv("CI_TEST_JOB") + if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")): + os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}")) + + with open(f"ci_results_{job_name}/quantization_results.json", "w", encoding="UTF-8") as fp: + json.dump(quantization_results, fp, indent=4, ensure_ascii=False) + message = QuantizationMessage( title, results=quantization_results,