Save other CI jobs' result (torch/tf pipeline, example, deepspeed etc) (#30699)

* update

* update

* update

* update

* update

* update

* update

* update

* Update utils/notification_service.py

Co-authored-by: amyeroberts <[email protected]>

---------

Co-authored-by: ydshieh <[email protected]>
Co-authored-by: amyeroberts <[email protected]>
3 people authored May 13, 2024
1 parent 2e27291 commit 82c1625
Showing 3 changed files with 46 additions and 17 deletions.
15 changes: 11 additions & 4 deletions .github/workflows/slack-report.yml
@@ -60,12 +60,10 @@ jobs:
       # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
       - name: Failure table artifacts
-        # Only the model testing job is concerned for this step
-        if: ${{ inputs.job == 'run_models_gpu' }}
         uses: actions/upload-artifact@v4
         with:
-          name: ci_results
-          path: ci_results
+          name: ci_results_${{ inputs.job }}
+          path: ci_results_${{ inputs.job }}

       - uses: actions/checkout@v4
       - uses: actions/download-artifact@v4
@@ -77,6 +75,7 @@ jobs:
           SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
           CI_EVENT: scheduled
           CI_SHA: ${{ github.sha }}
+          CI_TEST_JOB: ${{ inputs.job }}
           SETUP_STATUS: ${{ inputs.setup_status }}
       # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change
       # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`.
@@ -85,3 +84,11 @@ jobs:
           pip install slack_sdk
           pip show slack_sdk
           python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}"
+
+      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
+      - name: Failure table artifacts
+        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ci_results_${{ inputs.job }}
+          path: ci_results_${{ inputs.job }}
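
Aside (not part of the diff): with this change every report job uploads its own artifact named ci_results_<job> and exposes the job name to the notification scripts through the CI_TEST_JOB environment variable. A minimal sketch, assuming the fallback job name used here only for illustration, of how a script can derive its per-job results directory from that variable:

import os

# CI_TEST_JOB is set in slack-report.yml from `inputs.job`; the fallback value
# below is an assumption used only when running this sketch locally.
job_name = os.getenv("CI_TEST_JOB", "run_models_gpu")

# Artifact names use "_" instead of "/", e.g. `quantization/bnb` -> `quantization_bnb`,
# as the workflow comment above notes.
artifact_dir = f"ci_results_{job_name}"
os.makedirs(artifact_dir, exist_ok=True)
print(artifact_dir)  # e.g. "ci_results_run_models_gpu"
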
41 changes: 28 additions & 13 deletions utils/notification_service.py
Expand Up @@ -416,7 +416,7 @@ def per_model_sum(model_category_dict):
reports=sorted_model_reports,
to_truncate=False,
)
file_path = os.path.join(os.getcwd(), "ci_results/model_failures_report.txt")
file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/model_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(model_failures_report)

@@ -426,18 +426,18 @@ def per_model_sum(model_category_dict):
             reports=sorted_module_reports,
             to_truncate=False,
         )
-        file_path = os.path.join(os.getcwd(), "ci_results/module_failures_report.txt")
+        file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/module_failures_report.txt")
         with open(file_path, "w", encoding="UTF-8") as fp:
             fp.write(module_failures_report)

         if self.prev_ci_artifacts is not None:
-            # if the last run produces artifact named `ci_results`
+            # if the last run produces artifact named `ci_results_{job_name}`
             if (
-                "ci_results" in self.prev_ci_artifacts
-                and "model_failures_report.txt" in self.prev_ci_artifacts["ci_results"]
+                f"ci_results_{job_name}" in self.prev_ci_artifacts
+                and "model_failures_report.txt" in self.prev_ci_artifacts[f"ci_results_{job_name}"]
             ):
                 # Compute the difference of the previous/current (model failure) table
-                prev_model_failures = self.prev_ci_artifacts["ci_results"]["model_failures_report.txt"]
+                prev_model_failures = self.prev_ci_artifacts[f"ci_results_{job_name}"]["model_failures_report.txt"]
                 entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures)
                 if len(entries_changed) > 0:
                     # Save the complete difference
Expand All @@ -447,7 +447,7 @@ def per_model_sum(model_category_dict):
reports=entries_changed,
to_truncate=False,
)
file_path = os.path.join(os.getcwd(), "ci_results/changed_model_failures_report.txt")
file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/changed_model_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(diff_report)

@@ -643,8 +643,11 @@ def get_new_model_failure_blocks(self, with_header=True):
         sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0])

         prev_model_results = {}
-        if "ci_results" in self.prev_ci_artifacts and "model_results.json" in self.prev_ci_artifacts["ci_results"]:
-            prev_model_results = json.loads(self.prev_ci_artifacts["ci_results"]["model_results.json"])
+        if (
+            f"ci_results_{job_name}" in self.prev_ci_artifacts
+            and "model_results.json" in self.prev_ci_artifacts[f"ci_results_{job_name}"]
+        ):
+            prev_model_results = json.loads(self.prev_ci_artifacts[f"ci_results_{job_name}"]["model_results.json"])

         all_failure_lines = {}
         for job, job_result in sorted_dict:
@@ -1139,20 +1142,32 @@ def prepare_reports(title, header, reports, to_truncate=True):
         with open(os.path.join(directory, "selected_warnings.json")) as fp:
             selected_warnings = json.load(fp)

-    if not os.path.isdir(os.path.join(os.getcwd(), "ci_results")):
-        os.makedirs(os.path.join(os.getcwd(), "ci_results"))
+    if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")):
+        os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}"))

     # Only the model testing job is concerned: this condition is to avoid other jobs to upload the empty list as
     # results.
     if job_name == "run_models_gpu":
-        with open("ci_results/model_results.json", "w", encoding="UTF-8") as fp:
+        with open(f"ci_results_{job_name}/model_results.json", "w", encoding="UTF-8") as fp:
             json.dump(model_results, fp, indent=4, ensure_ascii=False)

+    # Must have the same keys as in `additional_results`.
+    # The values are used as the file names where to save the corresponding CI job results.
+    test_to_result_name = {
+        "PyTorch pipelines": "torch_pipeline",
+        "TensorFlow pipelines": "tf_pipeline",
+        "Examples directory": "example",
+        "Torch CUDA extension tests": "deepspeed",
+    }
+    for job, job_result in additional_results.items():
+        with open(f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", "w", encoding="UTF-8") as fp:
+            json.dump(job_result, fp, indent=4, ensure_ascii=False)
+
     prev_ci_artifacts = None
     target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main"
     if os.environ.get("CI_WORKFLOW_REF") == target_workflow:
         # Get the last previously completed CI's failure tables
-        artifact_names = ["ci_results"]
+        artifact_names = [f"ci_results_{job_name}"]
         output_dir = os.path.join(os.getcwd(), "previous_reports")
         os.makedirs(output_dir, exist_ok=True)
         prev_ci_artifacts = get_last_daily_ci_reports(
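
Aside (not from the commit): the `test_to_result_name` mapping above decides the file name used for each extra CI job's results. A small sketch, assuming the reporting job is `run_models_gpu` (an assumed value for illustration), of the files the new loop writes:

# Illustration only: file names produced by the mapping added in this commit.
test_to_result_name = {
    "PyTorch pipelines": "torch_pipeline",
    "TensorFlow pipelines": "tf_pipeline",
    "Examples directory": "example",
    "Torch CUDA extension tests": "deepspeed",
}
job_name = "run_models_gpu"  # assumed value of the reporting job
for job, short_name in test_to_result_name.items():
    print(f"{job:30} -> ci_results_{job_name}/{short_name}_results.json")
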
7 changes: 7 additions & 0 deletions utils/notification_service_quantization.py
@@ -242,6 +242,13 @@ def post_reply(self):
                         {"line": line, "trace": stacktraces.pop(0)}
                     )

+    job_name = os.getenv("CI_TEST_JOB")
+    if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")):
+        os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}"))
+
+    with open(f"ci_results_{job_name}/quantization_results.json", "w", encoding="UTF-8") as fp:
+        json.dump(quantization_results, fp, indent=4, ensure_ascii=False)
+
     message = QuantizationMessage(
         title,
         results=quantization_results,
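
Aside (not from the commit): once the quantization report has been dumped as shown above, a later step or run could read it back from the per-job directory. A minimal read-back sketch, assuming CI_TEST_JOB was `run_quantization_torch_gpu`:

import json
import os

# The fallback value is an assumption for illustration; in CI the variable is
# set by slack-report.yml to `inputs.job`.
job_name = os.getenv("CI_TEST_JOB", "run_quantization_torch_gpu")
path = os.path.join(f"ci_results_{job_name}", "quantization_results.json")
if os.path.isfile(path):
    with open(path, encoding="UTF-8") as fp:
        quantization_results = json.load(fp)
    print(sorted(quantization_results))
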
