Bug Fix: Data Diagnosis - Fix failure test bug and pandas warnings in data diagnosis #638

Merged 5 commits on Aug 16, 2024
2 changes: 1 addition & 1 deletion superbench/analyzer/data_analysis.py
@@ -243,7 +243,7 @@ def aggregate(raw_data_df, pattern=None):
             match = re.search(pattern, metric)
             if match:
                 metric_in_list = list(metric)
-                for i in range(1, len(match.groups()) + 1):
+                for i in range(len(match.groups()), 0, -1):
                     metric_in_list[match.start(i):match.end(i)] = '*'
                 short = ''.join(metric_in_list)
         if short not in metric_store:
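
The reversed loop matters because replacing a multi-character capture group with a single '*' shortens the character list, which shifts the recorded start/end offsets of every later group; walking the groups from last to first leaves the offsets that are still needed untouched. A minimal sketch of the effect (the pattern and metric name below are made up, not taken from SuperBench):

import re

# Hypothetical pattern and metric name, only to show the offset shift.
pattern = r'gpu_(\d+)/bandwidth_(\d+)'
metric = 'gpu_12/bandwidth_34'

match = re.search(pattern, metric)
metric_in_list = list(metric)
# Go from the last group to the first: earlier spans stay valid because
# nothing before them has been spliced out yet.
for i in range(len(match.groups()), 0, -1):
    metric_in_list[match.start(i):match.end(i)] = '*'
print(''.join(metric_in_list))    # gpu_*/bandwidth_*

With the old forward loop, the first splice shortens the list by one, the second group's recorded span lands one character late, and the result comes out as 'gpu_*/bandwidth_3*' instead.
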
11 changes: 5 additions & 6 deletions superbench/analyzer/data_diagnosis.py
@@ -262,9 +262,8 @@ def output_all_nodes_results(self, raw_data_df, data_not_accept_df):
             all_data_df = data_not_accept_df[[
                 append_columns[index]
             ]].merge(all_data_df, left_index=True, right_index=True, how='right')
-        all_data_df['Accept'] = all_data_df['Accept'].replace(np.nan, True)
-        all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].replace(np.nan, 0)
-        all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].astype(int)
+        all_data_df['Accept'] = all_data_df['Accept'].replace(np.nan, 1).astype('bool')
+        all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].replace(np.nan, 0).astype('int')

         return all_data_df
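
After the how='right' merge, nodes without any issue end up with NaN in the appended columns. The old lines left the final dtype to pandas' implicit downcasting inside replace(), which newer pandas releases flag as deprecated; filling first and then casting explicitly keeps the column dtypes deterministic. A small self-contained sketch of the same pattern (the frame and index values are hypothetical, only the column names come from the diff):

import numpy as np
import pandas as pd

# Hypothetical stand-in for all_data_df after the right merge: nodes with no
# issues carry NaN in the appended columns.
all_data_df = pd.DataFrame(
    {
        'Accept': [False, np.nan, np.nan],
        'Number Of Issues': [2, np.nan, np.nan],
    },
    index=['node-0', 'node-1', 'node-2'],
)

# Fill the gaps, then cast explicitly instead of letting replace() downcast
# the columns on its own.
all_data_df['Accept'] = all_data_df['Accept'].replace(np.nan, 1).astype('bool')
all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].replace(np.nan, 0).astype('int')
print(all_data_df.dtypes)    # Accept -> bool, Number Of Issues -> integer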

@@ -296,7 +295,7 @@ def output_diagnosis_in_jsonl(self, data_not_accept_df, output_path):
             data_not_accept_df (DataFrame): the DataFrame to output
             output_path (str): the path of output jsonl file
         """
-        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').fillna(self.na)
+        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').infer_objects().fillna(self.na)
         p = Path(output_path)
         try:
             data_not_accept_json = data_not_accept_df.to_json(orient='index')
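
convert_dtypes() gives the columns nullable extension dtypes, and filling the object-cast frame directly is the step that newer pandas flags with a FutureWarning about downcasting object dtype arrays in fillna; inserting infer_objects() lets pandas settle concrete dtypes up front so the placeholder fill no longer depends on that implicit downcasting. A rough sketch under those assumptions (the frame is a toy, and na stands in for the analyzer's self.na placeholder):

import numpy as np
import pandas as pd

# Toy frame standing in for data_not_accept_df; the metric name is made up
# and 'na' stands in for self.na.
na = -1
df = pd.DataFrame({
    'kernel-launch/overhead': [0.0052, np.nan],
    'Category': ['KernelLaunch', np.nan],
})

# convert_dtypes(): nullable dtypes; astype('object'): plain object columns;
# infer_objects(): let pandas pick concrete dtypes again where it can, so the
# final fillna() does not have to downcast object arrays itself.
out = df.convert_dtypes().astype('object').infer_objects().fillna(na)
print(out.dtypes)

The same chained call appears in the two hunks below, so the output paths for jsonl, json and markdown all fill the placeholder the same way.
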
@@ -327,7 +326,7 @@ def output_diagnosis_in_json(self, data_not_accept_df, output_path):
             data_not_accept_df (DataFrame): the DataFrame to output
             output_path (str): the path of output jsonl file
         """
-        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').fillna(self.na)
+        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').infer_objects().fillna(self.na)
         data_not_accept_df = data_not_accept_df.reset_index()
         data_not_accept_df = data_not_accept_df.rename(
             columns={
@@ -378,7 +377,7 @@ def generate_md_lines(self, data_not_accept_df, rules, round):
                 data_not_accept_df = data_analysis.round_significant_decimal_places(
                     data_not_accept_df, round, [metric]
                 )
-        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').fillna(self.na)
+        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').infer_objects().fillna(self.na)
         lines = file_handler.generate_md_table(data_not_accept_df, header)
         return lines

11 changes: 7 additions & 4 deletions superbench/analyzer/diagnosis_rule_op.py
@@ -239,19 +239,22 @@ def failure_check(data_row, rule, summary_data_row, details, categories, raw_rule):
         violated_metric_num = 0
         for metric_regex in raw_rule['metrics']:
             match = False
+            violate = False
             for metric in rule['metrics']:
                 if re.search(metric_regex, metric):
                     match = True
                     # metric not in raw_data or the value is none, miss test
-                    if metric not in data_row or pd.isna(data_row[metric]):
-                        violated_metric_num += 1
-                        break
+                    if RuleOp.miss_test(metric, rule, data_row, details, categories):
+                        violate = True
             # metric_regex written in rules is not matched by any metric, miss test
             if not match:
-                violated_metric_num += 1
+                violate = True
                 RuleOp.add_categories_and_details(metric_regex + '_miss', rule['categories'], details, categories)
+            if violate:
+                violated_metric_num += 1
         # return code != 0, failed test
         violated_metric_num += RuleOp.value(data_row, rule, summary_data_row, details, categories)
+        details[:] = list(dict.fromkeys(details))    # remove duplicate details
         return violated_metric_num
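
A simplified, standalone sketch of the counting scheme after this change (hypothetical rule patterns, metric names and values; the real method also records categories and details through the RuleOp helpers, with RuleOp.miss_test flagging a matched metric whose value is missing):

import re
import pandas as pd

def count_violations(data_row, metric_regexes, metrics):
    violated_metric_num = 0
    for metric_regex in metric_regexes:
        match = False
        violate = False
        for metric in metrics:
            if re.search(metric_regex, metric):
                match = True
                # a matched metric with a missing value is a miss, but keep
                # scanning so every missing metric can be reported
                if metric not in data_row or pd.isna(data_row[metric]):
                    violate = True
        if not match:
            # the pattern matched no metric at all: also a miss
            violate = True
        if violate:
            # at most one violation per rule pattern
            violated_metric_num += 1
    return violated_metric_num

data_row = {'gpu_0_bw': 23.4, 'gpu_1_bw': float('nan')}
print(count_violations(data_row, [r'gpu_\d+_bw', r'cpu_\d+_bw'], ['gpu_0_bw', 'gpu_1_bw']))    # 2

The old version broke out of the inner loop at the first missing metric, so later metrics matched by the same pattern were never checked and their misses never showed up in the details; the new per-pattern flag keeps the count stable while letting every miss be recorded, and the final dict.fromkeys pass drops the duplicates that can introduce.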

