diff --git a/setup.py b/setup.py
index b42639eea..4785728fc 100644
--- a/setup.py
+++ b/setup.py
@@ -160,11 +160,12 @@ def run(self):
         'matplotlib>=3.0.0',
         'natsort>=7.1.1',
         'networkx>=2.5',
-        'numpy>=1.19.2',
+        'numpy>=1.20.3',
         'omegaconf==2.0.6',
         'openpyxl>=3.0.7',
         'pandas>=1.1.5',
         'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4',
+        'python-dateutil>=2.8.2',
         'pyyaml>=5.3',
         'requests>=2.27.1',
         'seaborn>=0.11.2',
diff --git a/superbench/analyzer/file_handler.py b/superbench/analyzer/file_handler.py
index 2337435ea..f9f4065f9 100644
--- a/superbench/analyzer/file_handler.py
+++ b/superbench/analyzer/file_handler.py
@@ -39,7 +39,7 @@ def read_raw_data(raw_data_path):
         raw_data_df = raw_data_df.rename(raw_data_df['node'])
         raw_data_df = raw_data_df.drop(columns=['node'])
     except Exception as e:
-        logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data fomat - {}'.format(str(e)))
+        logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data format - {}'.format(str(e)))
     return raw_data_df
diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py
new file mode 100644
index 000000000..9f82d26f8
--- /dev/null
+++ b/superbench/analyzer/generate_baseline.py
@@ -0,0 +1,248 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""A module for baseline generation."""
+
+import argparse
+from copy import deepcopy
+import json
+import re
+
+from joblib import Parallel, delayed
+import pandas as pd
+
+from superbench.common.utils import logger
+from superbench.analyzer import file_handler
+from superbench.analyzer import data_analysis
+from superbench.analyzer import DataDiagnosis
+from superbench.analyzer import ResultSummary
+from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
+from superbench.benchmarks.context import Enum
+
+
+class BaselineAlgoType(Enum):
+    """The Enum class representing different baseline generation algorithms."""
+
+    MEAN = 'mean'
+    FIX_THRESHOLD = 'fix_threshold'
+
+
+class GenerateBaseline(DataDiagnosis):
+    """The class to generate baseline for raw data."""
+    def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseline, metric, rule_op):
+        """Fix threshold outlier detection algorithm.
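+
+        The algorithm iteratively recomputes the mean of the remaining data as the candidate
+        baseline and drops values that violate the diagnosis rule against it, until no more
+        values are dropped.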
+
+        Step 0: Put all data in the collection
+        Step 1: Regenerate the collection
+            Calculate the average number in the collection as the baseline
+            Remove all data which cannot pass the fix threshold based on the new baseline
+        Step 2: If no data has been removed from Step 1, go to Step 3; otherwise, go to Step 1
+        Step 3: Use the baseline and fix threshold for Outlier Detection
+
+        Args:
+            data_series (pd.Series): data of the metric
+            single_metric_with_baseline (dict): baseline of the single metric in 'metrics' in 2-layer dict format
+            metric (str): the name of the metric to execute the algorithm
+            rule_op (function): diagnosis rule op function
+
+        Returns:
+            tuple: the baseline of the metric, normal data of the metric
+        """
+        # if the metric already has a valid baseline, keep it and leave the data untouched
+        if single_metric_with_baseline['metrics'][metric] is not None \
+                and single_metric_with_baseline['metrics'][metric] != -1:
+            return single_metric_with_baseline['metrics'][metric], data_series
+        tmp_single_metric_with_baseline = deepcopy(single_metric_with_baseline)
+        tmp_single_metric_with_baseline['metrics'] = {}
+        clean = False
+        while clean is False:
+            clean = True
+            # use the mean of the remaining data as the candidate baseline
+            baseline_val = data_series.mean()
+            tmp_single_metric_with_baseline['metrics'][metric] = baseline_val
+            if baseline_val == 0:
+                break
+            for val in data_series.index:
+                data_row = pd.Series([data_series[val]], index=[metric])
+                details = []
+                categories = set()
+                summary_data_row = pd.Series(index=[metric], dtype=float)
+                violated_num = rule_op(data_row, tmp_single_metric_with_baseline, summary_data_row, details, categories)
+                # drop the value that violates the rule and recompute the mean in the next round
+                if violated_num:
+                    data_series = data_series.drop(val)
+                    clean = False
+        baseline = tmp_single_metric_with_baseline['metrics'][metric]
+        return baseline, data_series
+
+    def get_aggregate_data(self, raw_data_file, summary_rule_file):
+        r"""Aggregate raw data according to the summary rule file.
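+
+        Two aggregation modes are supported, selected by the 'aggregate' field of each rule: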
+        If the metric is aggregated by rank (:\d+), remove the rank info to generate the metric name and aggregate data.
+        If the metric is aggregated by a pattern in regex, aggregate the data and copy to all metrics which match this pattern.
+
+        Args:
+            raw_data_file (str): the file name of the raw data file
+            summary_rule_file (str): the file name of the summary rule file
+
+        Returns:
+            DataFrame: aggregated data
+        """
+        self.rs = ResultSummary()
+        rules = self.rs._preprocess(raw_data_file, summary_rule_file)
+        # parse rules for result summary
+        if not self.rs._parse_rules(rules):
+            return
+        aggregated_df = pd.DataFrame()
+        for rule in self.rs._sb_rules:
+            single_metric_rule = self.rs._sb_rules[rule]
+            metrics = list(single_metric_rule['metrics'].keys())
+            data_df_of_rule = self.rs._raw_data_df[metrics]
+            if self.rs._sb_rules[rule]['aggregate']:
+                # if aggregate is True, aggregate in ranks
+                if self.rs._sb_rules[rule]['aggregate'] is True:
+                    data_df_of_rule = data_analysis.aggregate(data_df_of_rule)
+                # if aggregate is not empty and is a pattern in regex, aggregate according to pattern
+                else:
+                    pattern = self.rs._sb_rules[rule]['aggregate']
+                    data_df_of_rule_with_short_name = data_analysis.aggregate(data_df_of_rule, pattern)
+                    data_df_of_rule = pd.DataFrame(columns=metrics)
+                    # restore the columns of data_df to full metric names
+                    for metric in metrics:
+                        short = ''
+                        match = re.search(pattern, metric)
+                        if match:
+                            metric_in_list = list(metric)
+                            for i in range(1, len(match.groups()) + 1):
+                                metric_in_list[match.start(i):match.end(i)] = '*'
+                            short = ''.join(metric_in_list)
+                        data_df_of_rule[metric] = data_df_of_rule_with_short_name[short]
+            aggregated_df = pd.concat([aggregated_df, data_df_of_rule], axis=1)
+        return aggregated_df
+
+    def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline):
+        """Generate the baseline in json format.
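+
+        For the 'mean' algorithm the baseline of a metric is simply the mean of its values;
+        for 'fix_threshold' the outliers are first removed by fix_threshold_outlier_detection
+        and the mean of the remaining values is used.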
+
+        Args:
+            algo (str): the algorithm to generate the baseline
+            aggregated_df (DataFrame): aggregated data
+            diagnosis_rule_file (str): the file name of the diagnosis rules which are used in the fix_threshold algorithm
+            baseline (dict): existing baseline of some metrics
+
+        Returns:
+            dict: baseline of the metrics defined in the diagnosis rule file for the fix_threshold algorithm,
+                or of the metrics defined in the summary rule file for mean
+        """
+        # re-organize metrics by benchmark names
+        self._benchmark_metrics_dict = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
+        if algo == 'mean':
+            mean_df = self._raw_data_df.mean()
+            for metric in self._raw_data_df.columns:
+                # skip metrics which already have an existing baseline
+                if metric in baseline:
+                    continue
+                baseline[metric] = mean_df[metric]
+        elif algo == 'fix_threshold':
+            # read diagnosis rules
+            rules = file_handler.read_rules(diagnosis_rule_file)
+            if not self._parse_rules_and_baseline(rules, baseline):
+                return baseline
+            else:
+                for rule in self._sb_rules:
+                    single_metric_rule = self._sb_rules[rule]
+                    metrics = list(single_metric_rule['metrics'].keys())
+                    function_name = self._sb_rules[rule]['function']
+                    rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name))
+                    outputs = Parallel(n_jobs=-1)(
+                        delayed(self.fix_threshold_outlier_detection)
+                        (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics
+                    )
+                    for index, out in enumerate(outputs):
+                        baseline[metrics[index]] = out[0]
+                        aggregated_df[metrics[index]] = out[1]
+        return baseline
+
+    def run(
+        self,
+        raw_data_file,
+        summary_rule_file,
+        output_dir,
+        algorithm='mean',
+        diagnosis_rule_file=None,
+        baseline_file=None,
+        digit=2
+    ):
+        """Export baseline to json file.
+
+        If diagnosis_rule_file is None, use mean of the data as baseline.
+        If diagnosis_rule_file is not None, use the rules in diagnosis_rule_file to execute fix_threshold algorithm.
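+        Existing values from baseline_file, if given, take precedence over newly generated ones.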
+
+        Args:
+            raw_data_file (str): the file name of the raw data file
+            summary_rule_file (str): the file name of the summary rule file
+            output_dir (str): the directory to save the baseline file
+            algorithm (str): the algorithm to generate the baseline
+            diagnosis_rule_file (str): the file name of the diagnosis rules which are used in the fix_threshold algorithm
+            baseline_file (str): the file name of the existing baseline file
+            digit (int): the number of digits after the decimal point
+        """
+        try:
+            # aggregate results from different devices
+            self._raw_data_df = self.get_aggregate_data(raw_data_file, summary_rule_file)
+            # read existing baseline
+            baseline = {}
+            if baseline_file:
+                baseline = file_handler.read_baseline(baseline_file)
+            # generate baseline according to the rules in diagnosis and fix threshold outlier detection method
+            baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline)
+            for metric in baseline:
+                val = baseline[metric]
+                if isinstance(self._raw_data_df[metric].iloc[0], float):
+                    baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
+                elif isinstance(self._raw_data_df[metric].iloc[0], int):
+                    baseline[metric] = int(val)
+                else:
+                    try:
+                        baseline[metric] = float(val)
+                    except Exception as e:
+                        logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e)))
+            baseline = json.dumps(baseline, indent=2, sort_keys=True)
+            # unquote the numeric values so that they are written as numbers, not strings
+            baseline = re.sub(r': "(\d+\.?\d*)"', r': \1', baseline)
+            with open(output_dir + '/baseline.json', mode='w') as f:
+                f.write(baseline)
+
+        except Exception as e:
+            logger.error('Analyzer: generate baseline failed, msg: {}'.format(str(e)))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--algo',
+        type=str,
+        default='fix_threshold',
+        required=False,
+        help='Algorithm to generate baseline, e.g., mean/fix_threshold.'
+    )
+    parser.add_argument(
+        '--input_dir',
+        type=str,
+        default=None,
+        required=False,
+        help='Input directory which stores the results-summary.jsonl.'
+    )
+    parser.add_argument(
+        '--diagnosis_rule_file', type=str, default=None, required=False, help='The input path of diagnosis rule file.'
+    )
+    parser.add_argument(
+        '--summary_rule_file', type=str, default=None, required=False, help='The input path of summary rule file.'
+    )
+    args = parser.parse_args()
+    folder = args.input_dir
+    if args.algo == 'mean':
+        # simply use mean, need result_summary rules to define how to aggregate the metrics.
+        print('Generate baseline using mean of the data.')
+        GenerateBaseline().run(folder + '/results-summary.jsonl', args.summary_rule_file, folder, 'mean')
+    elif args.algo == 'fix_threshold':
+        # use fix threshold method, need result_summary rules to define how to aggregate the metrics
+        # and diagnosis_rules.yaml to define the rules for the metrics.
+        print('Generate baseline using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.')
+        GenerateBaseline().run(
+            folder + '/results-summary.jsonl', args.summary_rule_file, folder, 'fix_threshold', args.diagnosis_rule_file
+        )
diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py
new file mode 100644
index 000000000..d64813366
--- /dev/null
+++ b/superbench/analyzer/generate_statistic.py
@@ -0,0 +1,200 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""A module for statistics generation."""
+
+import argparse
+import os
+
+import natsort as ns
+from joblib import Parallel, delayed
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from superbench.common.utils import logger
+from superbench.analyzer import file_handler
+from superbench.analyzer import data_analysis
+from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
+from generate_baseline import GenerateBaseline
+
+
+def plot_steps(data, title=None, save_path=None, show=True):
+    """Plot steps.
+
+    Args:
+        data (list): data to plot
+        title (str): title of the plot
+        save_path (str): path to save the plot
+        show (bool): whether to show the plot
+    """
+    plt.figure(figsize=(10, 6))
+    plt.scatter(range(0, len(data)), data)
+    if title:
+        plt.title(title)
+    plt.xlabel('Devices')
+    plt.ylabel('Value')
+    plt.ylim(0, max(data) * 1.1)
+    if save_path is not None:
+        # create the target directory if it does not exist yet
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+        plt.savefig(save_path)
+    if show:
+        plt.show()
+    plt.close()
+
+
+class GenerateStatistics(GenerateBaseline):
+    """GenerateStatistics class to generate statistics for raw data."""
+    def calculate_statistics(self, healthy_df):
+        """Calculate statistics for healthy data.
+
+        Args:
+            healthy_df (DataFrame): healthy data
+
+        Returns:
+            DataFrame: statistics for healthy data
+        """
+        stat_df = data_analysis.statistic(healthy_df)
+        stat_df.loc['(max-min)/max'] = (stat_df.loc['max'] - stat_df.loc['min']) / stat_df.loc['max']
+        stat_df = stat_df.drop(index=['1%', '5%', '95%', '99%'])
+        return stat_df
+
+    def output_excel(self, excel_file, stat_df, digit=2):
+        """Output excel file.
+
+        Args:
+            excel_file (str): excel file path
+            stat_df (DataFrame): statistics data
+            digit (int): the number of digits after the decimal point
+        """
+        try:
+            writer = pd.ExcelWriter(excel_file, engine='xlsxwriter')
+
+            for benchmark in self._benchmark_metrics_dict:
+                benchmark_df = stat_df[self._benchmark_metrics_dict[benchmark]]
+                # sort the metric columns in natural order
+                benchmark_df = benchmark_df.reindex(ns.natsorted(benchmark_df.columns), axis=1)
+                sheet_name = benchmark if len(benchmark) <= 30 else benchmark.split('-')[-1]
+                benchmark_df.to_excel(writer, sheet_name=sheet_name)
+                worksheet = writer.sheets[sheet_name]
+                # in the sheet the statistics are the rows and the metrics are the columns
+                col_start = 1
+                col_end = max(col_start, len(self._benchmark_metrics_dict[benchmark]))
+                statistics = list(benchmark_df.index)
+                row_index = statistics.index('(max-min)/max') + 1
+                workbook = writer.book
+                percent_format = workbook.add_format({'num_format': '0.00%'})
+                worksheet.conditional_format(
+                    row_index,
+                    col_start,
+                    row_index,
+                    col_end,  # first_row, first_col, last_row, last_col
+                    {
+                        'type': 'no_blanks',
+                        'format': percent_format
+                    }
+                )
+                num_format = f'0.{digit * "0"}'
+                for row_index in range(2, len(statistics)):
+                    round_format = workbook.add_format({'num_format': num_format})
+                    worksheet.conditional_format(
+                        row_index,
+                        col_start,
+                        row_index,
+                        col_end,  # first_row, first_col, last_row, last_col
+                        {
+                            'type': 'no_blanks',
+                            'format': round_format
+                        }
+                    )
+            writer.close()
+        except Exception as e:
+            logger.error('output excel failed: {}'.format(str(e)))
+
+    def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_file=None, digit=2, plot=False):
+        """Run the statistics generation.
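+
+        The fix threshold outlier detection from GenerateBaseline is reused to drop outlier
+        values first, so the statistics and the Excel output reflect healthy data only.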
+
+        Args:
+            raw_data_file (str): raw data file path
+            output_dir (str): output directory
+            diagnosis_rule_file (str): diagnosis rule file path
+            summary_rule_file (str): summary rule file path
+            digit (int): the number of digits after the decimal point
+            plot (bool): whether to plot the data
+        """
+        try:
+            # aggregate results from different devices
+            self._raw_data_df = self.get_aggregate_data(raw_data_file, summary_rule_file)
+            # re-organize metrics by benchmark names
+            self._benchmark_metrics_dict = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
+            # no existing baseline is used when generating statistics
+            baseline = {}
+            # read diagnosis rules
+            aggregated_df = self._raw_data_df.copy()
+            rules = file_handler.read_rules(diagnosis_rule_file)
+            if not self._parse_rules_and_baseline(rules, baseline):
+                logger.error('parse rule failed')
+                return None
+            else:
+                for rule in self._sb_rules:
+                    single_metric_rule = self._sb_rules[rule]
+                    metrics = list(single_metric_rule['metrics'].keys())
+                    function_name = self._sb_rules[rule]['function']
+                    rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name))
+                    outputs = Parallel(n_jobs=-1)(
+                        delayed(self.fix_threshold_outlier_detection)
+                        (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics
+                    )
+                    for index, out in enumerate(outputs):
+                        if not out:
+                            logger.error('Analyzer: filter healthy nodes failed')
+                            return
+                        aggregated_df[metrics[index]] = out[1]
+                        if plot:
+                            plot_steps(
+                                out[1].tolist(),
+                                title=metrics[index],
+                                save_path=os.path.join(
+                                    output_dir, 'figures', metrics[index].replace('/', '_').replace(':', '_') + '.png'
+                                ),
+                                show=False
+                            )
+            stat_df = self.calculate_statistics(aggregated_df)
+            excel_file = os.path.join(output_dir, 'benchmark_stability_stat.xlsx')
+            self.output_excel(excel_file, stat_df, digit)
+
+        except Exception as e:
+            logger.error('Analyzer: generate statistics failed, msg: {}'.format(str(e)))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '--input_dir',
+        type=str,
+        default='rawdata/',
+        required=False,
+        help='Input directory which stores the results-summary.jsonl.'
+    )
+    parser.add_argument(
+        '--diagnosis_rule_file',
+        type=str,
+        default='rules/diagnosis_rules.yaml',
+        required=False,
+        help='The input path of diagnosis rule file.'
+    )
+    parser.add_argument(
+        '--summary_rule_file',
+        type=str,
+        default='rules/analysis_rules.yaml',
+        required=False,
+        help='The input path of summary rule file.'
+    )
+    args = parser.parse_args()
+
+    # use fix threshold method, need result_summary rules to define how to aggregate the metrics
+    # and diagnosis_rules.yaml to define the rules for the metrics.
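+    # example invocation (paths are illustrative):
+    #   python generate_statistic.py --input_dir outputs/ --diagnosis_rule_file rules/diagnosis_rules.yaml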
+    GenerateStatistics().run(
+        args.input_dir + '/results-summary.jsonl', args.input_dir, args.diagnosis_rule_file, args.summary_rule_file
+    )
diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml
new file mode 100644
index 000000000..3b330b5bb
--- /dev/null
+++ b/superbench/analyzer/rules/aggregation_rules.yaml
@@ -0,0 +1,77 @@
+# SuperBench rules
+version: v0.5
+superbench:
+  rules:
+    model-train-benchmarks:
+      statistics:
+        - mean
+      categories: model
+      metrics:
+        - model-benchmarks:.*/.*/.*_train_throughput
+        - gpt_models/.*/.*_train_throughput
+        - lstm_models/pytorch-lstm/.*_train_throughput
+        - bert_models/pytorch-bert-.*/.*_train_throughput
+        - resnet_models/pytorch-resnet\d*/.*_train_throughput
+        - vgg_models/pytorch-vgg\d*/.*_train_throughput
+        - densenet_models/.*/.*_train_throughput
+    model-inference-benchmarks:
+      statistics:
+        - mean
+      categories: model
+      aggregate: True
+      metrics:
+        - model-benchmarks:.*/.*/.*_inference_throughput:\d+
+    micro-aggregation:
+      statistics: mean
+      categories: MICRO1
+      aggregate: True
+      metrics:
+        - gemm-flops:*.*/.*ops
+        - mem-bw:*.*/.*_bw
+        - kernel-launch:*.*/.*_time
+        - computation-communication-overlap:*.*/.*_time
+        - cublas-function:*.*/.*_time
+        - cudnn-function:*.*/.*_time
+        - ort-inference/.*_time.*
+        - tensorrt-inference/.*_time.*
+        - cublaslt-gemm:*.*/.*ops
+        - dist-inference/.*step_times.*
+    micro-nonaggregation:
+      statistics: mean
+      categories: MICRO2
+      metrics:
+        - nccl-bw:*.*/allreduce_.*_busbw
+        - rccl-bw:*.*/allreduce_.*_busbw
+        - sharding-matmul:*.*/.*_time
+        - matmul:*.*/.*_time
+        - gpu-burn/.*_pass
+        - gpu-burn/.*_abort
+        - cpu-memory-bw-latency/.*_bw
+        - cpu-memory-bw-latency/.*_lat
+    gpu-copy-bw:
+      statistics: mean
+      categories: DTOH
+      metrics:
+        - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        #- gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/gpu(.*)_to_gpu(.*)_by_.*_bw
+    gpu-copy-bw1:
+      statistics: mean
+      categories: DTOH
+      metrics:
+        #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        - gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/gpu(.*)_to_cpu_by_sm_under_numa.*_bw
+    gpu-copy-bw2:
+      statistics: mean
+      categories: HTOD
+      metrics:
+        #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        - gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+    disk:
+      statistics: mean
+      categories: DISK
+      aggregate: disk-benchmark/nvme(\d+n1)_.*_iops
+      metrics:
+        - disk-benchmark/nvme(\d+n1)_.*_iops
diff --git a/superbench/analyzer/rules/analysis_rules.yaml b/superbench/analyzer/rules/analysis_rules.yaml
new file mode 100644
index 000000000..23f8e4c2a
--- /dev/null
+++ b/superbench/analyzer/rules/analysis_rules.yaml
@@ -0,0 +1,100 @@
+# SuperBench rules
+version: v0.5
+superbench:
+  rules:
+    model-benchmarks:
+      statistics:
+        - mean
+      categories: models
+      metrics:
+        - model-benchmarks:.*/.*/.*_train_throughput
+        - gpt_models/.*/.*_train_throughput
+        - lstm_models/pytorch-lstm/.*_train_throughput
+        - bert_models/pytorch-bert-.*/.*_train_throughput
+        - resnet_models/pytorch-resnet\d*/.*_train_throughput
+        - vgg_models/pytorch-vgg\d*/.*_train_throughput
+        - densenet_models/.*/.*_train_throughput
+    micro-flops:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+        - cublaslt-gemm/fp8.*_0_8192_8192_8192_flops
+        - gemm-flops:*.*/.*op
+    # micro-cublasltflops:
+    #   statistics: mean
+    #   categories: MICRO
+    #   aggregate: True
+    #   metrics:
+    #     - cublaslt-gemm/fp.*_.*_flops
+    # micro-cublasltbatchflops:
+    #   statistics: mean
+    #   categories: MICRO
+    #   aggregate: True
+    #   metrics:
+    #     - cublaslt-gemm:bmm/fp.*_.*_flops
+    micro-aggregation-with-aggregate:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+        - kernel-launch:*.*/.*_time
+        - dist-inference/.*step_times.*
+        - mem-bw:*.*/.*_bw
+        - computation-communication-overlap:*.*/.*_time
+    micro-aggregation-wo-aggregate:
+      statistics: mean
+      categories: MICRO
+      aggregate: False
+      metrics:
+        - nccl-bw:*.*/allreduce_.*_busbw
+        - sharding-matmul:*.*/.*_time
+        - matmul:*.*/.*_time
+        #- gpu-burn:*.*/.*_pass
+        - cpu-memory-bw-latency/.*_bw
+        - cpu-memory-bw-latency/.*_lat
+        #- cublas-function:*.*/.*_time
+        #- cudnn-function:*.*/.*_time
+        #- ort-inference/.*_time.*
+        #- tensorrt-inference/.*_time.*
+    gpu-copy-bw:
+      statistics: mean
+      categories: DTOH
+      metrics:
+        - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        #- gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/gpu(.*)_to_gpu(.*)_by_.*_bw
+    # gpu-copy-bw1:
+    #   statistics: mean
+    #   categories: DTOH
+    #   metrics:
+    #     #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+    #     - gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+    #   aggregate: gpu-copy-bw:perf/gpu(.*)_to_cpu_by_sm_under_numa.*_bw
+    # gpu-copy-bw2:
+    #   statistics: mean
+    #   categories: HTOD
+    #   metrics:
+    #     #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+    #     - gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+    #   aggregate: gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+    # # ib-loopback1:
+    # #   statistics: mean
+    # #   categories: RDMA
+    # #   metrics:
+    # #     #- ib-loopback/ib_write_8388608_ib.*_bw
+    # #     - ib-loopback/ib_write_.*_ib.*_bw
+    # #   aggregate: ib-loopback/ib_write_.*_ib(.*)_bw
+    # ib-loopback2:
+    #   statistics: mean
+    #   categories: RDMA
+    #   metrics:
+    #     #- ib-loopback/ib_write_8388608_ib.*_bw
+    #     - ib-loopback/ib_write_bw_.*:\d+
+    #   #aggregate: True
+    # disk:
+    #   statistics: mean
+    #   categories: DISK
+    #   aggregate: disk-benchmark/nvme(\d+n1)_.*_iops
+    #   metrics:
+    #     - disk-benchmark/nvme(\d+n1)_.*_iops
diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml
new file mode 100644
index 000000000..cbe7994bd
--- /dev/null
+++ b/superbench/analyzer/rules/diagnosis_rules.yaml
@@ -0,0 +1,237 @@
+# SuperBench rules
+version: v0.5
+superbench:
+  rules:
+    # Rule 0: If one test fails, label it as Not acceptable
+    failure_rule:
+      function: value
+      criteria: 'lambda x:x>0'
+      categories: FailedTest
+      metrics:
+        - kernel-launch:*.*/return_code
+        - mem-bw:*.*/return_code
+        - gemm-flops:*.*/return_code
+        - ib-loopback:*.*/return_code
+        - nccl-bw:*.*/return_code
+        - gpt_models/.*/return_code
+        - lstm_models/.*/return_code
+        - bert_models/.*/return_code
+        - resnet_models/.*/return_code
+        - vgg_models/.*/return_code
+        - densenet_models/.*/return_code
+        - model-benchmarks:.*/return_code:.*
+        #- cudnn-function:*.*/return_code  # 06:14
+        - cublaslt-gemm:*.*/return_code
+        - cublas-function:*.*/return_code  # 05:52
+        - matmul:*.*/return_code  # 00:25
+        - gpu-burn:*.*/return_code  # 15:08
+        # microbenchmark - communication
+        - cpu-memory-bw-latency:*.*/return_code  # 05:38
+        - gpu-copy-bw:*.*/return_code  # 08:44
+        - computation-communication-overlap:*.*/return_code  # 06:30
+        - sharding-matmul:*.*/return_code  # 00:24
+        # microbenchmark - storage
+        #- disk-benchmark  # 18:47
+        # model benchmark - inference
+        #- ort-inference:*.*/return_code  # 03:43
+        ##tensorrt-inference  # 02:03:33
+        - dist-inference:*.*/return_code
+    kernel_launch_rule_outlier:
+      function: value
+      criteria: 'lambda x:x<0.001'
+      categories: INVESTIGATING
+      metrics:
+        - kernel-launch:*.*/.*_time
+    kernel_launch_rule:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: INVESTIGATING
+      metrics:
+        - kernel-launch:*.*/.*_time
+    tensorrt_inference:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: TensorRT
+      metrics:
+        - tensorrt-inference/.*_time.*
+    # Rule 1: If TensorCore test suffers > 5% downgrade, label it as Not acceptable
+    tensor_core_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: TensorCore
+      metrics:
+        - gemm-flops:*.*/.*ops
+    cublaslt_gemm_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: CUBLASLT
+      metrics:
+        - cublaslt-gemm:*.*/.*ops
+    cublas:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: CUBLAS
+      metrics:
+        - cublas-function:*.*/.*_time
+    cudnn:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: CUDNN
+      metrics:
+        - cudnn-function:*.*/.*_time
+    matmul:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: MATMUL
+      metrics:
+        - matmul:*.*/.*_time
+        - sharding-matmul:*.*/.*_time
+    cpu1:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: CPU
+      metrics:
+        - cpu-memory-bw-latency/.*_lat
+    gpu_burn:
+      function: value
+      criteria: 'lambda x:x!=1'
+      categories: GPUBURN
+      metrics:
+        - gpu-burn:*.*/.*_pass
+    cpu:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: CPU
+      metrics:
+        - cpu-memory-bw-latency/.*_bw
+    # Rule 2: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as Not acceptable
+    mem_bw_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Mem
+      metrics:
+        - mem-bw:*.*/h2d_bw
+        - mem-bw:*.*/d2h_bw
+    gpu-copy:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: GPUCOPY
+      metrics:
+        - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+    # Rule 3: If ib-loopback test with 8M suffers > 5% downgrade, label it as Not acceptable
+    # rdma_loopback_rule:
+    #   function: variance
+    #   criteria: 'lambda x:x<-0.05'
+    #   categories: RDMA
+    #   metrics:
+    #     - ib-loopback:*.*/ib_write_bw_8388608
+    #     #- ib-loopback:*.*/ib_write_bw_.*
+    # Rule 4: If nccl-bw:default with 8GB suffers > 5% downgrade, label it as Not acceptable
+    nccl_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: NCCL
+      metrics:
+        - nccl-bw:*.*/allreduce_.*_busbw
+    overlap_rules:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: OVERLAP
+      metrics:
+        - computation-communication-overlap:*.*/.*_time
+    investigating_rules:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: DIST_INFERENCE
+      metrics:
+        - dist-inference/.*step_times
+    disk_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: DISK
+      metrics:
+        - disk-benchmark/nvme(\d+n1)_.*_iops
+    # Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable
+    model_throughput_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Model
+      metrics:
+        - gpt_models/.*/.*_train_throughput
+        - lstm_models/.*lstm/.*_train_throughput
+        - bert_models/.*bert-.*/.*_train_throughput
+        - model-benchmarks:.*/.*bert.*/.*_train_throughput
+        - model-benchmarks:.*/.*gpt.*/.*_train_throughput
+        - model-benchmarks:.*/.*lstm.*/.*_train_throughput
+    # Rule 8: If 2+ CNN models suffer > 5% downgrade, label it as Not acceptable
+    resnet_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
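+      # store the result of this rule instead of labeling it directly, so the
+      # multi_rules criteria below can combine it with the other CNN rules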
+      store: true
+      categories: CNN
+      metrics:
+        - resnet_models/.*resnet\d*/.*_train_throughput
+        - model-benchmarks:.*/.*resnet.*/.*_train_throughput
+    vgg_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - vgg_models/.*vgg\d*/.*_train_throughput
+        - model-benchmarks:.*/.*vgg.*/.*_train_throughput
+    densenet_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - densenet_models/.*densenet\d*/.*_train_throughput
+        - model-benchmarks:.*/.*densenet.*/.*_train_throughput
+    cnn_throughput_rule:
+      categories: CNN
+      function: multi_rules
+      criteria: 'lambda label:True if label["resnet_throughput"]+label["densenet_throughput"]>=2 else False'
+    vgg_throughput_rule:
+      categories: VGG
+      function: multi_rules
+      criteria: 'lambda label:True if label["vgg_throughput"]>=2 else False'
+    model_inference_throughput_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Model
+      metrics:
+        - model-benchmarks:.*/.*bert.*/.*_inference_throughput
+        - model-benchmarks:.*/.*gpt.*/.*_inference_throughput
+        - model-benchmarks:.*/.*lstm.*/.*_inference_throughput
+    # Same as Rule 8, applied to inference: if 2+ CNN models suffer > 5% downgrade, label it as Not acceptable
+    resnet_inference_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - model-benchmarks:.*/.*resnet.*/.*_inference_throughput
+    vgg_inference_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - model-benchmarks:.*/.*vgg.*/.*_inference_throughput
+    densenet_inference_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - model-benchmarks:.*/.*densenet.*/.*_inference_throughput
+    cnn_inference_throughput_rule:
+      categories: CNN
+      function: multi_rules
+      criteria: 'lambda label:True if label["resnet_inference_throughput"]+label["densenet_inference_throughput"]>=2 else False'
+    vgg_inference_throughput_rule:
+      categories: VGG
+      function: multi_rules
+      criteria: 'lambda label:True if label["vgg_inference_throughput"]>=2 else False'