From d4f91d243bcc74b42555a8ad5cc12a69fcbac072 Mon Sep 17 00:00:00 2001 From: 454314380 <454314380@qq.com> Date: Mon, 30 May 2022 16:27:42 +0800 Subject: [PATCH 01/12] Generate baseline using mean of fix_threshold algorithm --- superbench/analyzer/generate_baseline.py | 205 ++++++++++++++++++ .../analyzer/rules/aggregation_rules.yaml | 102 +++++++++ .../analyzer/rules/diagnosis_rules.yaml | 106 +++++++++ 3 files changed, 413 insertions(+) create mode 100644 superbench/analyzer/generate_baseline.py create mode 100644 superbench/analyzer/rules/aggregation_rules.yaml create mode 100644 superbench/analyzer/rules/diagnosis_rules.yaml diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py new file mode 100644 index 000000000..3a5b730f7 --- /dev/null +++ b/superbench/analyzer/generate_baseline.py @@ -0,0 +1,205 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""A module for baseline generation.""" + +import argparse +import json +from pathlib import Path +import re + + +from joblib import Parallel, delayed +import pandas as pd + +from superbench.common.utils import logger +from superbench.analyzer import file_handler +from superbench.analyzer import data_analysis +from superbench.analyzer import DataDiagnosis +from superbench.analyzer import ResultSummary +from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType +from superbench.benchmarks.context import Enum + + +class BaselineAlgoType(Enum): + """The Enum class representing different baseline generation algorithm.""" + + MEAN = 'mean' + FIX_THRESHOLD = 'fix_threshold' + + +class GenerateBaseline(DataDiagnosis): + def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseline, metric, rule_op): + """Fix threshold outlier detection algorithm. + + Step 0: Put all data in the collection + Step 1: Regenerate the collection + Calculate the average number in the collection as the baseline + Remove all data which cannot pass the fix threshold based on the new baseline + Step 2: If no data has been removed from Step 1, go to Step 3; otherwise, go to Step 1 + Step 3: Use the baseline and fix threshold for Outlier Detection + + Args: + data_series (pd.Series): data the the metric + single_metric_with_baseline (dict): baseline of the single metric in 'metrics' in 2-layer dict format + metric (str): the name of the metric to execute the algorithm + rule_op (function): diagnosis rule op function + + Returns: + number: the baseline of the metric + """ + if single_metric_with_baseline['metrics'][metric] != -1: + return single_metric_with_baseline['metrics'][metric] + single_metric_with_baseline['metrics'] = {} + clean = False + while clean is False: + clean = True + baseline_val = data_series.mean() + for val in data_series.index: + single_metric_with_baseline['metrics'][metric] = baseline_val + if baseline_val == 0: + break + data_row = pd.Series([data_series[val]], index=[metric]) + details = [] + categories = set() + summary_data_row = pd.Series(index=[metric], dtype=float) + violated_num = rule_op(data_row, single_metric_with_baseline, summary_data_row, details, categories) + if violated_num: + data_series = data_series.drop(val) + clean = False + baseline = single_metric_with_baseline['metrics'][metric] + return baseline + + def get_aggregate_data(self, raw_data_file, summary_rule_file): + """Aggregate raw data according to the summary rule file. + + If the metric is aggregated by rank (:\d+), remove the rank info to generate the metric name and aggregate data + If the metric is aggregated by pattern in regex, aggregate the data and copy to all metrics which match this pattern + + Args: + raw_data_file (str): the file name of the raw data file + summary_rule_file (str): the file name of the summary rule file + + Returns: + DataFrame: aggregated data + """ + self.rs = ResultSummary() + rules = self.rs._preprocess(raw_data_file, summary_rule_file) + # parse rules for result summary + if not self.rs._parse_rules(rules): + return + aggregated_df = pd.DataFrame() + for rule in self.rs._sb_rules: + single_metric_rule = self.rs._sb_rules[rule] + metrics = list(single_metric_rule['metrics'].keys()) + data_df_of_rule = self.rs._raw_data_df[metrics] + if self.rs._sb_rules[rule]['aggregate']: + # if aggregate is True, aggregate in ranks + if self.rs._sb_rules[rule]['aggregate'] is True: + data_df_of_rule = data_analysis.aggregate(data_df_of_rule) + # if aggregate is not empty and is a pattern in regex, aggregate according to pattern + else: + pattern = self.rs._sb_rules[rule]['aggregate'] + data_df_of_rule_with_short_name = data_analysis.aggregate(data_df_of_rule, pattern) + data_df_of_rule = pd.DataFrame(columns=metrics) + # restore the columns of data_fd to full metric names + for metric in metrics: + short = '' + match = re.search(pattern, metric) + if match: + metric_in_list = list(metric) + for i in range(1, len(match.groups()) + 1): + metric_in_list[match.start(i):match.end(i)] = '*' + short = ''.join(metric_in_list) + data_df_of_rule[metric] = data_df_of_rule_with_short_name[short] + aggregated_df = pd.concat([aggregated_df, data_df_of_rule], axis=1) + return aggregated_df + + def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline): + """Generate the baseline in json format. + + Args: + algo (str): the algorithm to generate the baseline + aggregated_df (DataFrame): aggregated data + diagnosis_rule_file (str): the file name of the diagnosis rules which used in fix_threshold algorithm + baseline (dict): existing baseline of some metrics + + Returns: + dict: baseline of metrics defined in diagnosis_rule_files for fix_threshold algorithm or defined in rule_summary_files for mean + """ + baseline = {} + # re-organize metrics by benchmark names + self._benchmark_metrics_dict = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns)) + if algo == 'mean': + mean_df = self._raw_data_df.mean() + for metric in self._raw_data_df.columns: + if metric in baseline: + return baseline[metric] + baseline[metric] = mean_df[metric] + elif algo == 'fix_threshold': + # read diagnosis rules + rules = file_handler.read_rules(diagnosis_rule_file) + if not self._parse_rules_and_baseline(rules, baseline): + return baseline + else: + for rule in self._sb_rules: + single_metric_rule = self._sb_rules[rule] + metrics = list(single_metric_rule['metrics'].keys()) + function_name = self._sb_rules[rule]['function'] + rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name)) + outputs = Parallel(n_jobs=-1)( + delayed(self.fix_threshold_outlier_detection)( + aggregated_df[metric], single_metric_rule, metric, rule_op) + for metric in metrics) + for index, out in enumerate(outputs): + baseline[metrics[index]] = out + return baseline + + def run(self, raw_data_file, summary_rule_file, output_dir, algorithm='mean', diagnosis_rule_file=None, baseline_file=None): + """Export baseline to json file. + + If diagnosis_rule_file is None, use mean of the data as baseline. + If diagnosis_rule_file is not None, use the rules in diagnosis_rule_file to execute fix_threshold algorithm. + + Args: + raw_data_df (DataFrame): raw data + output_dir (str): the directory of output file + """ + try: + # aggregate results from different devices + self._raw_data_df = self.get_aggregate_data(raw_data_file, summary_rule_file) + # read existing baseline + baseline = {} + if baseline_file: + baseline = file_handler.read_baseline() + # generate baseline accordint to rules in diagnosis and fix threshold outlier detection method + baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline) + # output baseline to json file + with open(output_dir+'/baseline.json', mode='w') as f: + json.dump(baseline, f, indent=2) + + except Exception as e: + logger.error('Analyzer: generate baseline failed, msg: {}'.format(str(e))) + + +if __name__ == '__main__': + global args + parser = argparse.ArgumentParser() + parser.add_argument( + '--algo', type=str, default='mean', required=False, help='Algorithm to generate baseline, eg, mean/fix_threshold.' + ) + parser.add_argument( + '--input_dir', type=str, default=None, required=False, help='Input directory which stores the results-summary.jsonl.' + ) + args = parser.parse_args() + folder=args.input_dir + #folder = '/Users/jiangyt/Documents/000-workspace/raw-data/ndv4/0.5-ndv41' + if args.algo=='mean': + # simply use mean, need result_summary rules to define how to aggregate the metrics. + print('Generate baseine using mean of the data.') + GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml', folder) + elif args.algo=='fix_threshold': + # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics. + print('Generate baseine using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.') + GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml', + folder, 'fix_threshold', 'rules/diagnosis_rules.yaml') diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml new file mode 100644 index 000000000..c5cb1708d --- /dev/null +++ b/superbench/analyzer/rules/aggregation_rules.yaml @@ -0,0 +1,102 @@ +# SuperBench rules +version: v0.5 +superbench: + rules: + model-benchmarks-FP32: + statistics: + - mean + categories: model:FP32 + metrics: + - model-benchmarks:vgg:float/.*/fp32_train_throughput + - model-benchmarks:gpt2-large:float/.*/fp32_train_throughput + - model-benchmarks:bert-base:float/.*/fp32_train_throughput + - model-benchmarks:bert-large:float/.*/fp32_train_throughput + - model-benchmarks:lstm:float/.*/fp32_train_throughput + - model-benchmarks:resnet50:float/.*/fp32_train_throughput + - model-benchmarks:resnet101:float/.*/fp32_train_throughput + - model-benchmarks:resnet152:float/.*/fp32_train_throughput + - model-benchmarks:densenet169:float/.*/fp32_train_throughput + - model-benchmarks:densenet201:float/.*/fp32_train_throughput + - model-benchmarks:LongRun_BERTL_models/pytorch-bert-large/fp32_train_throughput + - model-benchmarks:vgg:float/.*/fp32_train_throughput + - gpt_models/.*/fp32_train_throughput + - lstm_models/pytorch-lstm/fp32_train_throughput + - bert_models/pytorch-bert-.*/fp32_train_throughput + - resnet_models/pytorch-resnet\d*/fp32_train_throughput + - vgg_models/pytorch-vgg\d*/fp32_train_throughput + - densenet_models/.*/fp32_train_throughput + model-benchmarks-FP16: + statistics: mean + categories: model:FP16 + metrics: + - model-benchmarks:gpt2-large:half/.*/fp16_train_throughput + - model-benchmarks:bert-base:half/.*/fp16_train_throughput + - model-benchmarks:bert-large:half/.*/fp16_train_throughput + - model-benchmarks:lstm:half/.*/fp16_train_throughput + - model-benchmarks:resnet50:half/.*/fp16_train_throughput + - model-benchmarks:resnet101:half/.*/fp16_train_throughput + - model-benchmarks:resnet152:half/.*/fp16_train_throughput + - model-benchmarks:densenet169:half/.*/fp16_train_throughput + - model-benchmarks:densenet201:half/.*/fp16_train_throughput + - model-benchmarks:LongRun_BERTL_models/pytorch-bert-large/fp16_train_throughput + - model-benchmarks:vgg:half/.*/fp16_train_throughput + - gpt_models/.*/fp16_train_throughput + - lstm_models/pytorch-lstm/fp16_train_throughput + - bert_models/pytorch-bert-.*/fp16_train_throughput + - resnet_models/pytorch-resnet\d*/fp16_train_throughput + - vgg_models/pytorch-vgg\d*/fp16_train_throughput + - densenet_models/.*/fp16_train_throughput + micro-aggregation: + statistics: mean + categories: MICRO + aggregate: True + metrics: + - gemm-flops:*.*/.*ops + - mem-bw:*.*/.*_bw + - kernel-launch:*.*/.*_time + - computation-communication-overlap:*.*/.*_time + - cublas-function:*.*/.*_time + - cudnn-function:*.*/.*_time + micro-nonaggregation: + statistics: mean + categories: MICRO + metrics: + - nccl-bw:*.*/allreduce_.*_busbw + - rccl-bw:*.*/allreduce_.*_busbw + - sharding-matmul:*.*/.*_time + - matmul:*.*/.*_time + - gpu-burn/.* + - cpu-memory-bw-latency/.*_bw + - cpu-memory-bw-latency/.*_lat + ib-loopback: + statistics: mean + categories: RDMA + metrics: + #- ib-loopback/ib_write_8388608_ib.*_bw + - ib-loopback/ib_write_.*_ib.*_bw + aggregate: ib-loopback/ib_write_.*_ib(.*)_bw + disk: + statistics: mean + categories: DISK + aggregate: disk-benchmark/nvme(.*)_[seq|rand]_.* + metrics: + - disk-benchmark/nvme.* + # ib-loopback1: + # statistics: mean + # categories: RDMA1 + # metrics: + # - ib-loopback/ib_write_8388608_ib[0|1|2|3]_bw:([0|1]) + # - ib-loopback/ib_write_8388608_ib[4|5|6|7]_bw:([2|3]) + # aggregate: ib-loopback/ib_write_.*_ib.*_bw:([0|1|2|3]) + # ib-loopback2: + # statistics: mean + # categories: RDMA2 + # metrics: + # - ib-loopback/ib_write_8388608_ib[4|5|6|7]_bw:([2|3]) + # aggregate: ib-loopback/ib_write_.*_ib[4|5|6|7]_bw:([2|3]) + # statistics: mean + # categories: RCCL:4nodes + # metrics: + # - rccl-bw:4nodealltoall/alltoall_8589934592_busbw + # - rccl-bw:4node/allreduce_17179869184_busbw + diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml new file mode 100644 index 000000000..c3b299061 --- /dev/null +++ b/superbench/analyzer/rules/diagnosis_rules.yaml @@ -0,0 +1,106 @@ +# SuperBench rules +version: v0.5 +superbench: + rules: + # Rule 0: If one test fails, label it as Not acceptable + falure_rule: + function: value + criteria: 'lambda x:x>0' + categories: FailedTest + metrics: + - kernel-launch:*.*/return_code + - mem-bw:*.*/return_code + - gemm-flops:*.*/return_code + - ib-loopback:*.*/return_code + - nccl-bw:*.*/return_code + - gpt_models/.*/return_code + - lstm_models/.*/return_code + - bert_models/.*/return_code + - resnet_models/.*/return_code + - vgg_models/.*/return_code + - densenet_models/.*/return_code + - model-benchmarks:.*/return_code + # Rule 1: If TensorCore test suffers > 5% downgrade, label it as Not acceptable + tensor_core_rule: + function: variance + criteria: 'lambda x:x<-0.05' + categories: TensorCore + metrics: + - gemm-flops:*.*/.*ops + # Rule 2: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as Not acceptable + mem_bw_rule: + function: variance + criteria: 'lambda x:x<-0.05' + categories: Mem + metrics: + - mem-bw:*.*/h2d_bw + - mem-bw:*.*/d2h_bw + # Rule 3: If ib-loopback test with 8M suffers > 5% downgrade, label it as Not acceptable + rdma_loopback_rule: + function: variance + criteria: 'lambda x:x<-0.05' + categories: RDMA + metrics: + - ib-loopback:*.*/ib_write_8388608_.*_bw + # Rule 4: If nccl-bw:default with 8GB suffers > 5% downgrade, label it as Not acceptable + nccl_rule: + function: variance + criteria: 'lambda x:x<-0.05' + categories: NCCL + metrics: + - nccl-bw:default/allreduce_8589934592_busbw + # Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable + model_throughput_rule: + function: variance + criteria: 'lambda x:x<-0.05' + categories: Model + metrics: + - gpt_models/.*/.*_train_throughput + - lstm_models/pytorch-lstm/.*_train_throughput + - bert_models/pytorch-bert-.*/.*_train_throughput + - model-benchmarks:.*/pytorch-bert.*/.*_train_throughput + - model-benchmarks:.*/pytorch-gpt.*/.*_train_throughput + - model-benchmarks:.*/pytorch-lstm.*/.*_train_throughput + # Rule 8: If 2+ CNN models suffer > 5% downgrade, label it as Not acceptable + resnet_throughput: + function: variance + criteria: 'lambda x:x<-0.05' + store: true + categories: CNN + metrics: + - resnet_models/pytorch-resnet\d*/.*_train_throughput + - model-benchmarks:.*/pytorch-resnet.*/.*_train_throughput + vgg_throughput: + function: variance + criteria: 'lambda x:x<-0.05' + store: true + categories: CNN + metrics: + - vgg_models/pytorch-vgg\d*/.*_train_throughput + - model-benchmarks:.*/pytorch-vgg.*/.*_train_throughput + densenet_throughput: + function: variance + criteria: 'lambda x:x<-0.05' + store: true + categories: CNN + metrics: + - densenet_models/pytorch-densenet\d*/.*_train_throughput + - model-benchmarks:.*/pytorch-densenet.*/.*_train_throughput + cnn_throughput_rule: + categories: CNN + function: multi_rules + criteria: 'lambda label:True if label["resnet_throughput"]+label["vgg_throughput"]+label["densenet_throughput"]>=2 else False' + # Rule 10: If temperature of one GPU is > 85 °C, label it as Not acceptable + temperature_rule: + function: value + categories: TEMP + criteria: 'lambda x:x>85' + metrics: + - monitor/gpu_temperature + # Rule 11: If DBE > 0, label it as Not acceptable (Not Support) + dbe_rule: + function: value + categories: TEMP + criteria: 'lambda x:x>85' + metrics: + - monitor/gpu_uncorrected_ecc From d63e9620269045f4b830bbde844a6a8eb951d63b Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 18 Jul 2022 11:42:19 +0800 Subject: [PATCH 02/12] fix bugs --- superbench/analyzer/generate_baseline.py | 51 ++++++++++++++---------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py index 3a5b730f7..e920ca142 100644 --- a/superbench/analyzer/generate_baseline.py +++ b/superbench/analyzer/generate_baseline.py @@ -4,11 +4,11 @@ """A module for baseline generation.""" import argparse +from copy import deepcopy import json from pathlib import Path import re - from joblib import Parallel, delayed import pandas as pd @@ -40,35 +40,36 @@ def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseli Step 3: Use the baseline and fix threshold for Outlier Detection Args: - data_series (pd.Series): data the the metric + data_series (pd.Series): data of the metric single_metric_with_baseline (dict): baseline of the single metric in 'metrics' in 2-layer dict format metric (str): the name of the metric to execute the algorithm rule_op (function): diagnosis rule op function Returns: - number: the baseline of the metric + tuple: the baseline of the metric, normal data of the metric """ if single_metric_with_baseline['metrics'][metric] != -1: return single_metric_with_baseline['metrics'][metric] - single_metric_with_baseline['metrics'] = {} + tmp_single_metric_with_baseline = deepcopy(single_metric_with_baseline) + tmp_single_metric_with_baseline['metrics'] = {} clean = False while clean is False: clean = True baseline_val = data_series.mean() for val in data_series.index: - single_metric_with_baseline['metrics'][metric] = baseline_val + tmp_single_metric_with_baseline['metrics'][metric] = baseline_val if baseline_val == 0: break data_row = pd.Series([data_series[val]], index=[metric]) details = [] categories = set() summary_data_row = pd.Series(index=[metric], dtype=float) - violated_num = rule_op(data_row, single_metric_with_baseline, summary_data_row, details, categories) + violated_num = rule_op(data_row, tmp_single_metric_with_baseline, summary_data_row, details, categories) if violated_num: data_series = data_series.drop(val) clean = False - baseline = single_metric_with_baseline['metrics'][metric] - return baseline + baseline = tmp_single_metric_with_baseline['metrics'][metric] + return baseline, data_series def get_aggregate_data(self, raw_data_file, summary_rule_file): """Aggregate raw data according to the summary rule file. @@ -152,10 +153,11 @@ def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline): aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics) for index, out in enumerate(outputs): - baseline[metrics[index]] = out + baseline[metrics[index]] = out[0] + aggregated_df[metrics[index]] = out[1] return baseline - def run(self, raw_data_file, summary_rule_file, output_dir, algorithm='mean', diagnosis_rule_file=None, baseline_file=None): + def run(self, raw_data_file, summary_rule_file, output_dir, algorithm='mean', diagnosis_rule_file=None, baseline_file=None, digit=2): """Export baseline to json file. If diagnosis_rule_file is None, use mean of the data as baseline. @@ -174,9 +176,18 @@ def run(self, raw_data_file, summary_rule_file, output_dir, algorithm='mean', baseline = file_handler.read_baseline() # generate baseline accordint to rules in diagnosis and fix threshold outlier detection method baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline) - # output baseline to json file + for metric in baseline: + val = baseline[metric] + if isinstance(self._raw_data_df[metric].iloc[0], float): + baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val + elif isinstance(self._raw_data_df[metric].iloc[0], int): + baseline[metric] = int(val) + print(self._raw_data_df["gpu-burn/gpu_0_pass"].iloc[0]) + print(self._raw_data_df["gpu-burn/gpu_0_pass"].iloc[0].type) + baseline = json.dumps(baseline, indent=2, sort_keys=True) + baseline = re.sub(r': \"(\d+.?\d*)\"', r': \1', baseline) with open(output_dir+'/baseline.json', mode='w') as f: - json.dump(baseline, f, indent=2) + f.write(baseline) except Exception as e: logger.error('Analyzer: generate baseline failed, msg: {}'.format(str(e))) @@ -186,20 +197,20 @@ def run(self, raw_data_file, summary_rule_file, output_dir, algorithm='mean', global args parser = argparse.ArgumentParser() parser.add_argument( - '--algo', type=str, default='mean', required=False, help='Algorithm to generate baseline, eg, mean/fix_threshold.' - ) + '--algo', type=str, default='mean', required=False, help='Algorithm to generate baseline, eg, mean/fix_threshold.' + ) parser.add_argument( - '--input_dir', type=str, default=None, required=False, help='Input directory which stores the results-summary.jsonl.' - ) + '--input_dir', type=str, default=None, required=False, help='Input directory which stores the results-summary.jsonl.' + ) args = parser.parse_args() - folder=args.input_dir + folder = args.input_dir #folder = '/Users/jiangyt/Documents/000-workspace/raw-data/ndv4/0.5-ndv41' - if args.algo=='mean': + if args.algo == 'mean': # simply use mean, need result_summary rules to define how to aggregate the metrics. print('Generate baseine using mean of the data.') GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml', folder) - elif args.algo=='fix_threshold': + elif args.algo == 'fix_threshold': # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics. print('Generate baseine using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.') GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml', - folder, 'fix_threshold', 'rules/diagnosis_rules.yaml') + folder, 'fix_threshold', 'rules/diagnosis_rules.yaml') From 8b9de38dc4dafdb5cf26327243dc18182d29b12a Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 18 Jul 2022 15:23:29 +0800 Subject: [PATCH 03/12] update rules --- .../analyzer/rules/aggregation_rules.yaml | 65 ++++++----------- .../analyzer/rules/diagnosis_rules.yaml | 71 ++++++++++++++----- 2 files changed, 73 insertions(+), 63 deletions(-) diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml index c5cb1708d..6cf1e8df0 100644 --- a/superbench/analyzer/rules/aggregation_rules.yaml +++ b/superbench/analyzer/rules/aggregation_rules.yaml @@ -7,18 +7,7 @@ superbench: - mean categories: model:FP32 metrics: - - model-benchmarks:vgg:float/.*/fp32_train_throughput - - model-benchmarks:gpt2-large:float/.*/fp32_train_throughput - - model-benchmarks:bert-base:float/.*/fp32_train_throughput - - model-benchmarks:bert-large:float/.*/fp32_train_throughput - - model-benchmarks:lstm:float/.*/fp32_train_throughput - - model-benchmarks:resnet50:float/.*/fp32_train_throughput - - model-benchmarks:resnet101:float/.*/fp32_train_throughput - - model-benchmarks:resnet152:float/.*/fp32_train_throughput - - model-benchmarks:densenet169:float/.*/fp32_train_throughput - - model-benchmarks:densenet201:float/.*/fp32_train_throughput - - model-benchmarks:LongRun_BERTL_models/pytorch-bert-large/fp32_train_throughput - - model-benchmarks:vgg:float/.*/fp32_train_throughput + - model-benchmarks:.*/.*/fp32_train_throughput - gpt_models/.*/fp32_train_throughput - lstm_models/pytorch-lstm/fp32_train_throughput - bert_models/pytorch-bert-.*/fp32_train_throughput @@ -29,17 +18,7 @@ superbench: statistics: mean categories: model:FP16 metrics: - - model-benchmarks:gpt2-large:half/.*/fp16_train_throughput - - model-benchmarks:bert-base:half/.*/fp16_train_throughput - - model-benchmarks:bert-large:half/.*/fp16_train_throughput - - model-benchmarks:lstm:half/.*/fp16_train_throughput - - model-benchmarks:resnet50:half/.*/fp16_train_throughput - - model-benchmarks:resnet101:half/.*/fp16_train_throughput - - model-benchmarks:resnet152:half/.*/fp16_train_throughput - - model-benchmarks:densenet169:half/.*/fp16_train_throughput - - model-benchmarks:densenet201:half/.*/fp16_train_throughput - - model-benchmarks:LongRun_BERTL_models/pytorch-bert-large/fp16_train_throughput - - model-benchmarks:vgg:half/.*/fp16_train_throughput + - model-benchmarks:.*/.*/fp16_train_throughput - gpt_models/.*/fp16_train_throughput - lstm_models/pytorch-lstm/fp16_train_throughput - bert_models/pytorch-bert-.*/fp16_train_throughput @@ -57,6 +36,8 @@ superbench: - computation-communication-overlap:*.*/.*_time - cublas-function:*.*/.*_time - cudnn-function:*.*/.*_time + - ort-inference/.*_time.* + - tensorrt-inference/.*_time.* micro-nonaggregation: statistics: mean categories: MICRO @@ -68,35 +49,29 @@ superbench: - gpu-burn/.* - cpu-memory-bw-latency/.*_bw - cpu-memory-bw-latency/.*_lat - ib-loopback: + gpu-copy-bw: + statistics: mean + categories: RDMA + metrics: + - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw + aggregate: gpu-copy-bw:perf/.*gpu(.*)_to_gpu(.*)_write_by_.*_bw + ib-loopback1: statistics: mean categories: RDMA metrics: #- ib-loopback/ib_write_8388608_ib.*_bw - ib-loopback/ib_write_.*_ib.*_bw aggregate: ib-loopback/ib_write_.*_ib(.*)_bw + ib-loopback2: + statistics: mean + categories: RDMA + metrics: + #- ib-loopback/ib_write_8388608_ib.*_bw + - ib-loopback/ib_write_bw_.*:\d+ + aggregate: True disk: statistics: mean categories: DISK - aggregate: disk-benchmark/nvme(.*)_[seq|rand]_.* + aggregate: disk-benchmark/nvme(\d+n1)_.*_iops metrics: - - disk-benchmark/nvme.* - # ib-loopback1: - # statistics: mean - # categories: RDMA1 - # metrics: - # - ib-loopback/ib_write_8388608_ib[0|1|2|3]_bw:([0|1]) - # - ib-loopback/ib_write_8388608_ib[4|5|6|7]_bw:([2|3]) - # aggregate: ib-loopback/ib_write_.*_ib.*_bw:([0|1|2|3]) - # ib-loopback2: - # statistics: mean - # categories: RDMA2 - # metrics: - # - ib-loopback/ib_write_8388608_ib[4|5|6|7]_bw:([2|3]) - # aggregate: ib-loopback/ib_write_.*_ib[4|5|6|7]_bw:([2|3]) - # statistics: mean - # categories: RCCL:4nodes - # metrics: - # - rccl-bw:4nodealltoall/alltoall_8589934592_busbw - # - rccl-bw:4node/allreduce_17179869184_busbw - + - disk-benchmark/nvme(\d+n1)_.*_iops diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml index c3b299061..1d58080b8 100644 --- a/superbench/analyzer/rules/diagnosis_rules.yaml +++ b/superbench/analyzer/rules/diagnosis_rules.yaml @@ -20,13 +20,40 @@ superbench: - vgg_models/.*/return_code - densenet_models/.*/return_code - model-benchmarks:.*/return_code + kernel_launch_rule: + function: variance + criteria: 'lambda x:x>0.05' + categories: INVESTIGATING + metrics: + - kernel-launch:*.*/.*_time # Rule 1: If TensorCore test suffers > 5% downgrade, label it as Not acceptable tensor_core_rule: function: variance criteria: 'lambda x:x<-0.05' - categories: TensorCore + categories: TensorCore metrics: - gemm-flops:*.*/.*ops + cudnn_cublas: + function: variance + criteria: 'lambda x:x>0.05' + categories: TensorCore + metrics: + - cublas-function:*.*/.*_time + - cudnn-function:*.*/.*_time + matmul: + function: variance + criteria: 'lambda x:x>0.05' + categories: TensorCore + metrics: + - matmul:*.*/.*_time + - gpu-burn/.* + - cpu-memory-bw-latency/.*_lat + cpu: + function: variance + criteria: 'lambda x:x<-0.05' + categories: TensorCore + metrics: + - cpu-memory-bw-latency/.*_bw # Rule 2: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as Not acceptable mem_bw_rule: function: variance @@ -35,21 +62,42 @@ superbench: metrics: - mem-bw:*.*/h2d_bw - mem-bw:*.*/d2h_bw + gpu-copy: + function: variance + criteria: 'lambda x:x<-0.05' + categories: Mem + metrics: + - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw # Rule 3: If ib-loopback test with 8M suffers > 5% downgrade, label it as Not acceptable rdma_loopback_rule: function: variance criteria: 'lambda x:x<-0.05' categories: RDMA metrics: - - ib-loopback:*.*/ib_write_8388608_.*_bw + - ib-loopback:*.*/ib_write_bw_8388608 + #- ib-loopback:*.*/ib_write_bw_.* # Rule 4: If nccl-bw:default with 8GB suffers > 5% downgrade, label it as Not acceptable nccl_rule: function: variance criteria: 'lambda x:x<-0.05' categories: NCCL metrics: - - nccl-bw:default/allreduce_8589934592_busbw - # Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable + - nccl-bw:.*/allreduce_8589934592_busbw + investigating_rules: + function: variance + criteria: 'lambda x:x>0.05' + categories: INVESTIGATING + metrics: + - computation-communication-overlap:*.*/.*_time + - sharding-matmul:*.*/.*_time + - ort-inference/.*_time.* + disk_rule: + function: variance + criteria: 'lambda x:x<-0.05' + categories: NCCL + metrics: + - disk-benchmark/nvme(\d+n1)_.*_iops + #Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable model_throughput_rule: function: variance criteria: 'lambda x:x<-0.05' @@ -90,17 +138,4 @@ superbench: categories: CNN function: multi_rules criteria: 'lambda label:True if label["resnet_throughput"]+label["vgg_throughput"]+label["densenet_throughput"]>=2 else False' - # Rule 10: If temperature of one GPU is > 85 °C, label it as Not acceptable - temperature_rule: - function: value - categories: TEMP - criteria: 'lambda x:x>85' - metrics: - - monitor/gpu_temperature - # Rule 11: If DBE > 0, label it as Not acceptable (Not Support) - dbe_rule: - function: value - categories: TEMP - criteria: 'lambda x:x>85' - metrics: - - monitor/gpu_uncorrected_ecc + From 5eb9219e20b2a62ec4e3c381d324bb758ba98bbb Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 13 Apr 2023 02:43:16 +0000 Subject: [PATCH 04/12] add statistic and revise --- superbench/analyzer/generate_baseline.py | 62 ++++++-- superbench/analyzer/generate_statistic.py | 186 ++++++++++++++++++++++ 2 files changed, 232 insertions(+), 16 deletions(-) create mode 100644 superbench/analyzer/generate_statistic.py diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py index e920ca142..bdab5851b 100644 --- a/superbench/analyzer/generate_baseline.py +++ b/superbench/analyzer/generate_baseline.py @@ -6,7 +6,6 @@ import argparse from copy import deepcopy import json -from pathlib import Path import re from joblib import Parallel, delayed @@ -29,6 +28,7 @@ class BaselineAlgoType(Enum): class GenerateBaseline(DataDiagnosis): + """The class to generate baseline for raw data.""" def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseline, metric, rule_op): """Fix threshold outlier detection algorithm. @@ -72,7 +72,7 @@ def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseli return baseline, data_series def get_aggregate_data(self, raw_data_file, summary_rule_file): - """Aggregate raw data according to the summary rule file. + r"""Aggregate raw data according to the summary rule file. If the metric is aggregated by rank (:\d+), remove the rank info to generate the metric name and aggregate data If the metric is aggregated by pattern in regex, aggregate the data and copy to all metrics which match this pattern @@ -149,15 +149,24 @@ def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline): function_name = self._sb_rules[rule]['function'] rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name)) outputs = Parallel(n_jobs=-1)( - delayed(self.fix_threshold_outlier_detection)( - aggregated_df[metric], single_metric_rule, metric, rule_op) - for metric in metrics) + delayed(self.fix_threshold_outlier_detection) + (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics + ) for index, out in enumerate(outputs): baseline[metrics[index]] = out[0] aggregated_df[metrics[index]] = out[1] return baseline - def run(self, raw_data_file, summary_rule_file, output_dir, algorithm='mean', diagnosis_rule_file=None, baseline_file=None, digit=2): + def run( + self, + raw_data_file, + summary_rule_file, + output_dir, + algorithm='mean', + diagnosis_rule_file=None, + baseline_file=None, + digit=2 + ): """Export baseline to json file. If diagnosis_rule_file is None, use mean of the data as baseline. @@ -165,7 +174,12 @@ def run(self, raw_data_file, summary_rule_file, output_dir, algorithm='mean', Args: raw_data_df (DataFrame): raw data - output_dir (str): the directory of output file + summary_rule_file (str): the file name of the summary rule file + output_dir (str): the directory to save the baseline file + algorithm (str): the algorithm to generate the baseline + diagnosis_rule_file (str): the file name of the diagnosis rules which used in fix_threshold algorithm + baseline_file (str): the file name of the baseline file + digit (int): the number of digits after the decimal point """ try: # aggregate results from different devices @@ -182,11 +196,14 @@ def run(self, raw_data_file, summary_rule_file, output_dir, algorithm='mean', baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val elif isinstance(self._raw_data_df[metric].iloc[0], int): baseline[metric] = int(val) - print(self._raw_data_df["gpu-burn/gpu_0_pass"].iloc[0]) - print(self._raw_data_df["gpu-burn/gpu_0_pass"].iloc[0].type) + else: + try: + baseline[metric] = float(val) + except Exception as e: + logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e))) baseline = json.dumps(baseline, indent=2, sort_keys=True) baseline = re.sub(r': \"(\d+.?\d*)\"', r': \1', baseline) - with open(output_dir+'/baseline.json', mode='w') as f: + with open(output_dir + '/baseline.json', mode='w') as f: f.write(baseline) except Exception as e: @@ -197,20 +214,33 @@ def run(self, raw_data_file, summary_rule_file, output_dir, algorithm='mean', global args parser = argparse.ArgumentParser() parser.add_argument( - '--algo', type=str, default='mean', required=False, help='Algorithm to generate baseline, eg, mean/fix_threshold.' + '--algo', + type=str, + default='fix_threshold', + required=False, + help='Algorithm to generate baseline, eg, mean/fix_threshold.' + ) + parser.add_argument( + '--input_dir', + type=str, + default=None, + required=False, + help='Input directory which stores the results-summary.jsonl.' + ) + parser.add_argument( + '--diagnosis_rule_file', type=str, default=None, required=False, help='The input path of diagnosis rule file.' ) parser.add_argument( - '--input_dir', type=str, default=None, required=False, help='Input directory which stores the results-summary.jsonl.' + '--summary_rule_file', type=str, default=None, required=False, help='The input path of summary rule file.' ) args = parser.parse_args() folder = args.input_dir - #folder = '/Users/jiangyt/Documents/000-workspace/raw-data/ndv4/0.5-ndv41' if args.algo == 'mean': # simply use mean, need result_summary rules to define how to aggregate the metrics. print('Generate baseine using mean of the data.') - GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml', folder) elif args.algo == 'fix_threshold': # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics. print('Generate baseine using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.') - GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml', - folder, 'fix_threshold', 'rules/diagnosis_rules.yaml') + GenerateBaseline().run( + folder + '/results-summary.jsonl', args.diagnosis_rule_file, folder, 'fix_threshold', args.summary_rule_file + ) diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py new file mode 100644 index 000000000..51bf2d75a --- /dev/null +++ b/superbench/analyzer/generate_statistic.py @@ -0,0 +1,186 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""A module for baseline generation.""" + +import argparse +import os + +from joblib import Parallel, delayed +import pandas as pd +import matplotlib.pyplot as plt + +from superbench.common.utils import logger +from superbench.analyzer import file_handler +from superbench.analyzer import data_analysis +from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType +from generate_baseline import GenerateBaseline + + +def plot_steps(data, title=None, save_path=None, show=True): + """Plot steps. + + Args: + data (list): data to plot + title (str): title of the plot + save_path (str): path to save the plot + show (bool): whether to show the plot + """ + plt.figure(figsize=(10, 6)) + plt.scatter(range(0, len(data)), data) + if title: + plt.title(title) + plt.xlabel('Devices') + plt.ylabel('Value') + plt.ylim(0, max(data) * 1.1) + if save_path is not None: + plt.savefig(save_path) + if show: + plt.show() + plt.close() + + +class GenerateStatistics(GenerateBaseline): + """GenerateStatistics class to generate statistics for raw data.""" + def calculate_statistics(self, healthy_df): + """Calculate statistics for healthy data. + + Args: + healthy_df (DataFrame): healthy data + + Returns: + DataFrame: statistics for healthy data + """ + stat_df = data_analysis.statistic(healthy_df) + stat_df.loc['(max-min)/max'] = (stat_df.loc['max'] - stat_df.loc['min']) / stat_df.loc['max'] + stat_df = stat_df.drop(index='1%') + stat_df = stat_df.drop(index='5%') + stat_df = stat_df.drop(index='95%') + stat_df = stat_df.drop(index='99%') + return stat_df + + def output_excel(self, excel_file, stat_df, digit=2): + """Output excel file. + + Args: + excel_file (str): excel file path + stat_df (DataFrame): statistics data + digit (int): digit to round + """ + try: + writer = pd.ExcelWriter(excel_file, engine='xlsxwriter') + + for benchmark in self._benchmark_metrics_dict: + benchmark_df = stat_df[self._benchmark_metrics_dict[benchmark]] + sheet_name = benchmark if len(benchmark) <= 30 else benchmark.split('-')[-1] + benchmark_df.to_excel(writer, sheet_name=sheet_name) + worksheet = writer.sheets[sheet_name] + row_start = 1 + row_end = max(row_start, len(self._benchmark_metrics_dict[benchmark])) + columns = list(benchmark_df.index) + col_index = columns.index('(max-min)/max') + 1 + workbook = writer.book + percent_format = workbook.add_format({'num_format': '0.00%'}) + worksheet.conditional_format( + col_index, + row_start, + col_index, + row_end, # start_row, start_col, end_row, end_col + { + 'type': 'no_blanks', + 'format': percent_format + } + ) + num_format = f'0.{digit * "0"}' + for col_index in range(2, len(columns)): + round_format = workbook.add_format({'num_format': num_format}) + worksheet.conditional_format( + col_index, + row_start, + col_index, + row_end, # start_row, start_col, end_row, end_col + { + 'type': 'no_blanks', + 'format': round_format + } + ) + writer.close() + except Exception as e: + logger.error('output excel failed: {}'.format(str(e))) + + def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_file=None, digit=2, plot=False): + """Run the statistics generation. + + Args: + raw_data_file (str): raw data file path + output_dir (str): output directory + diagnosis_rule_file (str): diagnosis rule file path + summary_rule_file (str): summary rule file path + digit (int): digit to round + plot (bool): whether to plot the data + """ + try: + # aggregate results from different devices + self._raw_data_df = self.get_aggregate_data(raw_data_file, summary_rule_file) + # re-organize metrics by benchmark names + self._benchmark_metrics_dict = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns)) + # read existing baseline + baseline = {} + # read diagnosis rules + aggregated_df = self._raw_data_df.copy() + rules = file_handler.read_rules(diagnosis_rule_file) + if not self._parse_rules_and_baseline(rules, baseline): + logger.error('parse rule failed') + return None + else: + for rule in self._sb_rules: + single_metric_rule = self._sb_rules[rule] + metrics = list(single_metric_rule['metrics'].keys()) + function_name = self._sb_rules[rule]['function'] + rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name)) + outputs = Parallel(n_jobs=-1)( + delayed(self.fix_threshold_outlier_detection) + (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics + ) + for index, out in enumerate(outputs): + aggregated_df[metrics[index]] = out[1] + if plot: + plot_steps( + out[1].tolist(), + title=metrics[index], + save_path=os.path.join( + output_dir, 'figures', metrics[index].replace('/', '_').replace(':', '_') + '.png' + ), + show=False + ) + stat_df = self.calculate_statistics(aggregated_df) + excel_file = os.path.join(output_dir, 'benchmark_stability_stat.xlsx') + self.output_excel(excel_file, stat_df, digit) + + except Exception as e: + logger.error('Analyzer: generate statisitics failed, msg: {}'.format(str(e))) + + +if __name__ == '__main__': + global args + parser = argparse.ArgumentParser() + + parser.add_argument( + '--input_dir', + type=str, + default=None, + required=False, + help='Input directory which stores the results-summary.jsonl.' + ) + parser.add_argument( + '--diagnosis_rule_file', type=str, default=None, required=False, help='The input path of diagnosis rule file.' + ) + parser.add_argument( + '--summary_rule_file', type=str, default=None, required=False, help='The input path of summary rule file.' + ) + args = parser.parse_args() + + # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics. + GenerateStatistics().run( + args.input_dir + '/results-summary.jsonl', args.input_dir, args.diagnosis_rule_file, args.summary_rule_file + ) From c589e8d3c06aee860ccd7e92369357b86e543ce2 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 13 Apr 2023 02:51:21 +0000 Subject: [PATCH 05/12] add example rules --- superbench/analyzer/rules/analysis_rules.yaml | 100 +++++++++++++ .../analyzer/rules/diagnosis_rules.yaml | 131 ++++++++++++------ 2 files changed, 192 insertions(+), 39 deletions(-) create mode 100644 superbench/analyzer/rules/analysis_rules.yaml diff --git a/superbench/analyzer/rules/analysis_rules.yaml b/superbench/analyzer/rules/analysis_rules.yaml new file mode 100644 index 000000000..f71d76b80 --- /dev/null +++ b/superbench/analyzer/rules/analysis_rules.yaml @@ -0,0 +1,100 @@ +# SuperBench rules +version: v0.5 +superbench: + rules: + model-benchmarks: + statistics: + - mean + categories: models + metrics: + - model-benchmarks:.*/.*/.*_train_throughput + - gpt_models/.*/.*_train_throughput + - lstm_models/pytorch-lstm/.*_train_throughput + - bert_models/pytorch-bert-.*/.*_train_throughput + - resnet_models/pytorch-resnet\d*/.*_train_throughput + - vgg_models/pytorch-vgg\d*/.*_train_throughput + - densenet_models/.*/.*_train_throughput + micro-flops: + statistics: mean + categories: MICRO + aggregate: True + metrics: + - cublaslt-gemm/fp8.*_0_8192_8192_8192_flops + - gemm-flops:*.*/.*op + micro-cublasltflops: + statistics: mean + categories: MICRO + aggregate: True + metrics: + - cublaslt-gemm/fp.*_.*_flops + micro-cublasltbatchflops: + statistics: mean + categories: MICRO + aggregate: True + metrics: + - cublaslt-gemm:bmm/fp.*_.*_flops + micro-aggregation-with-aggregate: + statistics: mean + categories: MICRO + aggregate: True + metrics: + - kernel-launch:*.*/.*_time + - dist-inference/.*step_times.* + - mem-bw:*.*/.*_bw + - computation-communication-overlap:*.*/.*_time + micro-aggregation-wo-aggregate: + statistics: mean + categories: MICRO + aggregate: False + metrics: + - nccl-bw:*.*/allreduce_.*_busbw + - sharding-matmul:*.*/.*_time + - matmul:*.*/.*_time + - gpu-burn:*.*/.* + - cpu-memory-bw-latency/.*_bw + - cpu-memory-bw-latency/.*_lat + #- cublas-function:*.*/.*_time + #- cudnn-function:*.*/.*_time + #- ort-inference/.*_time.* + #- tensorrt-inference/.*_time.* + gpu-copy-bw: + statistics: mean + categories: DTOH + metrics: + - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw + #- gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw + aggregate: gpu-copy-bw:perf/gpu(.*)_to_gpu(.*)_by_.*_bw + # gpu-copy-bw1: + # statistics: mean + # categories: DTOH + # metrics: + # #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw + # - gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw + # aggregate: gpu-copy-bw:perf/gpu(.*)_to_cpu_by_sm_under_numa.*_bw + # gpu-copy-bw2: + # statistics: mean + # categories: HTOD + # metrics: + # #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw + # - gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw + # aggregate: gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw + # # ib-loopback1: + # # statistics: mean + # # categories: RDMA + # # metrics: + # # #- ib-loopback/ib_write_8388608_ib.*_bw + # # - ib-loopback/ib_write_.*_ib.*_bw + # # aggregate: ib-loopback/ib_write_.*_ib(.*)_bw + # ib-loopback2: + # statistics: mean + # categories: RDMA + # metrics: + # #- ib-loopback/ib_write_8388608_ib.*_bw + # - ib-loopback/ib_write_bw_.*:\d+ + # #aggregate: True + # disk: + # statistics: mean + # categories: DISK + # aggregate: disk-benchmark/nvme(\d+n1)_.*_iops + # metrics: + # - disk-benchmark/nvme(\d+n1)_.*_iops diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml index 1d58080b8..a7074087a 100644 --- a/superbench/analyzer/rules/diagnosis_rules.yaml +++ b/superbench/analyzer/rules/diagnosis_rules.yaml @@ -19,7 +19,30 @@ superbench: - resnet_models/.*/return_code - vgg_models/.*/return_code - densenet_models/.*/return_code - - model-benchmarks:.*/return_code + - model-benchmarks:.*/return_code:.* + #- cudnn-function:*.*/return_code # 06:14 + - cublaslt-gemm:*.*/return_code + - cublas-function:*.*/return_code # 05:52 + - matmul:*.*/return_code # 00:25 + - gpu-burn:*.*/return_code # 15:08 + # microbenchmark - communication + - cpu-memory-bw-latency:*.*/return_code # 05:38 + - gpu-copy-bw:*.*/return_code # 08:44 + - computation-communication-overlap:*.*/return_code # 06:30 + - sharding-matmul:*.*/return_code # 00:24 + # microbenchmark - storage + #- disk-benchmark # 18:47 + # model benchmark - inference + # - ort-inference:*.*/return_code # 03:43 + ##tensorrt-inference # 02:03:33 + - dist-inference:*.*/return_code + - cublaslt-gemm:*.*/return_code + kernel_launch_rule_outlier: + function: value + criteria: 'lambda x:x<0.001' + categories: INVESTIGATING + metrics: + - kernel-launch:*.*/.*_time kernel_launch_rule: function: variance criteria: 'lambda x:x>0.05' @@ -33,25 +56,47 @@ superbench: categories: TensorCore metrics: - gemm-flops:*.*/.*ops - cudnn_cublas: + cublaslt_gemm_rule: function: variance - criteria: 'lambda x:x>0.05' - categories: TensorCore - metrics: - - cublas-function:*.*/.*_time - - cudnn-function:*.*/.*_time + criteria: 'lambda x:x<-0.05' + categories: CUBLASLT + metrics: + - cublaslt-gemm:*.*/.*ops + # cublas: + # function: variance + # criteria: 'lambda x:x>0.05' + # categories: CUBLAS + # metrics: + # - cublas-function:*.*/.*_time + # cudnn: + # function: variance + # criteria: 'lambda x:x>0.05' + # categories: CUDNN + # metrics: + # - cudnn-function:*.*/.*_time matmul: function: variance criteria: 'lambda x:x>0.05' - categories: TensorCore + categories: MATMUL metrics: - matmul:*.*/.*_time - - gpu-burn/.* + - sharding-matmul:*.*/.*_time + cpu1: + function: variance + criteria: 'lambda x:x>0.05' + categories: CPU + metrics: - cpu-memory-bw-latency/.*_lat + gpu_burn: + function: value + criteria: 'lambda x:x!=1' + categories: GPUBURN + metrics: + - gpu-burn:*.*/.*_pass cpu: function: variance criteria: 'lambda x:x<-0.05' - categories: TensorCore + categories: CPU metrics: - cpu-memory-bw-latency/.*_bw # Rule 2: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as Not acceptable @@ -65,16 +110,16 @@ superbench: gpu-copy: function: variance criteria: 'lambda x:x<-0.05' - categories: Mem + categories: GPUCOPY metrics: - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw # Rule 3: If ib-loopback test with 8M suffers > 5% downgrade, label it as Not acceptable - rdma_loopback_rule: - function: variance - criteria: 'lambda x:x<-0.05' - categories: RDMA - metrics: - - ib-loopback:*.*/ib_write_bw_8388608 + # rdma_loopback_rule: + # function: variance + # criteria: 'lambda x:x<-0.05' + # categories: RDMA + # metrics: + # - ib-loopback:*.*/ib_write_bw_8388608 #- ib-loopback:*.*/ib_write_bw_.* # Rule 4: If nccl-bw:default with 8GB suffers > 5% downgrade, label it as Not acceptable nccl_rule: @@ -82,21 +127,25 @@ superbench: criteria: 'lambda x:x<-0.05' categories: NCCL metrics: - - nccl-bw:.*/allreduce_8589934592_busbw - investigating_rules: + - nccl-bw:*.*/allreduce_.*_busbw + overlap_rules: function: variance criteria: 'lambda x:x>0.05' - categories: INVESTIGATING + categories: OVERLAP metrics: - computation-communication-overlap:*.*/.*_time - - sharding-matmul:*.*/.*_time - - ort-inference/.*_time.* - disk_rule: + investigating_rules: function: variance - criteria: 'lambda x:x<-0.05' - categories: NCCL - metrics: - - disk-benchmark/nvme(\d+n1)_.*_iops + criteria: 'lambda x:x>0.05' + categories: DIST_INFERENCE + metrics: + - dist-inference/.*step_times + # disk_rule: + # function: variance + # criteria: 'lambda x:x<-0.05' + # categories: DISK + # metrics: + # - disk-benchmark/nvme(\d+n1)_.*_iops #Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable model_throughput_rule: function: variance @@ -104,11 +153,11 @@ superbench: categories: Model metrics: - gpt_models/.*/.*_train_throughput - - lstm_models/pytorch-lstm/.*_train_throughput - - bert_models/pytorch-bert-.*/.*_train_throughput - - model-benchmarks:.*/pytorch-bert.*/.*_train_throughput - - model-benchmarks:.*/pytorch-gpt.*/.*_train_throughput - - model-benchmarks:.*/pytorch-lstm.*/.*_train_throughput + - lstm_models/.*lstm/.*_train_throughput + - bert_models/.*bert-.*/.*_train_throughput + - model-benchmarks:.*/.*bert.*/.*_train_throughput + - model-benchmarks:.*/.*gpt.*/.*_train_throughput + - model-benchmarks:.*/.*lstm.*/.*_train_throughput # Rule 8: If 2+ CNN models suffer > 5% downgrade, label it as Not acceptable resnet_throughput: function: variance @@ -116,26 +165,30 @@ superbench: store: true categories: CNN metrics: - - resnet_models/pytorch-resnet\d*/.*_train_throughput - - model-benchmarks:.*/pytorch-resnet.*/.*_train_throughput + - resnet_models/.*resnet\d*/.*_train_throughput + - model-benchmarks:.*/.*resnet.*/.*_train_throughput vgg_throughput: function: variance criteria: 'lambda x:x<-0.05' store: true categories: CNN metrics: - - vgg_models/pytorch-vgg\d*/.*_train_throughput - - model-benchmarks:.*/pytorch-vgg.*/.*_train_throughput + - vgg_models/.*vgg\d*/.*_train_throughput + - model-benchmarks:.*/.*vgg.*/.*_train_throughput densenet_throughput: function: variance criteria: 'lambda x:x<-0.05' store: true categories: CNN metrics: - - densenet_models/pytorch-densenet\d*/.*_train_throughput - - model-benchmarks:.*/pytorch-densenet.*/.*_train_throughput + - densenet_models/.*densenet\d*/.*_train_throughput + - model-benchmarks:.*/.*densenet.*/.*_train_throughput cnn_throughput_rule: categories: CNN function: multi_rules - criteria: 'lambda label:True if label["resnet_throughput"]+label["vgg_throughput"]+label["densenet_throughput"]>=2 else False' + criteria: 'lambda label:True if label["resnet_throughput"]+label["densenet_throughput"]>=2 else False' + vgg_throughput_rule: + categories: VGG + function: multi_rules + criteria: 'lambda label:True if label["vgg_throughput"]>=2 else False' From 19d7beec155006781eb916416ba9405bf8d3d6d8 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 13 Apr 2023 05:05:05 +0000 Subject: [PATCH 06/12] fix bug --- superbench/analyzer/generate_baseline.py | 3 ++- superbench/analyzer/generate_statistic.py | 17 ++++++++++++++--- superbench/analyzer/rules/analysis_rules.yaml | 14 +++++++------- superbench/analyzer/rules/diagnosis_rules.yaml | 12 ++++++------ 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py index bdab5851b..7b2de74c2 100644 --- a/superbench/analyzer/generate_baseline.py +++ b/superbench/analyzer/generate_baseline.py @@ -48,7 +48,8 @@ def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseli Returns: tuple: the baseline of the metric, normal data of the metric """ - if single_metric_with_baseline['metrics'][metric] != -1: + if single_metric_with_baseline['metrics'][metric] != None and single_metric_with_baseline['metrics'][metric + ] != -1: return single_metric_with_baseline['metrics'][metric] tmp_single_metric_with_baseline = deepcopy(single_metric_with_baseline) tmp_single_metric_with_baseline['metrics'] = {} diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py index 51bf2d75a..13cd146f9 100644 --- a/superbench/analyzer/generate_statistic.py +++ b/superbench/analyzer/generate_statistic.py @@ -143,6 +143,9 @@ def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_ (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics ) for index, out in enumerate(outputs): + if not out: + logger.error('Analyzer: filter healthy nodese failed') + return aggregated_df[metrics[index]] = out[1] if plot: plot_steps( @@ -168,15 +171,23 @@ def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_ parser.add_argument( '--input_dir', type=str, - default=None, + default='rawdata/', required=False, help='Input directory which stores the results-summary.jsonl.' ) parser.add_argument( - '--diagnosis_rule_file', type=str, default=None, required=False, help='The input path of diagnosis rule file.' + '--diagnosis_rule_file', + type=str, + default='rules/diagnosis_rules.yaml', + required=False, + help='The input path of diagnosis rule file.' ) parser.add_argument( - '--summary_rule_file', type=str, default=None, required=False, help='The input path of summary rule file.' + '--summary_rule_file', + type=str, + default='rules/analysis_rules.yaml', + required=False, + help='The input path of summary rule file.' ) args = parser.parse_args() diff --git a/superbench/analyzer/rules/analysis_rules.yaml b/superbench/analyzer/rules/analysis_rules.yaml index f71d76b80..8632de21a 100644 --- a/superbench/analyzer/rules/analysis_rules.yaml +++ b/superbench/analyzer/rules/analysis_rules.yaml @@ -27,12 +27,12 @@ superbench: aggregate: True metrics: - cublaslt-gemm/fp.*_.*_flops - micro-cublasltbatchflops: - statistics: mean - categories: MICRO - aggregate: True - metrics: - - cublaslt-gemm:bmm/fp.*_.*_flops + # micro-cublasltbatchflops: + # statistics: mean + # categories: MICRO + # aggregate: True + # metrics: + # - cublaslt-gemm:bmm/fp.*_.*_flops micro-aggregation-with-aggregate: statistics: mean categories: MICRO @@ -50,7 +50,7 @@ superbench: - nccl-bw:*.*/allreduce_.*_busbw - sharding-matmul:*.*/.*_time - matmul:*.*/.*_time - - gpu-burn:*.*/.* + #- gpu-burn:*.*/.*_pass - cpu-memory-bw-latency/.*_bw - cpu-memory-bw-latency/.*_lat #- cublas-function:*.*/.*_time diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml index a7074087a..435340fe4 100644 --- a/superbench/analyzer/rules/diagnosis_rules.yaml +++ b/superbench/analyzer/rules/diagnosis_rules.yaml @@ -87,12 +87,12 @@ superbench: categories: CPU metrics: - cpu-memory-bw-latency/.*_lat - gpu_burn: - function: value - criteria: 'lambda x:x!=1' - categories: GPUBURN - metrics: - - gpu-burn:*.*/.*_pass + # gpu_burn: + # function: value + # criteria: 'lambda x:x!=1' + # categories: GPUBURN + # metrics: + # - gpu-burn:*.*/.*_pass cpu: function: variance criteria: 'lambda x:x<-0.05' From 77ef4cc551d662c761407d1f3ef8871ee05d3eef Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Fri, 14 Apr 2023 10:08:13 +0000 Subject: [PATCH 07/12] update aggregation rule for baseline generation --- superbench/analyzer/generate_baseline.py | 3 +- .../analyzer/rules/aggregation_rules.yaml | 63 +++++++++---------- superbench/analyzer/rules/analysis_rules.yaml | 12 ++-- 3 files changed, 36 insertions(+), 42 deletions(-) diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py index 7b2de74c2..9f82d26f8 100644 --- a/superbench/analyzer/generate_baseline.py +++ b/superbench/analyzer/generate_baseline.py @@ -29,6 +29,7 @@ class BaselineAlgoType(Enum): class GenerateBaseline(DataDiagnosis): """The class to generate baseline for raw data.""" + def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseline, metric, rule_op): """Fix threshold outlier detection algorithm. @@ -243,5 +244,5 @@ def run( # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics. print('Generate baseine using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.') GenerateBaseline().run( - folder + '/results-summary.jsonl', args.diagnosis_rule_file, folder, 'fix_threshold', args.summary_rule_file + folder + '/results-summary.jsonl', args.summary_rule_file, folder, 'fix_threshold', args.diagnosis_rule_file ) diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml index 6cf1e8df0..db8e7a87d 100644 --- a/superbench/analyzer/rules/aggregation_rules.yaml +++ b/superbench/analyzer/rules/aggregation_rules.yaml @@ -2,32 +2,21 @@ version: v0.5 superbench: rules: - model-benchmarks-FP32: + model-benchmarks: statistics: - mean - categories: model:FP32 + categories: model metrics: - - model-benchmarks:.*/.*/fp32_train_throughput - - gpt_models/.*/fp32_train_throughput - - lstm_models/pytorch-lstm/fp32_train_throughput - - bert_models/pytorch-bert-.*/fp32_train_throughput - - resnet_models/pytorch-resnet\d*/fp32_train_throughput - - vgg_models/pytorch-vgg\d*/fp32_train_throughput - - densenet_models/.*/fp32_train_throughput - model-benchmarks-FP16: - statistics: mean - categories: model:FP16 - metrics: - - model-benchmarks:.*/.*/fp16_train_throughput - - gpt_models/.*/fp16_train_throughput - - lstm_models/pytorch-lstm/fp16_train_throughput - - bert_models/pytorch-bert-.*/fp16_train_throughput - - resnet_models/pytorch-resnet\d*/fp16_train_throughput - - vgg_models/pytorch-vgg\d*/fp16_train_throughput - - densenet_models/.*/fp16_train_throughput + - model-benchmarks:.*/.*/.*_train_throughput + - gpt_models/.*/.*_train_throughput + - lstm_models/pytorch-lstm/.*_train_throughput + - bert_models/pytorch-bert-.*/.*_train_throughput + - resnet_models/pytorch-resnet\d*/.*_train_throughput + - vgg_models/pytorch-vgg\d*/.*_train_throughput + - densenet_models/.*/.*_train_throughput micro-aggregation: statistics: mean - categories: MICRO + categories: MICRO1 aggregate: True metrics: - gemm-flops:*.*/.*ops @@ -38,37 +27,41 @@ superbench: - cudnn-function:*.*/.*_time - ort-inference/.*_time.* - tensorrt-inference/.*_time.* + - cublaslt-gemm:*.*/.*ops + - dist-inference/.*step_times.* micro-nonaggregation: statistics: mean - categories: MICRO + categories: MICRO2 metrics: - nccl-bw:*.*/allreduce_.*_busbw - rccl-bw:*.*/allreduce_.*_busbw - sharding-matmul:*.*/.*_time - matmul:*.*/.*_time - - gpu-burn/.* + - gpu-burn/.*_pass + - gpu_burn/.*_abort - cpu-memory-bw-latency/.*_bw - cpu-memory-bw-latency/.*_lat gpu-copy-bw: statistics: mean - categories: RDMA + categories: DTOH metrics: - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw - aggregate: gpu-copy-bw:perf/.*gpu(.*)_to_gpu(.*)_write_by_.*_bw - ib-loopback1: + #- gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw + aggregate: gpu-copy-bw:perf/gpu(.*)_to_gpu(.*)_by_.*_bw + gpu-copy-bw1: statistics: mean - categories: RDMA + categories: DTOH metrics: - #- ib-loopback/ib_write_8388608_ib.*_bw - - ib-loopback/ib_write_.*_ib.*_bw - aggregate: ib-loopback/ib_write_.*_ib(.*)_bw - ib-loopback2: + #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw + - gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw + aggregate: gpu-copy-bw:perf/gpu(.*)_to_cpu_by_sm_under_numa.*_bw + gpu-copy-bw2: statistics: mean - categories: RDMA + categories: HTOD metrics: - #- ib-loopback/ib_write_8388608_ib.*_bw - - ib-loopback/ib_write_bw_.*:\d+ - aggregate: True + #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw + - gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw + aggregate: gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw disk: statistics: mean categories: DISK diff --git a/superbench/analyzer/rules/analysis_rules.yaml b/superbench/analyzer/rules/analysis_rules.yaml index 8632de21a..23f8e4c2a 100644 --- a/superbench/analyzer/rules/analysis_rules.yaml +++ b/superbench/analyzer/rules/analysis_rules.yaml @@ -21,12 +21,12 @@ superbench: metrics: - cublaslt-gemm/fp8.*_0_8192_8192_8192_flops - gemm-flops:*.*/.*op - micro-cublasltflops: - statistics: mean - categories: MICRO - aggregate: True - metrics: - - cublaslt-gemm/fp.*_.*_flops + # micro-cublasltflops: + # statistics: mean + # categories: MICRO + # aggregate: True + # metrics: + # - cublaslt-gemm/fp.*_.*_flops # micro-cublasltbatchflops: # statistics: mean # categories: MICRO From 6e21416b588431b1a407024b32832d97dd6a0f15 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Fri, 14 Apr 2023 10:10:32 +0000 Subject: [PATCH 08/12] update diagnosis rule for baseline generation --- .../analyzer/rules/diagnosis_rules.yaml | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml index 435340fe4..8ece570ab 100644 --- a/superbench/analyzer/rules/diagnosis_rules.yaml +++ b/superbench/analyzer/rules/diagnosis_rules.yaml @@ -62,18 +62,18 @@ superbench: categories: CUBLASLT metrics: - cublaslt-gemm:*.*/.*ops - # cublas: - # function: variance - # criteria: 'lambda x:x>0.05' - # categories: CUBLAS - # metrics: - # - cublas-function:*.*/.*_time - # cudnn: - # function: variance - # criteria: 'lambda x:x>0.05' - # categories: CUDNN - # metrics: - # - cudnn-function:*.*/.*_time + cublas: + function: variance + criteria: 'lambda x:x>0.05' + categories: CUBLAS + metrics: + - cublas-function:*.*/.*_time + cudnn: + function: variance + criteria: 'lambda x:x>0.05' + categories: CUDNN + metrics: + - cudnn-function:*.*/.*_time matmul: function: variance criteria: 'lambda x:x>0.05' @@ -87,12 +87,12 @@ superbench: categories: CPU metrics: - cpu-memory-bw-latency/.*_lat - # gpu_burn: - # function: value - # criteria: 'lambda x:x!=1' - # categories: GPUBURN - # metrics: - # - gpu-burn:*.*/.*_pass + gpu_burn: + function: value + criteria: 'lambda x:x!=1' + categories: GPUBURN + metrics: + - gpu-burn:*.*/.*_pass cpu: function: variance criteria: 'lambda x:x<-0.05' @@ -140,12 +140,12 @@ superbench: categories: DIST_INFERENCE metrics: - dist-inference/.*step_times - # disk_rule: - # function: variance - # criteria: 'lambda x:x<-0.05' - # categories: DISK - # metrics: - # - disk-benchmark/nvme(\d+n1)_.*_iops + disk_rule: + function: variance + criteria: 'lambda x:x<-0.05' + categories: DISK + metrics: + - disk-benchmark/nvme(\d+n1)_.*_iops #Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable model_throughput_rule: function: variance From 0ddeed88a1ce7b060069f680f166cf65c905dd57 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Fri, 16 Jun 2023 16:08:21 +0800 Subject: [PATCH 09/12] for inference --- superbench/analyzer/generate_statistic.py | 3 ++ .../analyzer/rules/aggregation_rules.yaml | 9 ++++- .../analyzer/rules/diagnosis_rules.yaml | 39 ++++++++++++++++++- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py index 13cd146f9..80ccaa8f8 100644 --- a/superbench/analyzer/generate_statistic.py +++ b/superbench/analyzer/generate_statistic.py @@ -5,6 +5,7 @@ import argparse import os +import natsort as ns from joblib import Parallel, delayed import pandas as pd @@ -72,6 +73,8 @@ def output_excel(self, excel_file, stat_df, digit=2): for benchmark in self._benchmark_metrics_dict: benchmark_df = stat_df[self._benchmark_metrics_dict[benchmark]] + #benchmark_df = benchmark_df[,mixedsort(names(benchmark_df))] + benchmark_df = benchmark_df.reindex(ns.natsorted(benchmark_df.columns), axis=1) sheet_name = benchmark if len(benchmark) <= 30 else benchmark.split('-')[-1] benchmark_df.to_excel(writer, sheet_name=sheet_name) worksheet = writer.sheets[sheet_name] diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml index db8e7a87d..3b330b5bb 100644 --- a/superbench/analyzer/rules/aggregation_rules.yaml +++ b/superbench/analyzer/rules/aggregation_rules.yaml @@ -2,7 +2,7 @@ version: v0.5 superbench: rules: - model-benchmarks: + model-train-benchmarks: statistics: - mean categories: model @@ -14,6 +14,13 @@ superbench: - resnet_models/pytorch-resnet\d*/.*_train_throughput - vgg_models/pytorch-vgg\d*/.*_train_throughput - densenet_models/.*/.*_train_throughput + model-inference-benchmarks: + statistics: + - mean + categories: model + aggregate: True + metrics: + - model-benchmarks:.*/.*/.*_inference_throughput:\d+ micro-aggregation: statistics: mean categories: MICRO1 diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml index 8ece570ab..3771566d3 100644 --- a/superbench/analyzer/rules/diagnosis_rules.yaml +++ b/superbench/analyzer/rules/diagnosis_rules.yaml @@ -191,4 +191,41 @@ superbench: categories: VGG function: multi_rules criteria: 'lambda label:True if label["vgg_throughput"]>=2 else False' - + model_inference_throughput_rule: + function: variance + criteria: 'lambda x:x<-0.05' + categories: Model + metrics: + - model-benchmarks:.*/.*bert.*/.*_inference_throughput + - model-benchmarks:.*/.*gpt.*/.*_inference_throughput + - model-benchmarks:.*/.*lstm.*/.*_inference_throughput + # Rule 8: If 2+ CNN models suffer > 5% downgrade, label it as Not acceptable + resnet_inference_throughput: + function: variance + criteria: 'lambda x:x<-0.05' + store: true + categories: CNN + metrics: + - model-benchmarks:.*/.*resnet.*/.*_inference_throughput + vgg_inference_throughput: + function: variance + criteria: 'lambda x:x<-0.05' + store: true + categories: CNN + metrics: + - model-benchmarks:.*/.*vgg.*/.*_inference_throughput + densenet_inference_throughput: + function: variance + criteria: 'lambda x:x<-0.05' + store: true + categories: CNN + metrics: + - model-benchmarks:.*/.*densenet.*/.*_inference_throughput + cnn_inference_throughput_rule: + categories: CNN + function: multi_rules + criteria: 'lambda label:True if label["resnet_inference_throughput"]+label["densenet_inference_throughput"]>=2 else False' + vgg_inference_throughput_rule: + categories: VGG + function: multi_rules + criteria: 'lambda label:True if label["vgg_inference_throughput"]>=2 else False' From b5d5cc75b7cb9b6d45dc040bd7b0e24fd94e86c3 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Mon, 13 Nov 2023 10:00:56 +0800 Subject: [PATCH 10/12] fix typo --- superbench/analyzer/file_handler.py | 2 +- superbench/analyzer/generate_statistic.py | 2 +- superbench/analyzer/rules/diagnosis_rules.yaml | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/superbench/analyzer/file_handler.py b/superbench/analyzer/file_handler.py index 2337435ea..f9f4065f9 100644 --- a/superbench/analyzer/file_handler.py +++ b/superbench/analyzer/file_handler.py @@ -39,7 +39,7 @@ def read_raw_data(raw_data_path): raw_data_df = raw_data_df.rename(raw_data_df['node']) raw_data_df = raw_data_df.drop(columns=['node']) except Exception as e: - logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data fomat - {}'.format(str(e))) + logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data format - {}'.format(str(e))) return raw_data_df diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py index 80ccaa8f8..d64813366 100644 --- a/superbench/analyzer/generate_statistic.py +++ b/superbench/analyzer/generate_statistic.py @@ -147,7 +147,7 @@ def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_ ) for index, out in enumerate(outputs): if not out: - logger.error('Analyzer: filter healthy nodese failed') + logger.error('Analyzer: filter healthy nodes failed') return aggregated_df[metrics[index]] = out[1] if plot: diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml index 3771566d3..cbe7994bd 100644 --- a/superbench/analyzer/rules/diagnosis_rules.yaml +++ b/superbench/analyzer/rules/diagnosis_rules.yaml @@ -49,6 +49,12 @@ superbench: categories: INVESTIGATING metrics: - kernel-launch:*.*/.*_time + tensorrt_inference: + function: variance + criteria: 'lambda x:x>0.05' + categories: TensorRT + metrics: + - tensorrt-inference/.*_time.* # Rule 1: If TensorCore test suffers > 5% downgrade, label it as Not acceptable tensor_core_rule: function: variance From 1797a29c41c0c34c2e8e0062c015da46de155881 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Mon, 13 Nov 2023 10:04:06 +0800 Subject: [PATCH 11/12] upgrade packages for analyzer. --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b42639eea..636d6119b 100644 --- a/setup.py +++ b/setup.py @@ -160,11 +160,12 @@ def run(self): 'matplotlib>=3.0.0', 'natsort>=7.1.1', 'networkx>=2.5', - 'numpy>=1.19.2', + 'numpy>=1.20.3', 'omegaconf==2.0.6', 'openpyxl>=3.0.7', 'pandas>=1.1.5', 'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4', + 'python-dateutil>=2.8.2' 'pyyaml>=5.3', 'requests>=2.27.1', 'seaborn>=0.11.2', From 6ded58860c3ffa2f58c9804f7b4e247d6d78cbd6 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Mon, 20 Nov 2023 02:56:17 +0800 Subject: [PATCH 12/12] fix typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 636d6119b..4785728fc 100644 --- a/setup.py +++ b/setup.py @@ -165,7 +165,7 @@ def run(self): 'openpyxl>=3.0.7', 'pandas>=1.1.5', 'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4', - 'python-dateutil>=2.8.2' + 'python-dateutil>=2.8.2', 'pyyaml>=5.3', 'requests>=2.27.1', 'seaborn>=0.11.2',