From d4f91d243bcc74b42555a8ad5cc12a69fcbac072 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Mon, 30 May 2022 16:27:42 +0800
Subject: [PATCH 01/12] Generate baseline using mean or fix_threshold algorithm

---
 superbench/analyzer/generate_baseline.py      | 205 ++++++++++++++++++
 .../analyzer/rules/aggregation_rules.yaml     | 102 +++++++++
 .../analyzer/rules/diagnosis_rules.yaml       | 106 +++++++++
 3 files changed, 413 insertions(+)
 create mode 100644 superbench/analyzer/generate_baseline.py
 create mode 100644 superbench/analyzer/rules/aggregation_rules.yaml
 create mode 100644 superbench/analyzer/rules/diagnosis_rules.yaml

diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py
new file mode 100644
index 000000000..3a5b730f7
--- /dev/null
+++ b/superbench/analyzer/generate_baseline.py
@@ -0,0 +1,205 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""A module for baseline generation."""
+
+import argparse
+import json
+from pathlib import Path
+import re
+
+
+from joblib import Parallel, delayed
+import pandas as pd
+
+from superbench.common.utils import logger
+from superbench.analyzer import file_handler
+from superbench.analyzer import data_analysis
+from superbench.analyzer import DataDiagnosis
+from superbench.analyzer import ResultSummary
+from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
+from superbench.benchmarks.context import Enum
+
+
+class BaselineAlgoType(Enum):
+    """The Enum class representing different baseline generation algorithm."""
+
+    MEAN = 'mean'
+    FIX_THRESHOLD = 'fix_threshold'
+
+
+class GenerateBaseline(DataDiagnosis):
+    def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseline, metric, rule_op):
+        """Fix threshold outlier detection algorithm.
+
+        Step 0: Put all data in the collection
+        Step 1: Regenerate the collection
+        Calculate the average number in the collection as the baseline
+        Remove all data which cannot pass the fix threshold based on the new baseline
+        Step 2: If no data has been removed from Step 1, go to Step 3; otherwise, go to Step 1
+        Step 3: Use the baseline and fix threshold for Outlier Detection
+
+        Args:
+            data_series (pd.Series): data the the metric
+            single_metric_with_baseline (dict): baseline of the single metric in 'metrics' in 2-layer dict format
+            metric (str): the name of the metric to execute the algorithm
+            rule_op (function): diagnosis rule op function
+
+        Returns:
+            number: the baseline of the metric
+        """
+        if single_metric_with_baseline['metrics'][metric] != -1:
+            return single_metric_with_baseline['metrics'][metric]
+        single_metric_with_baseline['metrics'] = {}
+        clean = False
+        while clean is False:
+            clean = True
+            baseline_val = data_series.mean()
+            for val in data_series.index:
+                single_metric_with_baseline['metrics'][metric] = baseline_val
+                if baseline_val == 0:
+                    break
+                data_row = pd.Series([data_series[val]], index=[metric])
+                details = []
+                categories = set()
+                summary_data_row = pd.Series(index=[metric], dtype=float)
+                violated_num = rule_op(data_row, single_metric_with_baseline, summary_data_row, details, categories)
+                if violated_num:
+                    data_series = data_series.drop(val)
+                    clean = False
+        baseline = single_metric_with_baseline['metrics'][metric]
+        return baseline
+
+    def get_aggregate_data(self, raw_data_file, summary_rule_file):
+        """Aggregate raw data according to the summary rule file.
+
+        If the metric is aggregated by rank (:\d+), remove the rank info to generate the metric name and aggregate data
+        If the metric is aggregated by pattern in regex, aggregate the data and copy to all metrics which match this pattern
+
+        Args:
+            raw_data_file (str): the file name of the raw data file
+            summary_rule_file (str): the file name of the summary rule file
+
+        Returns:
+            DataFrame: aggregated data
+        """
+        self.rs = ResultSummary()
+        rules = self.rs._preprocess(raw_data_file, summary_rule_file)
+        # parse rules for result summary
+        if not self.rs._parse_rules(rules):
+            return
+        aggregated_df = pd.DataFrame()
+        for rule in self.rs._sb_rules:
+            single_metric_rule = self.rs._sb_rules[rule]
+            metrics = list(single_metric_rule['metrics'].keys())
+            data_df_of_rule = self.rs._raw_data_df[metrics]
+            if self.rs._sb_rules[rule]['aggregate']:
+                # if aggregate is True, aggregate in ranks
+                if self.rs._sb_rules[rule]['aggregate'] is True:
+                    data_df_of_rule = data_analysis.aggregate(data_df_of_rule)
+                # if aggregate is not empty and is a pattern in regex, aggregate according to pattern
+                else:
+                    pattern = self.rs._sb_rules[rule]['aggregate']
+                    data_df_of_rule_with_short_name = data_analysis.aggregate(data_df_of_rule, pattern)
+                    data_df_of_rule = pd.DataFrame(columns=metrics)
+                    # restore the columns of data_fd to full metric names
+                    for metric in metrics:
+                        short = ''
+                        match = re.search(pattern, metric)
+                        if match:
+                            metric_in_list = list(metric)
+                            for i in range(1, len(match.groups()) + 1):
+                                metric_in_list[match.start(i):match.end(i)] = '*'
+                            short = ''.join(metric_in_list)
+                        data_df_of_rule[metric] = data_df_of_rule_with_short_name[short]
+            aggregated_df = pd.concat([aggregated_df, data_df_of_rule], axis=1)
+        return aggregated_df
+
+    def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline):
+        """Generate the baseline in json format.
+
+        Args:
+            algo (str): the algorithm to generate the baseline
+            aggregated_df (DataFrame): aggregated data
+            diagnosis_rule_file (str): the file name of the diagnosis rules which used in fix_threshold algorithm
+            baseline (dict): existing baseline of some metrics
+
+        Returns:
+            dict: baseline of metrics defined in diagnosis_rule_files for fix_threshold algorithm or defined in rule_summary_files for mean
+        """
+        baseline = {}
+        # re-organize metrics by benchmark names
+        self._benchmark_metrics_dict = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
+        if algo == 'mean':
+            mean_df = self._raw_data_df.mean()
+            for metric in self._raw_data_df.columns:
+                if metric in baseline:
+                    return baseline[metric]
+                baseline[metric] = mean_df[metric]
+        elif algo == 'fix_threshold':
+            # read diagnosis rules
+            rules = file_handler.read_rules(diagnosis_rule_file)
+            if not self._parse_rules_and_baseline(rules, baseline):
+                return baseline
+            else:
+                for rule in self._sb_rules:
+                    single_metric_rule = self._sb_rules[rule]
+                    metrics = list(single_metric_rule['metrics'].keys())
+                    function_name = self._sb_rules[rule]['function']
+                    rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name))
+                    outputs = Parallel(n_jobs=-1)(
+                        delayed(self.fix_threshold_outlier_detection)(
+                            aggregated_df[metric], single_metric_rule, metric, rule_op)
+                        for metric in metrics)
+                    for index, out in enumerate(outputs):
+                        baseline[metrics[index]] = out
+        return baseline
+
+    def run(self, raw_data_file,  summary_rule_file,  output_dir, algorithm='mean', diagnosis_rule_file=None, baseline_file=None):
+        """Export baseline to json file.
+
+        If diagnosis_rule_file is None, use mean of the data as baseline.
+        If diagnosis_rule_file is not None, use the rules in diagnosis_rule_file to execute fix_threshold algorithm.
+
+        Args:
+            raw_data_df (DataFrame): raw data
+            output_dir (str): the directory of output file
+        """
+        try:
+            # aggregate results from different devices
+            self._raw_data_df = self.get_aggregate_data(raw_data_file, summary_rule_file)
+            # read existing baseline
+            baseline = {}
+            if baseline_file:
+                baseline = file_handler.read_baseline()
+            # generate baseline accordint to rules in diagnosis and fix threshold outlier detection method
+            baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline)
+            # output baseline to json file
+            with open(output_dir+'/baseline.json', mode='w') as f:
+                json.dump(baseline, f, indent=2)
+
+        except Exception as e:
+            logger.error('Analyzer: generate baseline failed, msg: {}'.format(str(e)))
+
+
+if __name__ == '__main__':
+    global args
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+            '--algo', type=str, default='mean', required=False, help='Algorithm to generate baseline, eg, mean/fix_threshold.'
+        )
+    parser.add_argument(
+            '--input_dir', type=str, default=None, required=False, help='Input directory which stores the results-summary.jsonl.'
+        )
+    args = parser.parse_args()
+    folder=args.input_dir
+    #folder = '/Users/jiangyt/Documents/000-workspace/raw-data/ndv4/0.5-ndv41'
+    if args.algo=='mean':
+        # simply use mean, need result_summary rules to define how to aggregate the metrics.
+        print('Generate baseine using mean of the data.')
+        GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml', folder)
+    elif args.algo=='fix_threshold':
+        # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics.
+        print('Generate baseine using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.')
+        GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml',
+                            folder, 'fix_threshold', 'rules/diagnosis_rules.yaml')
diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml
new file mode 100644
index 000000000..c5cb1708d
--- /dev/null
+++ b/superbench/analyzer/rules/aggregation_rules.yaml
@@ -0,0 +1,102 @@
+# SuperBench rules
+version: v0.5
+superbench:
+  rules:
+    model-benchmarks-FP32:
+      statistics:
+        - mean
+      categories: model:FP32
+      metrics:
+        - model-benchmarks:vgg:float/.*/fp32_train_throughput
+        - model-benchmarks:gpt2-large:float/.*/fp32_train_throughput
+        - model-benchmarks:bert-base:float/.*/fp32_train_throughput
+        - model-benchmarks:bert-large:float/.*/fp32_train_throughput
+        - model-benchmarks:lstm:float/.*/fp32_train_throughput
+        - model-benchmarks:resnet50:float/.*/fp32_train_throughput
+        - model-benchmarks:resnet101:float/.*/fp32_train_throughput
+        - model-benchmarks:resnet152:float/.*/fp32_train_throughput
+        - model-benchmarks:densenet169:float/.*/fp32_train_throughput
+        - model-benchmarks:densenet201:float/.*/fp32_train_throughput
+        - model-benchmarks:LongRun_BERTL_models/pytorch-bert-large/fp32_train_throughput
+        - model-benchmarks:vgg:float/.*/fp32_train_throughput
+        - gpt_models/.*/fp32_train_throughput
+        - lstm_models/pytorch-lstm/fp32_train_throughput
+        - bert_models/pytorch-bert-.*/fp32_train_throughput
+        - resnet_models/pytorch-resnet\d*/fp32_train_throughput
+        - vgg_models/pytorch-vgg\d*/fp32_train_throughput
+        - densenet_models/.*/fp32_train_throughput
+    model-benchmarks-FP16:
+      statistics: mean
+      categories: model:FP16
+      metrics:
+        - model-benchmarks:gpt2-large:half/.*/fp16_train_throughput
+        - model-benchmarks:bert-base:half/.*/fp16_train_throughput
+        - model-benchmarks:bert-large:half/.*/fp16_train_throughput
+        - model-benchmarks:lstm:half/.*/fp16_train_throughput
+        - model-benchmarks:resnet50:half/.*/fp16_train_throughput
+        - model-benchmarks:resnet101:half/.*/fp16_train_throughput
+        - model-benchmarks:resnet152:half/.*/fp16_train_throughput
+        - model-benchmarks:densenet169:half/.*/fp16_train_throughput
+        - model-benchmarks:densenet201:half/.*/fp16_train_throughput
+        - model-benchmarks:LongRun_BERTL_models/pytorch-bert-large/fp16_train_throughput
+        - model-benchmarks:vgg:half/.*/fp16_train_throughput
+        - gpt_models/.*/fp16_train_throughput
+        - lstm_models/pytorch-lstm/fp16_train_throughput
+        - bert_models/pytorch-bert-.*/fp16_train_throughput
+        - resnet_models/pytorch-resnet\d*/fp16_train_throughput
+        - vgg_models/pytorch-vgg\d*/fp16_train_throughput
+        - densenet_models/.*/fp16_train_throughput
+    micro-aggregation:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+        - gemm-flops:*.*/.*ops
+        - mem-bw:*.*/.*_bw
+        - kernel-launch:*.*/.*_time
+        - computation-communication-overlap:*.*/.*_time
+        - cublas-function:*.*/.*_time
+        - cudnn-function:*.*/.*_time
+    micro-nonaggregation:
+      statistics: mean
+      categories: MICRO
+      metrics:
+        - nccl-bw:*.*/allreduce_.*_busbw
+        - rccl-bw:*.*/allreduce_.*_busbw
+        - sharding-matmul:*.*/.*_time
+        - matmul:*.*/.*_time
+        - gpu-burn/.*
+        - cpu-memory-bw-latency/.*_bw
+        - cpu-memory-bw-latency/.*_lat
+    ib-loopback:
+      statistics: mean
+      categories: RDMA
+      metrics:
+        #- ib-loopback/ib_write_8388608_ib.*_bw
+        - ib-loopback/ib_write_.*_ib.*_bw
+      aggregate: ib-loopback/ib_write_.*_ib(.*)_bw
+    disk:
+      statistics: mean
+      categories: DISK
+      aggregate: disk-benchmark/nvme(.*)_[seq|rand]_.*
+      metrics:
+        - disk-benchmark/nvme.*
+    # ib-loopback1:
+    #   statistics: mean
+    #   categories: RDMA1
+    #   metrics:
+    #     - ib-loopback/ib_write_8388608_ib[0|1|2|3]_bw:([0|1])
+    #     - ib-loopback/ib_write_8388608_ib[4|5|6|7]_bw:([2|3])
+    #   aggregate: ib-loopback/ib_write_.*_ib.*_bw:([0|1|2|3])
+    # ib-loopback2:
+    #   statistics: mean
+    #   categories: RDMA2
+    #   metrics:
+    #     - ib-loopback/ib_write_8388608_ib[4|5|6|7]_bw:([2|3])
+    #   aggregate: ib-loopback/ib_write_.*_ib[4|5|6|7]_bw:([2|3])
+    #   statistics: mean
+    #   categories: RCCL:4nodes
+    #   metrics:
+    #    - rccl-bw:4nodealltoall/alltoall_8589934592_busbw
+    #    - rccl-bw:4node/allreduce_17179869184_busbw
+
diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml
new file mode 100644
index 000000000..c3b299061
--- /dev/null
+++ b/superbench/analyzer/rules/diagnosis_rules.yaml
@@ -0,0 +1,106 @@
+# SuperBench rules
+version: v0.5
+superbench:
+  rules:
+  # Rule 0: If one test fails, label it as Not acceptable
+    falure_rule:
+      function: value
+      criteria: 'lambda x:x>0'
+      categories: FailedTest
+      metrics:
+        - kernel-launch:*.*/return_code
+        - mem-bw:*.*/return_code
+        - gemm-flops:*.*/return_code
+        - ib-loopback:*.*/return_code
+        - nccl-bw:*.*/return_code
+        - gpt_models/.*/return_code
+        - lstm_models/.*/return_code
+        - bert_models/.*/return_code
+        - resnet_models/.*/return_code
+        - vgg_models/.*/return_code
+        - densenet_models/.*/return_code
+        - model-benchmarks:.*/return_code
+  # Rule 1: If TensorCore test suffers > 5% downgrade, label it as Not acceptable
+    tensor_core_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: TensorCore 
+      metrics:
+        - gemm-flops:*.*/.*ops
+  # Rule 2: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as Not acceptable
+    mem_bw_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Mem
+      metrics:
+        - mem-bw:*.*/h2d_bw
+        - mem-bw:*.*/d2h_bw
+  # Rule 3: If ib-loopback test with 8M suffers > 5% downgrade, label it as Not acceptable
+    rdma_loopback_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: RDMA
+      metrics:
+        - ib-loopback:*.*/ib_write_8388608_.*_bw    
+  # Rule 4: If nccl-bw:default with 8GB suffers > 5% downgrade, label it as Not acceptable
+    nccl_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: NCCL
+      metrics:
+        - nccl-bw:default/allreduce_8589934592_busbw
+  # Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable
+    model_throughput_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Model
+      metrics:
+        - gpt_models/.*/.*_train_throughput
+        - lstm_models/pytorch-lstm/.*_train_throughput
+        - bert_models/pytorch-bert-.*/.*_train_throughput
+        - model-benchmarks:.*/pytorch-bert.*/.*_train_throughput
+        - model-benchmarks:.*/pytorch-gpt.*/.*_train_throughput
+        - model-benchmarks:.*/pytorch-lstm.*/.*_train_throughput
+  # Rule 8: If 2+ CNN models suffer > 5% downgrade, label it as Not acceptable
+    resnet_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - resnet_models/pytorch-resnet\d*/.*_train_throughput
+        - model-benchmarks:.*/pytorch-resnet.*/.*_train_throughput
+    vgg_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - vgg_models/pytorch-vgg\d*/.*_train_throughput
+        - model-benchmarks:.*/pytorch-vgg.*/.*_train_throughput
+    densenet_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - densenet_models/pytorch-densenet\d*/.*_train_throughput
+        - model-benchmarks:.*/pytorch-densenet.*/.*_train_throughput
+    cnn_throughput_rule:
+      categories: CNN
+      function: multi_rules
+      criteria: 'lambda label:True if label["resnet_throughput"]+label["vgg_throughput"]+label["densenet_throughput"]>=2 else False'
+  # Rule 10: If temperature of one GPU is > 85 °C, label it as Not acceptable
+    temperature_rule:
+      function: value
+      categories: TEMP 
+      criteria: 'lambda x:x>85'
+      metrics:
+        - monitor/gpu_temperature
+  # Rule 11: If DBE > 0, label it as Not acceptable (Not Support)
+    dbe_rule:
+      function: value
+      categories: TEMP 
+      criteria: 'lambda x:x>85'
+      metrics:   
+         - monitor/gpu_uncorrected_ecc

From d63e9620269045f4b830bbde844a6a8eb951d63b Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Mon, 18 Jul 2022 11:42:19 +0800
Subject: [PATCH 02/12] Fix fix_threshold rule mutation and format baseline output

---
 superbench/analyzer/generate_baseline.py | 51 ++++++++++++++----------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py
index 3a5b730f7..e920ca142 100644
--- a/superbench/analyzer/generate_baseline.py
+++ b/superbench/analyzer/generate_baseline.py
@@ -4,11 +4,11 @@
 """A module for baseline generation."""
 
 import argparse
+from copy import deepcopy
 import json
 from pathlib import Path
 import re
 
-
 from joblib import Parallel, delayed
 import pandas as pd
 
@@ -40,35 +40,36 @@ def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseli
         Step 3: Use the baseline and fix threshold for Outlier Detection
 
         Args:
-            data_series (pd.Series): data the the metric
+            data_series (pd.Series): data of the metric
             single_metric_with_baseline (dict): baseline of the single metric in 'metrics' in 2-layer dict format
             metric (str): the name of the metric to execute the algorithm
             rule_op (function): diagnosis rule op function
 
         Returns:
-            number: the baseline of the metric
+            tuple: the baseline of the metric, normal data of the metric
         """
         if single_metric_with_baseline['metrics'][metric] != -1:
             return single_metric_with_baseline['metrics'][metric]
-        single_metric_with_baseline['metrics'] = {}
+        tmp_single_metric_with_baseline = deepcopy(single_metric_with_baseline)
+        tmp_single_metric_with_baseline['metrics'] = {}
         clean = False
         while clean is False:
             clean = True
             baseline_val = data_series.mean()
             for val in data_series.index:
-                single_metric_with_baseline['metrics'][metric] = baseline_val
+                tmp_single_metric_with_baseline['metrics'][metric] = baseline_val
                 if baseline_val == 0:
                     break
                 data_row = pd.Series([data_series[val]], index=[metric])
                 details = []
                 categories = set()
                 summary_data_row = pd.Series(index=[metric], dtype=float)
-                violated_num = rule_op(data_row, single_metric_with_baseline, summary_data_row, details, categories)
+                violated_num = rule_op(data_row, tmp_single_metric_with_baseline, summary_data_row, details, categories)
                 if violated_num:
                     data_series = data_series.drop(val)
                     clean = False
-        baseline = single_metric_with_baseline['metrics'][metric]
-        return baseline
+        baseline = tmp_single_metric_with_baseline['metrics'][metric]
+        return baseline, data_series
 
     def get_aggregate_data(self, raw_data_file, summary_rule_file):
         """Aggregate raw data according to the summary rule file.
@@ -152,10 +153,11 @@ def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline):
                             aggregated_df[metric], single_metric_rule, metric, rule_op)
                         for metric in metrics)
                     for index, out in enumerate(outputs):
-                        baseline[metrics[index]] = out
+                        baseline[metrics[index]] = out[0]
+                        aggregated_df[metrics[index]] = out[1]
         return baseline
 
-    def run(self, raw_data_file,  summary_rule_file,  output_dir, algorithm='mean', diagnosis_rule_file=None, baseline_file=None):
+    def run(self, raw_data_file,  summary_rule_file,  output_dir, algorithm='mean', diagnosis_rule_file=None, baseline_file=None, digit=2):
         """Export baseline to json file.
 
         If diagnosis_rule_file is None, use mean of the data as baseline.
@@ -174,9 +176,18 @@ def run(self, raw_data_file,  summary_rule_file,  output_dir, algorithm='mean',
                 baseline = file_handler.read_baseline()
             # generate baseline accordint to rules in diagnosis and fix threshold outlier detection method
             baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline)
-            # output baseline to json file
+            for metric in baseline:
+                val = baseline[metric]
+                if isinstance(self._raw_data_df[metric].iloc[0], float):
+                    baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
+                elif isinstance(self._raw_data_df[metric].iloc[0], int):
+                    baseline[metric] = int(val)
+            print(self._raw_data_df["gpu-burn/gpu_0_pass"].iloc[0])
+            print(self._raw_data_df["gpu-burn/gpu_0_pass"].iloc[0].type)
+            baseline = json.dumps(baseline, indent=2, sort_keys=True)
+            baseline = re.sub(r': \"(\d+.?\d*)\"', r': \1', baseline)
             with open(output_dir+'/baseline.json', mode='w') as f:
-                json.dump(baseline, f, indent=2)
+                f.write(baseline)
 
         except Exception as e:
             logger.error('Analyzer: generate baseline failed, msg: {}'.format(str(e)))
@@ -186,20 +197,20 @@ def run(self, raw_data_file,  summary_rule_file,  output_dir, algorithm='mean',
     global args
     parser = argparse.ArgumentParser()
     parser.add_argument(
-            '--algo', type=str, default='mean', required=False, help='Algorithm to generate baseline, eg, mean/fix_threshold.'
-        )
+        '--algo', type=str, default='mean', required=False, help='Algorithm to generate baseline, eg, mean/fix_threshold.'
+    )
     parser.add_argument(
-            '--input_dir', type=str, default=None, required=False, help='Input directory which stores the results-summary.jsonl.'
-        )
+        '--input_dir', type=str, default=None, required=False, help='Input directory which stores the results-summary.jsonl.'
+    )
     args = parser.parse_args()
-    folder=args.input_dir
+    folder = args.input_dir
     #folder = '/Users/jiangyt/Documents/000-workspace/raw-data/ndv4/0.5-ndv41'
-    if args.algo=='mean':
+    if args.algo == 'mean':
         # simply use mean, need result_summary rules to define how to aggregate the metrics.
         print('Generate baseine using mean of the data.')
         GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml', folder)
-    elif args.algo=='fix_threshold':
+    elif args.algo == 'fix_threshold':
         # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics.
         print('Generate baseine using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.')
         GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml',
-                            folder, 'fix_threshold', 'rules/diagnosis_rules.yaml')
+                               folder, 'fix_threshold', 'rules/diagnosis_rules.yaml')

From 8b9de38dc4dafdb5cf26327243dc18182d29b12a Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Mon, 18 Jul 2022 15:23:29 +0800
Subject: [PATCH 03/12] Update aggregation and diagnosis rules

---
 .../analyzer/rules/aggregation_rules.yaml     | 65 ++++++-----------
 .../analyzer/rules/diagnosis_rules.yaml       | 71 ++++++++++++++-----
 2 files changed, 73 insertions(+), 63 deletions(-)

diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml
index c5cb1708d..6cf1e8df0 100644
--- a/superbench/analyzer/rules/aggregation_rules.yaml
+++ b/superbench/analyzer/rules/aggregation_rules.yaml
@@ -7,18 +7,7 @@ superbench:
         - mean
       categories: model:FP32
       metrics:
-        - model-benchmarks:vgg:float/.*/fp32_train_throughput
-        - model-benchmarks:gpt2-large:float/.*/fp32_train_throughput
-        - model-benchmarks:bert-base:float/.*/fp32_train_throughput
-        - model-benchmarks:bert-large:float/.*/fp32_train_throughput
-        - model-benchmarks:lstm:float/.*/fp32_train_throughput
-        - model-benchmarks:resnet50:float/.*/fp32_train_throughput
-        - model-benchmarks:resnet101:float/.*/fp32_train_throughput
-        - model-benchmarks:resnet152:float/.*/fp32_train_throughput
-        - model-benchmarks:densenet169:float/.*/fp32_train_throughput
-        - model-benchmarks:densenet201:float/.*/fp32_train_throughput
-        - model-benchmarks:LongRun_BERTL_models/pytorch-bert-large/fp32_train_throughput
-        - model-benchmarks:vgg:float/.*/fp32_train_throughput
+        - model-benchmarks:.*/.*/fp32_train_throughput
         - gpt_models/.*/fp32_train_throughput
         - lstm_models/pytorch-lstm/fp32_train_throughput
         - bert_models/pytorch-bert-.*/fp32_train_throughput
@@ -29,17 +18,7 @@ superbench:
       statistics: mean
       categories: model:FP16
       metrics:
-        - model-benchmarks:gpt2-large:half/.*/fp16_train_throughput
-        - model-benchmarks:bert-base:half/.*/fp16_train_throughput
-        - model-benchmarks:bert-large:half/.*/fp16_train_throughput
-        - model-benchmarks:lstm:half/.*/fp16_train_throughput
-        - model-benchmarks:resnet50:half/.*/fp16_train_throughput
-        - model-benchmarks:resnet101:half/.*/fp16_train_throughput
-        - model-benchmarks:resnet152:half/.*/fp16_train_throughput
-        - model-benchmarks:densenet169:half/.*/fp16_train_throughput
-        - model-benchmarks:densenet201:half/.*/fp16_train_throughput
-        - model-benchmarks:LongRun_BERTL_models/pytorch-bert-large/fp16_train_throughput
-        - model-benchmarks:vgg:half/.*/fp16_train_throughput
+        - model-benchmarks:.*/.*/fp16_train_throughput
         - gpt_models/.*/fp16_train_throughput
         - lstm_models/pytorch-lstm/fp16_train_throughput
         - bert_models/pytorch-bert-.*/fp16_train_throughput
@@ -57,6 +36,8 @@ superbench:
         - computation-communication-overlap:*.*/.*_time
         - cublas-function:*.*/.*_time
         - cudnn-function:*.*/.*_time
+        - ort-inference/.*_time.*
+        - tensorrt-inference/.*_time.*
     micro-nonaggregation:
       statistics: mean
       categories: MICRO
@@ -68,35 +49,29 @@ superbench:
         - gpu-burn/.*
         - cpu-memory-bw-latency/.*_bw
         - cpu-memory-bw-latency/.*_lat
-    ib-loopback:
+    gpu-copy-bw:
+      statistics: mean
+      categories: RDMA
+      metrics:
+        - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+      aggregate: gpu-copy-bw:perf/.*gpu(.*)_to_gpu(.*)_write_by_.*_bw
+    ib-loopback1:
       statistics: mean
       categories: RDMA
       metrics:
         #- ib-loopback/ib_write_8388608_ib.*_bw
         - ib-loopback/ib_write_.*_ib.*_bw
       aggregate: ib-loopback/ib_write_.*_ib(.*)_bw
+    ib-loopback2:
+      statistics: mean
+      categories: RDMA
+      metrics:
+        #- ib-loopback/ib_write_8388608_ib.*_bw
+        - ib-loopback/ib_write_bw_.*:\d+
+      aggregate: True
     disk:
       statistics: mean
       categories: DISK
-      aggregate: disk-benchmark/nvme(.*)_[seq|rand]_.*
+      aggregate: disk-benchmark/nvme(\d+n1)_.*_iops
       metrics:
-        - disk-benchmark/nvme.*
-    # ib-loopback1:
-    #   statistics: mean
-    #   categories: RDMA1
-    #   metrics:
-    #     - ib-loopback/ib_write_8388608_ib[0|1|2|3]_bw:([0|1])
-    #     - ib-loopback/ib_write_8388608_ib[4|5|6|7]_bw:([2|3])
-    #   aggregate: ib-loopback/ib_write_.*_ib.*_bw:([0|1|2|3])
-    # ib-loopback2:
-    #   statistics: mean
-    #   categories: RDMA2
-    #   metrics:
-    #     - ib-loopback/ib_write_8388608_ib[4|5|6|7]_bw:([2|3])
-    #   aggregate: ib-loopback/ib_write_.*_ib[4|5|6|7]_bw:([2|3])
-    #   statistics: mean
-    #   categories: RCCL:4nodes
-    #   metrics:
-    #    - rccl-bw:4nodealltoall/alltoall_8589934592_busbw
-    #    - rccl-bw:4node/allreduce_17179869184_busbw
-
+        - disk-benchmark/nvme(\d+n1)_.*_iops
diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml
index c3b299061..1d58080b8 100644
--- a/superbench/analyzer/rules/diagnosis_rules.yaml
+++ b/superbench/analyzer/rules/diagnosis_rules.yaml
@@ -20,13 +20,40 @@ superbench:
         - vgg_models/.*/return_code
         - densenet_models/.*/return_code
         - model-benchmarks:.*/return_code
+    kernel_launch_rule:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: INVESTIGATING
+      metrics:
+        - kernel-launch:*.*/.*_time
   # Rule 1: If TensorCore test suffers > 5% downgrade, label it as Not acceptable
     tensor_core_rule:
       function: variance
       criteria: 'lambda x:x<-0.05'
-      categories: TensorCore 
+      categories: TensorCore
       metrics:
         - gemm-flops:*.*/.*ops
+    cudnn_cublas:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: TensorCore
+      metrics:
+        - cublas-function:*.*/.*_time
+        - cudnn-function:*.*/.*_time
+    matmul:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: TensorCore
+      metrics:
+        - matmul:*.*/.*_time
+        - gpu-burn/.*
+        - cpu-memory-bw-latency/.*_lat
+    cpu:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: TensorCore
+      metrics:
+        - cpu-memory-bw-latency/.*_bw
   # Rule 2: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as Not acceptable
     mem_bw_rule:
       function: variance
@@ -35,21 +62,42 @@ superbench:
       metrics:
         - mem-bw:*.*/h2d_bw
         - mem-bw:*.*/d2h_bw
+    gpu-copy:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Mem
+      metrics:
+        - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
   # Rule 3: If ib-loopback test with 8M suffers > 5% downgrade, label it as Not acceptable
     rdma_loopback_rule:
       function: variance
       criteria: 'lambda x:x<-0.05'
       categories: RDMA
       metrics:
-        - ib-loopback:*.*/ib_write_8388608_.*_bw    
+        - ib-loopback:*.*/ib_write_bw_8388608
+        #- ib-loopback:*.*/ib_write_bw_.*
   # Rule 4: If nccl-bw:default with 8GB suffers > 5% downgrade, label it as Not acceptable
     nccl_rule:
       function: variance
       criteria: 'lambda x:x<-0.05'
       categories: NCCL
       metrics:
-        - nccl-bw:default/allreduce_8589934592_busbw
-  # Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable
+        - nccl-bw:.*/allreduce_8589934592_busbw
+    investigating_rules:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: INVESTIGATING
+      metrics:
+        - computation-communication-overlap:*.*/.*_time
+        - sharding-matmul:*.*/.*_time
+        - ort-inference/.*_time.*
+    disk_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: NCCL
+      metrics:
+        - disk-benchmark/nvme(\d+n1)_.*_iops
+  #Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable
     model_throughput_rule:
       function: variance
       criteria: 'lambda x:x<-0.05'
@@ -90,17 +138,4 @@ superbench:
       categories: CNN
       function: multi_rules
       criteria: 'lambda label:True if label["resnet_throughput"]+label["vgg_throughput"]+label["densenet_throughput"]>=2 else False'
-  # Rule 10: If temperature of one GPU is > 85 °C, label it as Not acceptable
-    temperature_rule:
-      function: value
-      categories: TEMP 
-      criteria: 'lambda x:x>85'
-      metrics:
-        - monitor/gpu_temperature
-  # Rule 11: If DBE > 0, label it as Not acceptable (Not Support)
-    dbe_rule:
-      function: value
-      categories: TEMP 
-      criteria: 'lambda x:x>85'
-      metrics:   
-         - monitor/gpu_uncorrected_ecc
+

From 5eb9219e20b2a62ec4e3c381d324bb758ba98bbb Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 13 Apr 2023 02:43:16 +0000
Subject: [PATCH 04/12] add statistic and revise

---
 superbench/analyzer/generate_baseline.py  |  62 ++++++--
 superbench/analyzer/generate_statistic.py | 186 ++++++++++++++++++++++
 2 files changed, 232 insertions(+), 16 deletions(-)
 create mode 100644 superbench/analyzer/generate_statistic.py

diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py
index e920ca142..bdab5851b 100644
--- a/superbench/analyzer/generate_baseline.py
+++ b/superbench/analyzer/generate_baseline.py
@@ -6,7 +6,6 @@
 import argparse
 from copy import deepcopy
 import json
-from pathlib import Path
 import re
 
 from joblib import Parallel, delayed
@@ -29,6 +28,7 @@ class BaselineAlgoType(Enum):
 
 
 class GenerateBaseline(DataDiagnosis):
+    """The class to generate baseline for raw data."""
     def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseline, metric, rule_op):
         """Fix threshold outlier detection algorithm.
 
@@ -72,7 +72,7 @@ def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseli
         return baseline, data_series
 
     def get_aggregate_data(self, raw_data_file, summary_rule_file):
-        """Aggregate raw data according to the summary rule file.
+        r"""Aggregate raw data according to the summary rule file.
 
         If the metric is aggregated by rank (:\d+), remove the rank info to generate the metric name and aggregate data
         If the metric is aggregated by pattern in regex, aggregate the data and copy to all metrics which match this pattern
@@ -149,15 +149,24 @@ def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline):
                     function_name = self._sb_rules[rule]['function']
                     rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name))
                     outputs = Parallel(n_jobs=-1)(
-                        delayed(self.fix_threshold_outlier_detection)(
-                            aggregated_df[metric], single_metric_rule, metric, rule_op)
-                        for metric in metrics)
+                        delayed(self.fix_threshold_outlier_detection)
+                        (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics
+                    )
                     for index, out in enumerate(outputs):
                         baseline[metrics[index]] = out[0]
                         aggregated_df[metrics[index]] = out[1]
         return baseline
 
-    def run(self, raw_data_file,  summary_rule_file,  output_dir, algorithm='mean', diagnosis_rule_file=None, baseline_file=None, digit=2):
+    def run(
+        self,
+        raw_data_file,
+        summary_rule_file,
+        output_dir,
+        algorithm='mean',
+        diagnosis_rule_file=None,
+        baseline_file=None,
+        digit=2
+    ):
         """Export baseline to json file.
 
         If diagnosis_rule_file is None, use mean of the data as baseline.
@@ -165,7 +174,12 @@ def run(self, raw_data_file,  summary_rule_file,  output_dir, algorithm='mean',
 
         Args:
             raw_data_df (DataFrame): raw data
-            output_dir (str): the directory of output file
+            summary_rule_file (str): the file name of the summary rule file
+            output_dir (str): the directory to save the baseline file
+            algorithm (str): the algorithm to generate the baseline
+            diagnosis_rule_file (str): the file name of the diagnosis rules which used in fix_threshold algorithm
+            baseline_file (str): the file name of the baseline file
+            digit (int): the number of digits after the decimal point
         """
         try:
             # aggregate results from different devices
@@ -182,11 +196,14 @@ def run(self, raw_data_file,  summary_rule_file,  output_dir, algorithm='mean',
                     baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
                 elif isinstance(self._raw_data_df[metric].iloc[0], int):
                     baseline[metric] = int(val)
-            print(self._raw_data_df["gpu-burn/gpu_0_pass"].iloc[0])
-            print(self._raw_data_df["gpu-burn/gpu_0_pass"].iloc[0].type)
+                else:
+                    try:
+                        baseline[metric] = float(val)
+                    except Exception as e:
+                        logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e)))
             baseline = json.dumps(baseline, indent=2, sort_keys=True)
             baseline = re.sub(r': \"(\d+.?\d*)\"', r': \1', baseline)
-            with open(output_dir+'/baseline.json', mode='w') as f:
+            with open(output_dir + '/baseline.json', mode='w') as f:
                 f.write(baseline)
 
         except Exception as e:
@@ -197,20 +214,33 @@ def run(self, raw_data_file,  summary_rule_file,  output_dir, algorithm='mean',
     global args
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        '--algo', type=str, default='mean', required=False, help='Algorithm to generate baseline, eg, mean/fix_threshold.'
+        '--algo',
+        type=str,
+        default='fix_threshold',
+        required=False,
+        help='Algorithm to generate baseline, eg, mean/fix_threshold.'
+    )
+    parser.add_argument(
+        '--input_dir',
+        type=str,
+        default=None,
+        required=False,
+        help='Input directory which stores the results-summary.jsonl.'
+    )
+    parser.add_argument(
+        '--diagnosis_rule_file', type=str, default=None, required=False, help='The input path of diagnosis rule file.'
     )
     parser.add_argument(
-        '--input_dir', type=str, default=None, required=False, help='Input directory which stores the results-summary.jsonl.'
+        '--summary_rule_file', type=str, default=None, required=False, help='The input path of summary rule file.'
     )
     args = parser.parse_args()
     folder = args.input_dir
-    #folder = '/Users/jiangyt/Documents/000-workspace/raw-data/ndv4/0.5-ndv41'
     if args.algo == 'mean':
         # simply use mean, need result_summary rules to define how to aggregate the metrics.
         print('Generate baseine using mean of the data.')
-        GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml', folder)
     elif args.algo == 'fix_threshold':
         # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics.
         print('Generate baseine using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.')
-        GenerateBaseline().run(folder+'/results-summary.jsonl', 'rules/aggregation_rules.yaml',
-                               folder, 'fix_threshold', 'rules/diagnosis_rules.yaml')
+        GenerateBaseline().run(
+            folder + '/results-summary.jsonl', args.summary_rule_file, folder, 'fix_threshold', args.diagnosis_rule_file
+        )
diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py
new file mode 100644
index 000000000..51bf2d75a
--- /dev/null
+++ b/superbench/analyzer/generate_statistic.py
@@ -0,0 +1,186 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""A module for statistics generation."""
+
+import argparse
+import os
+
+from joblib import Parallel, delayed
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from superbench.common.utils import logger
+from superbench.analyzer import file_handler
+from superbench.analyzer import data_analysis
+from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
+from generate_baseline import GenerateBaseline
+
+
+def plot_steps(data, title=None, save_path=None, show=True):
+    """Plot steps.
+
+    Args:
+        data (list): data to plot
+        title (str): title of the plot
+        save_path (str): path to save the plot
+        show (bool): whether to show the plot
+    """
+    plt.figure(figsize=(10, 6))
+    plt.scatter(range(0, len(data)), data)
+    if title:
+        plt.title(title)
+    plt.xlabel('Devices')
+    plt.ylabel('Value')
+    plt.ylim(0, max(data) * 1.1)
+    if save_path is not None:
+        plt.savefig(save_path)
+    if show:
+        plt.show()
+    plt.close()
+
+
+class GenerateStatistics(GenerateBaseline):
+    """GenerateStatistics class to generate statistics for raw data."""
+    def calculate_statistics(self, healthy_df):
+        """Calculate statistics for healthy data.
+
+        Args:
+            healthy_df (DataFrame): healthy data
+
+        Returns:
+            DataFrame: statistics for healthy data
+        """
+        stat_df = data_analysis.statistic(healthy_df)
+        stat_df.loc['(max-min)/max'] = (stat_df.loc['max'] - stat_df.loc['min']) / stat_df.loc['max']
+        stat_df = stat_df.drop(index='1%')
+        stat_df = stat_df.drop(index='5%')
+        stat_df = stat_df.drop(index='95%')
+        stat_df = stat_df.drop(index='99%')
+        return stat_df
+
+    def output_excel(self, excel_file, stat_df, digit=2):
+        """Output excel file.
+
+        Args:
+            excel_file (str): excel file path
+            stat_df (DataFrame): statistics data
+            digit (int): digit to round
+        """
+        try:
+            writer = pd.ExcelWriter(excel_file, engine='xlsxwriter')
+
+            for benchmark in self._benchmark_metrics_dict:
+                benchmark_df = stat_df[self._benchmark_metrics_dict[benchmark]]
+                sheet_name = benchmark if len(benchmark) <= 30 else benchmark.split('-')[-1]
+                benchmark_df.to_excel(writer, sheet_name=sheet_name)
+                worksheet = writer.sheets[sheet_name]
+                row_start = 1
+                row_end = max(row_start, len(self._benchmark_metrics_dict[benchmark]))
+                columns = list(benchmark_df.index)
+                col_index = columns.index('(max-min)/max') + 1
+                workbook = writer.book
+                percent_format = workbook.add_format({'num_format': '0.00%'})
+                worksheet.conditional_format(
+                    col_index,
+                    row_start,
+                    col_index,
+                    row_end,    # start_row, start_col, end_row, end_col
+                    {
+                        'type': 'no_blanks',
+                        'format': percent_format
+                    }
+                )
+                num_format = f'0.{digit * "0"}'
+                for col_index in range(2, len(columns)):
+                    round_format = workbook.add_format({'num_format': num_format})
+                    worksheet.conditional_format(
+                        col_index,
+                        row_start,
+                        col_index,
+                        row_end,    # start_row, start_col, end_row, end_col
+                        {
+                            'type': 'no_blanks',
+                            'format': round_format
+                        }
+                    )
+            writer.close()
+        except Exception as e:
+            logger.error('output excel failed: {}'.format(str(e)))
+
+    def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_file=None, digit=2, plot=False):
+        """Run the statistics generation.
+
+        Args:
+            raw_data_file (str): raw data file path
+            output_dir (str): output directory
+            diagnosis_rule_file (str): diagnosis rule file path
+            summary_rule_file (str): summary rule file path
+            digit (int): digit to round
+            plot (bool): whether to plot the data
+        """
+        try:
+            # aggregate results from different devices
+            self._raw_data_df = self.get_aggregate_data(raw_data_file, summary_rule_file)
+            # re-organize metrics by benchmark names
+            self._benchmark_metrics_dict = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
+            # start from an empty baseline (no baseline file is read here)
+            baseline = {}
+            # read diagnosis rules
+            aggregated_df = self._raw_data_df.copy()
+            rules = file_handler.read_rules(diagnosis_rule_file)
+            if not self._parse_rules_and_baseline(rules, baseline):
+                logger.error('parse rule failed')
+                return None
+            else:
+                for rule in self._sb_rules:
+                    single_metric_rule = self._sb_rules[rule]
+                    metrics = list(single_metric_rule['metrics'].keys())
+                    function_name = self._sb_rules[rule]['function']
+                    rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name))
+                    outputs = Parallel(n_jobs=-1)(
+                        delayed(self.fix_threshold_outlier_detection)
+                        (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics
+                    )
+                    for index, out in enumerate(outputs):
+                        aggregated_df[metrics[index]] = out[1]
+                        if plot:
+                            plot_steps(
+                                out[1].tolist(),
+                                title=metrics[index],
+                                save_path=os.path.join(
+                                    output_dir, 'figures', metrics[index].replace('/', '_').replace(':', '_') + '.png'
+                                ),
+                                show=False
+                            )
+            stat_df = self.calculate_statistics(aggregated_df)
+            excel_file = os.path.join(output_dir, 'benchmark_stability_stat.xlsx')
+            self.output_excel(excel_file, stat_df, digit)
+
+        except Exception as e:
+            logger.error('Analyzer: generate statisitics failed, msg: {}'.format(str(e)))
+
+
+if __name__ == '__main__':
+    global args
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '--input_dir',
+        type=str,
+        default=None,
+        required=False,
+        help='Input directory which stores the results-summary.jsonl.'
+    )
+    parser.add_argument(
+        '--diagnosis_rule_file', type=str, default=None, required=False, help='The input path of diagnosis rule file.'
+    )
+    parser.add_argument(
+        '--summary_rule_file', type=str, default=None, required=False, help='The input path of summary rule file.'
+    )
+    args = parser.parse_args()
+
+    # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics.
+    GenerateStatistics().run(
+        args.input_dir + '/results-summary.jsonl', args.input_dir, args.diagnosis_rule_file, args.summary_rule_file
+    )

From c589e8d3c06aee860ccd7e92369357b86e543ce2 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 13 Apr 2023 02:51:21 +0000
Subject: [PATCH 05/12] add example rules

---
 superbench/analyzer/rules/analysis_rules.yaml | 100 +++++++++++++
 .../analyzer/rules/diagnosis_rules.yaml       | 131 ++++++++++++------
 2 files changed, 192 insertions(+), 39 deletions(-)
 create mode 100644 superbench/analyzer/rules/analysis_rules.yaml

diff --git a/superbench/analyzer/rules/analysis_rules.yaml b/superbench/analyzer/rules/analysis_rules.yaml
new file mode 100644
index 000000000..f71d76b80
--- /dev/null
+++ b/superbench/analyzer/rules/analysis_rules.yaml
@@ -0,0 +1,100 @@
+# SuperBench rules
+version: v0.5
+superbench:
+  rules:
+    model-benchmarks:
+      statistics:
+        - mean
+      categories: models
+      metrics:
+        - model-benchmarks:.*/.*/.*_train_throughput
+        - gpt_models/.*/.*_train_throughput
+        - lstm_models/pytorch-lstm/.*_train_throughput
+        - bert_models/pytorch-bert-.*/.*_train_throughput
+        - resnet_models/pytorch-resnet\d*/.*_train_throughput
+        - vgg_models/pytorch-vgg\d*/.*_train_throughput
+        - densenet_models/.*/.*_train_throughput
+    micro-flops:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+        - cublaslt-gemm/fp8.*_0_8192_8192_8192_flops
+        - gemm-flops:*.*/.*op
+    micro-cublasltflops:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+        - cublaslt-gemm/fp.*_.*_flops
+    micro-cublasltbatchflops:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+        - cublaslt-gemm:bmm/fp.*_.*_flops
+    micro-aggregation-with-aggregate:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+        - kernel-launch:*.*/.*_time
+        - dist-inference/.*step_times.*
+        - mem-bw:*.*/.*_bw
+        - computation-communication-overlap:*.*/.*_time
+    micro-aggregation-wo-aggregate:
+      statistics: mean
+      categories: MICRO
+      aggregate: False
+      metrics:
+        - nccl-bw:*.*/allreduce_.*_busbw
+        - sharding-matmul:*.*/.*_time
+        - matmul:*.*/.*_time
+        - gpu-burn:*.*/.*
+        - cpu-memory-bw-latency/.*_bw
+        - cpu-memory-bw-latency/.*_lat
+        #- cublas-function:*.*/.*_time
+        #- cudnn-function:*.*/.*_time
+        #- ort-inference/.*_time.*
+        #- tensorrt-inference/.*_time.*
+    gpu-copy-bw:
+      statistics: mean
+      categories: DTOH
+      metrics:
+        - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        #- gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/gpu(.*)_to_gpu(.*)_by_.*_bw
+    # gpu-copy-bw1:
+    #   statistics: mean
+    #   categories: DTOH
+    #   metrics:
+    #     #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+    #     - gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+    #   aggregate: gpu-copy-bw:perf/gpu(.*)_to_cpu_by_sm_under_numa.*_bw
+    # gpu-copy-bw2:
+    #   statistics: mean
+    #   categories: HTOD
+    #   metrics:
+    #     #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+    #     - gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+    #   aggregate: gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+    # # ib-loopback1:
+    # #   statistics: mean
+    # #   categories: RDMA
+    # #   metrics:
+    # #     #- ib-loopback/ib_write_8388608_ib.*_bw
+    # #     - ib-loopback/ib_write_.*_ib.*_bw
+    # #   aggregate: ib-loopback/ib_write_.*_ib(.*)_bw
+    # ib-loopback2:
+    #   statistics: mean
+    #   categories: RDMA
+    #   metrics:
+    #     #- ib-loopback/ib_write_8388608_ib.*_bw
+    #     - ib-loopback/ib_write_bw_.*:\d+
+    #   #aggregate: True
+    # disk:
+    #   statistics: mean
+    #   categories: DISK
+    #   aggregate: disk-benchmark/nvme(\d+n1)_.*_iops
+    #   metrics:
+    #     - disk-benchmark/nvme(\d+n1)_.*_iops
diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml
index 1d58080b8..a7074087a 100644
--- a/superbench/analyzer/rules/diagnosis_rules.yaml
+++ b/superbench/analyzer/rules/diagnosis_rules.yaml
@@ -19,7 +19,30 @@ superbench:
         - resnet_models/.*/return_code
         - vgg_models/.*/return_code
         - densenet_models/.*/return_code
-        - model-benchmarks:.*/return_code
+        - model-benchmarks:.*/return_code:.*
+        #- cudnn-function:*.*/return_code                        # 06:14
+        - cublaslt-gemm:*.*/return_code
+        - cublas-function:*.*/return_code                       # 05:52
+        - matmul:*.*/return_code                                # 00:25
+        - gpu-burn:*.*/return_code                              # 15:08
+        # microbenchmark - communication
+        - cpu-memory-bw-latency:*.*/return_code                 # 05:38
+        - gpu-copy-bw:*.*/return_code                     # 08:44
+        - computation-communication-overlap:*.*/return_code     # 06:30
+        - sharding-matmul:*.*/return_code                       # 00:24
+        # microbenchmark - storage
+        #- disk-benchmark                       # 18:47
+        # model benchmark - inference
+        # - ort-inference:*.*/return_code                         # 03:43
+        ##tensorrt-inference                    # 02:03:33
+        - dist-inference:*.*/return_code
+        - cublaslt-gemm:*.*/return_code
+    kernel_launch_rule_outlier:
+      function: value
+      criteria: 'lambda x:x<0.001'
+      categories: INVESTIGATING
+      metrics:
+        - kernel-launch:*.*/.*_time
     kernel_launch_rule:
       function: variance
       criteria: 'lambda x:x>0.05'
@@ -33,25 +56,47 @@ superbench:
       categories: TensorCore
       metrics:
         - gemm-flops:*.*/.*ops
-    cudnn_cublas:
+    cublaslt_gemm_rule:
       function: variance
-      criteria: 'lambda x:x>0.05'
-      categories: TensorCore
-      metrics:
-        - cublas-function:*.*/.*_time
-        - cudnn-function:*.*/.*_time
+      criteria: 'lambda x:x<-0.05'
+      categories: CUBLASLT
+      metrics:
+        - cublaslt-gemm:*.*/.*ops
+    # cublas:
+    #   function: variance
+    #   criteria: 'lambda x:x>0.05'
+    #   categories: CUBLAS
+    #   metrics:
+    #     - cublas-function:*.*/.*_time
+    # cudnn:
+    #   function: variance
+    #   criteria: 'lambda x:x>0.05'
+    #   categories: CUDNN
+    #   metrics:
+    #     - cudnn-function:*.*/.*_time
     matmul:
       function: variance
       criteria: 'lambda x:x>0.05'
-      categories: TensorCore
+      categories: MATMUL
       metrics:
         - matmul:*.*/.*_time
-        - gpu-burn/.*
+        - sharding-matmul:*.*/.*_time
+    cpu1:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: CPU
+      metrics:
         - cpu-memory-bw-latency/.*_lat
+    gpu_burn:
+      function: value
+      criteria: 'lambda x:x!=1'
+      categories: GPUBURN
+      metrics:
+        - gpu-burn:*.*/.*_pass
     cpu:
       function: variance
       criteria: 'lambda x:x<-0.05'
-      categories: TensorCore
+      categories: CPU
       metrics:
         - cpu-memory-bw-latency/.*_bw
   # Rule 2: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as Not acceptable
@@ -65,16 +110,16 @@ superbench:
     gpu-copy:
       function: variance
       criteria: 'lambda x:x<-0.05'
-      categories: Mem
+      categories: GPUCOPY
       metrics:
         - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
   # Rule 3: If ib-loopback test with 8M suffers > 5% downgrade, label it as Not acceptable
-    rdma_loopback_rule:
-      function: variance
-      criteria: 'lambda x:x<-0.05'
-      categories: RDMA
-      metrics:
-        - ib-loopback:*.*/ib_write_bw_8388608
+    # rdma_loopback_rule:
+    #   function: variance
+    #   criteria: 'lambda x:x<-0.05'
+    #   categories: RDMA
+    #   metrics:
+    #     - ib-loopback:*.*/ib_write_bw_8388608
         #- ib-loopback:*.*/ib_write_bw_.*
   # Rule 4: If nccl-bw:default with 8GB suffers > 5% downgrade, label it as Not acceptable
     nccl_rule:
@@ -82,21 +127,25 @@ superbench:
       criteria: 'lambda x:x<-0.05'
       categories: NCCL
       metrics:
-        - nccl-bw:.*/allreduce_8589934592_busbw
-    investigating_rules:
+        - nccl-bw:*.*/allreduce_.*_busbw
+    overlap_rules:
       function: variance
       criteria: 'lambda x:x>0.05'
-      categories: INVESTIGATING
+      categories: OVERLAP
       metrics:
         - computation-communication-overlap:*.*/.*_time
-        - sharding-matmul:*.*/.*_time
-        - ort-inference/.*_time.*
-    disk_rule:
+    investigating_rules:
       function: variance
-      criteria: 'lambda x:x<-0.05'
-      categories: NCCL
-      metrics:
-        - disk-benchmark/nvme(\d+n1)_.*_iops
+      criteria: 'lambda x:x>0.05'
+      categories: DIST_INFERENCE
+      metrics:
+        - dist-inference/.*step_times
+    # disk_rule:
+    #   function: variance
+    #   criteria: 'lambda x:x<-0.05'
+    #   categories: DISK
+    #   metrics:
+    #     - disk-benchmark/nvme(\d+n1)_.*_iops
   #Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable
     model_throughput_rule:
       function: variance
@@ -104,11 +153,11 @@ superbench:
       categories: Model
       metrics:
         - gpt_models/.*/.*_train_throughput
-        - lstm_models/pytorch-lstm/.*_train_throughput
-        - bert_models/pytorch-bert-.*/.*_train_throughput
-        - model-benchmarks:.*/pytorch-bert.*/.*_train_throughput
-        - model-benchmarks:.*/pytorch-gpt.*/.*_train_throughput
-        - model-benchmarks:.*/pytorch-lstm.*/.*_train_throughput
+        - lstm_models/.*lstm/.*_train_throughput
+        - bert_models/.*bert-.*/.*_train_throughput
+        - model-benchmarks:.*/.*bert.*/.*_train_throughput
+        - model-benchmarks:.*/.*gpt.*/.*_train_throughput
+        - model-benchmarks:.*/.*lstm.*/.*_train_throughput
   # Rule 8: If 2+ CNN models suffer > 5% downgrade, label it as Not acceptable
     resnet_throughput:
       function: variance
@@ -116,26 +165,30 @@ superbench:
       store: true
       categories: CNN
       metrics:
-        - resnet_models/pytorch-resnet\d*/.*_train_throughput
-        - model-benchmarks:.*/pytorch-resnet.*/.*_train_throughput
+        - resnet_models/.*resnet\d*/.*_train_throughput
+        - model-benchmarks:.*/.*resnet.*/.*_train_throughput
     vgg_throughput:
       function: variance
       criteria: 'lambda x:x<-0.05'
       store: true
       categories: CNN
       metrics:
-        - vgg_models/pytorch-vgg\d*/.*_train_throughput
-        - model-benchmarks:.*/pytorch-vgg.*/.*_train_throughput
+        - vgg_models/.*vgg\d*/.*_train_throughput
+        - model-benchmarks:.*/.*vgg.*/.*_train_throughput
     densenet_throughput:
       function: variance
       criteria: 'lambda x:x<-0.05'
       store: true
       categories: CNN
       metrics:
-        - densenet_models/pytorch-densenet\d*/.*_train_throughput
-        - model-benchmarks:.*/pytorch-densenet.*/.*_train_throughput
+        - densenet_models/.*densenet\d*/.*_train_throughput
+        - model-benchmarks:.*/.*densenet.*/.*_train_throughput
     cnn_throughput_rule:
       categories: CNN
       function: multi_rules
-      criteria: 'lambda label:True if label["resnet_throughput"]+label["vgg_throughput"]+label["densenet_throughput"]>=2 else False'
+      criteria: 'lambda label:True if label["resnet_throughput"]+label["densenet_throughput"]>=2 else False'
+    vgg_throughput_rule:
+      categories: VGG
+      function: multi_rules
+      criteria: 'lambda label:True if label["vgg_throughput"]>=2 else False'
 

From 19d7beec155006781eb916416ba9405bf8d3d6d8 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 13 Apr 2023 05:05:05 +0000
Subject: [PATCH 06/12] fix bug

---
 superbench/analyzer/generate_baseline.py       |  3 ++-
 superbench/analyzer/generate_statistic.py      | 17 ++++++++++++++---
 superbench/analyzer/rules/analysis_rules.yaml  | 14 +++++++-------
 superbench/analyzer/rules/diagnosis_rules.yaml | 12 ++++++------
 4 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py
index bdab5851b..7b2de74c2 100644
--- a/superbench/analyzer/generate_baseline.py
+++ b/superbench/analyzer/generate_baseline.py
@@ -48,7 +48,8 @@ def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseli
         Returns:
             tuple: the baseline of the metric, normal data of the metric
         """
-        if single_metric_with_baseline['metrics'][metric] != -1:
+        if single_metric_with_baseline['metrics'][metric] != None and single_metric_with_baseline['metrics'][metric
+                                                                                                             ] != -1:
             return single_metric_with_baseline['metrics'][metric]
         tmp_single_metric_with_baseline = deepcopy(single_metric_with_baseline)
         tmp_single_metric_with_baseline['metrics'] = {}
diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py
index 51bf2d75a..13cd146f9 100644
--- a/superbench/analyzer/generate_statistic.py
+++ b/superbench/analyzer/generate_statistic.py
@@ -143,6 +143,9 @@ def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_
                         (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics
                     )
                     for index, out in enumerate(outputs):
+                        if not out:
+                            logger.error('Analyzer: filter healthy nodese failed')
+                            return
                         aggregated_df[metrics[index]] = out[1]
                         if plot:
                             plot_steps(
@@ -168,15 +171,23 @@ def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_
     parser.add_argument(
         '--input_dir',
         type=str,
-        default=None,
+        default='rawdata/',
         required=False,
         help='Input directory which stores the results-summary.jsonl.'
     )
     parser.add_argument(
-        '--diagnosis_rule_file', type=str, default=None, required=False, help='The input path of diagnosis rule file.'
+        '--diagnosis_rule_file',
+        type=str,
+        default='rules/diagnosis_rules.yaml',
+        required=False,
+        help='The input path of diagnosis rule file.'
     )
     parser.add_argument(
-        '--summary_rule_file', type=str, default=None, required=False, help='The input path of summary rule file.'
+        '--summary_rule_file',
+        type=str,
+        default='rules/analysis_rules.yaml',
+        required=False,
+        help='The input path of summary rule file.'
     )
     args = parser.parse_args()
 
diff --git a/superbench/analyzer/rules/analysis_rules.yaml b/superbench/analyzer/rules/analysis_rules.yaml
index f71d76b80..8632de21a 100644
--- a/superbench/analyzer/rules/analysis_rules.yaml
+++ b/superbench/analyzer/rules/analysis_rules.yaml
@@ -27,12 +27,12 @@ superbench:
       aggregate: True
       metrics:
         - cublaslt-gemm/fp.*_.*_flops
-    micro-cublasltbatchflops:
-      statistics: mean
-      categories: MICRO
-      aggregate: True
-      metrics:
-        - cublaslt-gemm:bmm/fp.*_.*_flops
+    # micro-cublasltbatchflops:
+    #   statistics: mean
+    #   categories: MICRO
+    #   aggregate: True
+    #   metrics:
+    #     - cublaslt-gemm:bmm/fp.*_.*_flops
     micro-aggregation-with-aggregate:
       statistics: mean
       categories: MICRO
@@ -50,7 +50,7 @@ superbench:
         - nccl-bw:*.*/allreduce_.*_busbw
         - sharding-matmul:*.*/.*_time
         - matmul:*.*/.*_time
-        - gpu-burn:*.*/.*
+        #- gpu-burn:*.*/.*_pass
         - cpu-memory-bw-latency/.*_bw
         - cpu-memory-bw-latency/.*_lat
         #- cublas-function:*.*/.*_time
diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml
index a7074087a..435340fe4 100644
--- a/superbench/analyzer/rules/diagnosis_rules.yaml
+++ b/superbench/analyzer/rules/diagnosis_rules.yaml
@@ -87,12 +87,12 @@ superbench:
       categories: CPU
       metrics:
         - cpu-memory-bw-latency/.*_lat
-    gpu_burn:
-      function: value
-      criteria: 'lambda x:x!=1'
-      categories: GPUBURN
-      metrics:
-        - gpu-burn:*.*/.*_pass
+    # gpu_burn:
+    #   function: value
+    #   criteria: 'lambda x:x!=1'
+    #   categories: GPUBURN
+    #   metrics:
+    #     - gpu-burn:*.*/.*_pass
     cpu:
       function: variance
       criteria: 'lambda x:x<-0.05'

From 77ef4cc551d662c761407d1f3ef8871ee05d3eef Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Fri, 14 Apr 2023 10:08:13 +0000
Subject: [PATCH 07/12] update aggregation rule for baseline generation

---
 superbench/analyzer/generate_baseline.py      |  3 +-
 .../analyzer/rules/aggregation_rules.yaml     | 63 +++++++++----------
 superbench/analyzer/rules/analysis_rules.yaml | 12 ++--
 3 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py
index 7b2de74c2..9f82d26f8 100644
--- a/superbench/analyzer/generate_baseline.py
+++ b/superbench/analyzer/generate_baseline.py
@@ -29,6 +29,7 @@ class BaselineAlgoType(Enum):
 
 class GenerateBaseline(DataDiagnosis):
     """The class to generate baseline for raw data."""
+
     def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseline, metric, rule_op):
         """Fix threshold outlier detection algorithm.
 
@@ -243,5 +244,5 @@ def run(
         # use fix threshold method, need result_summary rules to define how to aggregate the metrics and diagnosis_rules.yaml to define the rules for the metrics.
         print('Generate baseine using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.')
         GenerateBaseline().run(
-            folder + '/results-summary.jsonl', args.diagnosis_rule_file, folder, 'fix_threshold', args.summary_rule_file
+            folder + '/results-summary.jsonl', args.summary_rule_file, folder, 'fix_threshold', args.diagnosis_rule_file
         )
diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml
index 6cf1e8df0..db8e7a87d 100644
--- a/superbench/analyzer/rules/aggregation_rules.yaml
+++ b/superbench/analyzer/rules/aggregation_rules.yaml
@@ -2,32 +2,21 @@
 version: v0.5
 superbench:
   rules:
-    model-benchmarks-FP32:
+    model-benchmarks:
       statistics:
         - mean
-      categories: model:FP32
+      categories: model
       metrics:
-        - model-benchmarks:.*/.*/fp32_train_throughput
-        - gpt_models/.*/fp32_train_throughput
-        - lstm_models/pytorch-lstm/fp32_train_throughput
-        - bert_models/pytorch-bert-.*/fp32_train_throughput
-        - resnet_models/pytorch-resnet\d*/fp32_train_throughput
-        - vgg_models/pytorch-vgg\d*/fp32_train_throughput
-        - densenet_models/.*/fp32_train_throughput
-    model-benchmarks-FP16:
-      statistics: mean
-      categories: model:FP16
-      metrics:
-        - model-benchmarks:.*/.*/fp16_train_throughput
-        - gpt_models/.*/fp16_train_throughput
-        - lstm_models/pytorch-lstm/fp16_train_throughput
-        - bert_models/pytorch-bert-.*/fp16_train_throughput
-        - resnet_models/pytorch-resnet\d*/fp16_train_throughput
-        - vgg_models/pytorch-vgg\d*/fp16_train_throughput
-        - densenet_models/.*/fp16_train_throughput
+        - model-benchmarks:.*/.*/.*_train_throughput
+        - gpt_models/.*/.*_train_throughput
+        - lstm_models/pytorch-lstm/.*_train_throughput
+        - bert_models/pytorch-bert-.*/.*_train_throughput
+        - resnet_models/pytorch-resnet\d*/.*_train_throughput
+        - vgg_models/pytorch-vgg\d*/.*_train_throughput
+        - densenet_models/.*/.*_train_throughput
     micro-aggregation:
       statistics: mean
-      categories: MICRO
+      categories: MICRO1
       aggregate: True
       metrics:
         - gemm-flops:*.*/.*ops
@@ -38,37 +27,41 @@ superbench:
         - cudnn-function:*.*/.*_time
         - ort-inference/.*_time.*
         - tensorrt-inference/.*_time.*
+        - cublaslt-gemm:*.*/.*ops
+        - dist-inference/.*step_times.*
     micro-nonaggregation:
       statistics: mean
-      categories: MICRO
+      categories: MICRO2
       metrics:
         - nccl-bw:*.*/allreduce_.*_busbw
         - rccl-bw:*.*/allreduce_.*_busbw
         - sharding-matmul:*.*/.*_time
         - matmul:*.*/.*_time
-        - gpu-burn/.*
+        - gpu-burn/.*_pass
+        - gpu_burn/.*_abort
         - cpu-memory-bw-latency/.*_bw
         - cpu-memory-bw-latency/.*_lat
     gpu-copy-bw:
       statistics: mean
-      categories: RDMA
+      categories: DTOH
       metrics:
         - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
-      aggregate: gpu-copy-bw:perf/.*gpu(.*)_to_gpu(.*)_write_by_.*_bw
-    ib-loopback1:
+        #- gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/gpu(.*)_to_gpu(.*)_by_.*_bw
+    gpu-copy-bw1:
       statistics: mean
-      categories: RDMA
+      categories: DTOH
       metrics:
-        #- ib-loopback/ib_write_8388608_ib.*_bw
-        - ib-loopback/ib_write_.*_ib.*_bw
-      aggregate: ib-loopback/ib_write_.*_ib(.*)_bw
-    ib-loopback2:
+        #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        - gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/gpu(.*)_to_cpu_by_sm_under_numa.*_bw
+    gpu-copy-bw2:
       statistics: mean
-      categories: RDMA
+      categories: HTOD
       metrics:
-        #- ib-loopback/ib_write_8388608_ib.*_bw
-        - ib-loopback/ib_write_bw_.*:\d+
-      aggregate: True
+        #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        - gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
     disk:
       statistics: mean
       categories: DISK
diff --git a/superbench/analyzer/rules/analysis_rules.yaml b/superbench/analyzer/rules/analysis_rules.yaml
index 8632de21a..23f8e4c2a 100644
--- a/superbench/analyzer/rules/analysis_rules.yaml
+++ b/superbench/analyzer/rules/analysis_rules.yaml
@@ -21,12 +21,12 @@ superbench:
       metrics:
         - cublaslt-gemm/fp8.*_0_8192_8192_8192_flops
         - gemm-flops:*.*/.*op
-    micro-cublasltflops:
-      statistics: mean
-      categories: MICRO
-      aggregate: True
-      metrics:
-        - cublaslt-gemm/fp.*_.*_flops
+    # micro-cublasltflops:
+    #   statistics: mean
+    #   categories: MICRO
+    #   aggregate: True
+    #   metrics:
+    #     - cublaslt-gemm/fp.*_.*_flops
     # micro-cublasltbatchflops:
     #   statistics: mean
     #   categories: MICRO

From 6e21416b588431b1a407024b32832d97dd6a0f15 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Fri, 14 Apr 2023 10:10:32 +0000
Subject: [PATCH 08/12] update diagnosis rule for baseline generation

---
 .../analyzer/rules/diagnosis_rules.yaml       | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml
index 435340fe4..8ece570ab 100644
--- a/superbench/analyzer/rules/diagnosis_rules.yaml
+++ b/superbench/analyzer/rules/diagnosis_rules.yaml
@@ -62,18 +62,18 @@ superbench:
       categories: CUBLASLT
       metrics:
         - cublaslt-gemm:*.*/.*ops
-    # cublas:
-    #   function: variance
-    #   criteria: 'lambda x:x>0.05'
-    #   categories: CUBLAS
-    #   metrics:
-    #     - cublas-function:*.*/.*_time
-    # cudnn:
-    #   function: variance
-    #   criteria: 'lambda x:x>0.05'
-    #   categories: CUDNN
-    #   metrics:
-    #     - cudnn-function:*.*/.*_time
+    cublas:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: CUBLAS
+      metrics:
+        - cublas-function:*.*/.*_time
+    cudnn:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: CUDNN
+      metrics:
+        - cudnn-function:*.*/.*_time
     matmul:
       function: variance
       criteria: 'lambda x:x>0.05'
@@ -87,12 +87,12 @@ superbench:
       categories: CPU
       metrics:
         - cpu-memory-bw-latency/.*_lat
-    # gpu_burn:
-    #   function: value
-    #   criteria: 'lambda x:x!=1'
-    #   categories: GPUBURN
-    #   metrics:
-    #     - gpu-burn:*.*/.*_pass
+    gpu_burn:
+      function: value
+      criteria: 'lambda x:x!=1'
+      categories: GPUBURN
+      metrics:
+        - gpu-burn:*.*/.*_pass
     cpu:
       function: variance
       criteria: 'lambda x:x<-0.05'
@@ -140,12 +140,12 @@ superbench:
       categories: DIST_INFERENCE
       metrics:
         - dist-inference/.*step_times
-    # disk_rule:
-    #   function: variance
-    #   criteria: 'lambda x:x<-0.05'
-    #   categories: DISK
-    #   metrics:
-    #     - disk-benchmark/nvme(\d+n1)_.*_iops
+    disk_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: DISK
+      metrics:
+        - disk-benchmark/nvme(\d+n1)_.*_iops
   #Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable
     model_throughput_rule:
       function: variance

From 0ddeed88a1ce7b060069f680f166cf65c905dd57 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <guzhao@microsoft.com>
Date: Fri, 16 Jun 2023 16:08:21 +0800
Subject: [PATCH 09/12] Add model inference rules and natural-sort statistic columns

---
 superbench/analyzer/generate_statistic.py     |  3 ++
 .../analyzer/rules/aggregation_rules.yaml     |  9 ++++-
 .../analyzer/rules/diagnosis_rules.yaml       | 39 ++++++++++++++++++-
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py
index 13cd146f9..80ccaa8f8 100644
--- a/superbench/analyzer/generate_statistic.py
+++ b/superbench/analyzer/generate_statistic.py
@@ -5,6 +5,7 @@
 
 import argparse
 import os
+import natsort as ns
 
 from joblib import Parallel, delayed
 import pandas as pd
@@ -72,6 +73,8 @@ def output_excel(self, excel_file, stat_df, digit=2):
 
             for benchmark in self._benchmark_metrics_dict:
                 benchmark_df = stat_df[self._benchmark_metrics_dict[benchmark]]
+                # Order columns naturally so numeric parts in metric names compare by value (2 before 10).
+                benchmark_df = benchmark_df.reindex(ns.natsorted(benchmark_df.columns), axis=1)
                 sheet_name = benchmark if len(benchmark) <= 30 else benchmark.split('-')[-1]
                 benchmark_df.to_excel(writer, sheet_name=sheet_name)
                 worksheet = writer.sheets[sheet_name]
diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml
index db8e7a87d..3b330b5bb 100644
--- a/superbench/analyzer/rules/aggregation_rules.yaml
+++ b/superbench/analyzer/rules/aggregation_rules.yaml
@@ -2,7 +2,7 @@
 version: v0.5
 superbench:
   rules:
-    model-benchmarks:
+    model-train-benchmarks:
       statistics:
         - mean
       categories: model
@@ -14,6 +14,13 @@ superbench:
         - resnet_models/pytorch-resnet\d*/.*_train_throughput
         - vgg_models/pytorch-vgg\d*/.*_train_throughput
         - densenet_models/.*/.*_train_throughput
+    model-inference-benchmarks:
+      statistics:
+        - mean
+      categories: model
+      aggregate: True
+      metrics:
+        - model-benchmarks:.*/.*/.*_inference_throughput:\d+
     micro-aggregation:
       statistics: mean
       categories: MICRO1
diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml
index 8ece570ab..3771566d3 100644
--- a/superbench/analyzer/rules/diagnosis_rules.yaml
+++ b/superbench/analyzer/rules/diagnosis_rules.yaml
@@ -191,4 +191,41 @@ superbench:
       categories: VGG
       function: multi_rules
       criteria: 'lambda label:True if label["vgg_throughput"]>=2 else False'
-
+    model_inference_throughput_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Model
+      metrics:
+        - model-benchmarks:.*/.*bert.*/.*_inference_throughput
+        - model-benchmarks:.*/.*gpt.*/.*_inference_throughput
+        - model-benchmarks:.*/.*lstm.*/.*_inference_throughput
+  # Rule 8 (inference): If 2+ CNN models suffer > 5% inference throughput downgrade, label it as Not acceptable
+    resnet_inference_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - model-benchmarks:.*/.*resnet.*/.*_inference_throughput
+    vgg_inference_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - model-benchmarks:.*/.*vgg.*/.*_inference_throughput
+    densenet_inference_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - model-benchmarks:.*/.*densenet.*/.*_inference_throughput
+    cnn_inference_throughput_rule:
+      categories: CNN
+      function: multi_rules
+      criteria: 'lambda label:True if label["resnet_inference_throughput"]+label["densenet_inference_throughput"]>=2 else False'
+    vgg_inference_throughput_rule:
+      categories: VGG
+      function: multi_rules
+      criteria: 'lambda label:True if label["vgg_inference_throughput"]>=2 else False'

From b5d5cc75b7cb9b6d45dc040bd7b0e24fd94e86c3 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <guzhao@microsoft.com>
Date: Mon, 13 Nov 2023 10:00:56 +0800
Subject: [PATCH 10/12] Fix typos in log messages and add tensorrt_inference diagnosis rule

---
 superbench/analyzer/file_handler.py            | 2 +-
 superbench/analyzer/generate_statistic.py      | 2 +-
 superbench/analyzer/rules/diagnosis_rules.yaml | 6 ++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/superbench/analyzer/file_handler.py b/superbench/analyzer/file_handler.py
index 2337435ea..f9f4065f9 100644
--- a/superbench/analyzer/file_handler.py
+++ b/superbench/analyzer/file_handler.py
@@ -39,7 +39,7 @@ def read_raw_data(raw_data_path):
         raw_data_df = raw_data_df.rename(raw_data_df['node'])
         raw_data_df = raw_data_df.drop(columns=['node'])
     except Exception as e:
-        logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data fomat - {}'.format(str(e)))
+        logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data format - {}'.format(str(e)))
     return raw_data_df
 
 
diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py
index 80ccaa8f8..d64813366 100644
--- a/superbench/analyzer/generate_statistic.py
+++ b/superbench/analyzer/generate_statistic.py
@@ -147,7 +147,7 @@ def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_
                     )
                     for index, out in enumerate(outputs):
                         if not out:
-                            logger.error('Analyzer: filter healthy nodese failed')
+                            logger.error('Analyzer: filter healthy nodes failed')
                             return
                         aggregated_df[metrics[index]] = out[1]
                         if plot:
diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml
index 3771566d3..cbe7994bd 100644
--- a/superbench/analyzer/rules/diagnosis_rules.yaml
+++ b/superbench/analyzer/rules/diagnosis_rules.yaml
@@ -49,6 +49,12 @@ superbench:
       categories: INVESTIGATING
       metrics:
         - kernel-launch:*.*/.*_time
+    tensorrt_inference:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: TensorRT
+      metrics:
+        - tensorrt-inference/.*_time.*
   # Rule 1: If TensorCore test suffers > 5% downgrade, label it as Not acceptable
     tensor_core_rule:
       function: variance

From 1797a29c41c0c34c2e8e0062c015da46de155881 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <guzhao@microsoft.com>
Date: Mon, 13 Nov 2023 10:04:06 +0800
Subject: [PATCH 11/12] upgrade packages for analyzer.

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b42639eea..636d6119b 100644
--- a/setup.py
+++ b/setup.py
@@ -160,11 +160,12 @@ def run(self):
         'matplotlib>=3.0.0',
         'natsort>=7.1.1',
         'networkx>=2.5',
-        'numpy>=1.19.2',
+        'numpy>=1.20.3',
         'omegaconf==2.0.6',
         'openpyxl>=3.0.7',
         'pandas>=1.1.5',
         'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4',
+        'python-dateutil>=2.8.2'
         'pyyaml>=5.3',
         'requests>=2.27.1',
         'seaborn>=0.11.2',

From 6ded58860c3ffa2f58c9804f7b4e247d6d78cbd6 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <guzhao@microsoft.com>
Date: Mon, 20 Nov 2023 02:56:17 +0800
Subject: [PATCH 12/12] Add missing comma after python-dateutil requirement in setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 636d6119b..4785728fc 100644
--- a/setup.py
+++ b/setup.py
@@ -165,7 +165,7 @@ def run(self):
         'openpyxl>=3.0.7',
         'pandas>=1.1.5',
         'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4',
-        'python-dateutil>=2.8.2'
+        'python-dateutil>=2.8.2',
         'pyyaml>=5.3',
         'requests>=2.27.1',
         'seaborn>=0.11.2',