diff --git a/setup.py b/setup.py
index b42639eea..4785728fc 100644
--- a/setup.py
+++ b/setup.py
@@ -160,11 +160,12 @@ def run(self):
         'matplotlib>=3.0.0',
         'natsort>=7.1.1',
         'networkx>=2.5',
-        'numpy>=1.19.2',
+        'numpy>=1.20.3',
         'omegaconf==2.0.6',
         'openpyxl>=3.0.7',
         'pandas>=1.1.5',
         'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4',
+        'python-dateutil>=2.8.2',
         'pyyaml>=5.3',
         'requests>=2.27.1',
         'seaborn>=0.11.2',
diff --git a/superbench/analyzer/file_handler.py b/superbench/analyzer/file_handler.py
index 2337435ea..f9f4065f9 100644
--- a/superbench/analyzer/file_handler.py
+++ b/superbench/analyzer/file_handler.py
@@ -39,7 +39,7 @@ def read_raw_data(raw_data_path):
         raw_data_df = raw_data_df.rename(raw_data_df['node'])
         raw_data_df = raw_data_df.drop(columns=['node'])
     except Exception as e:
-        logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data fomat - {}'.format(str(e)))
+        logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data format - {}'.format(str(e)))
     return raw_data_df
diff --git a/superbench/analyzer/generate_baseline.py b/superbench/analyzer/generate_baseline.py
new file mode 100644
index 000000000..9f82d26f8
--- /dev/null
+++ b/superbench/analyzer/generate_baseline.py
@@ -0,0 +1,248 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""A module for baseline generation."""
+
+import argparse
+from copy import deepcopy
+import json
+import re
+
+from joblib import Parallel, delayed
+import pandas as pd
+
+from superbench.common.utils import logger
+from superbench.analyzer import file_handler
+from superbench.analyzer import data_analysis
+from superbench.analyzer import DataDiagnosis
+from superbench.analyzer import ResultSummary
+from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
+from superbench.benchmarks.context import Enum
+
+
+class BaselineAlgoType(Enum):
+    """The Enum class representing different baseline generation algorithms."""
+
+    MEAN = 'mean'
+    FIX_THRESHOLD = 'fix_threshold'
+
+
+class GenerateBaseline(DataDiagnosis):
+    """The class to generate baseline for raw data."""
+    def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseline, metric, rule_op):
+        """Fix threshold outlier detection algorithm.
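+
+        The algorithm iteratively recomputes the mean of the remaining data as the candidate
+        baseline and drops values that violate the diagnosis rule against it, until no more
+        values are dropped.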
+
+        Step 0: Put all data in the collection
+        Step 1: Regenerate the collection
+            Calculate the average number in the collection as the baseline
+            Remove all data which cannot pass the fix threshold based on the new baseline
+        Step 2: If no data has been removed from Step 1, go to Step 3; otherwise, go to Step 1
+        Step 3: Use the baseline and fix threshold for Outlier Detection
+
+        Args:
+            data_series (pd.Series): data of the metric
+            single_metric_with_baseline (dict): baseline of the single metric in 'metrics' in 2-layer dict format
+            metric (str): the name of the metric to execute the algorithm
+            rule_op (function): diagnosis rule op function
+
+        Returns:
+            tuple: the baseline of the metric, normal data of the metric
+        """
+        # if the metric already has a valid baseline, keep it and leave the data untouched
+        if single_metric_with_baseline['metrics'][metric] is not None \
+                and single_metric_with_baseline['metrics'][metric] != -1:
+            return single_metric_with_baseline['metrics'][metric], data_series
+        tmp_single_metric_with_baseline = deepcopy(single_metric_with_baseline)
+        tmp_single_metric_with_baseline['metrics'] = {}
+        clean = False
+        while clean is False:
+            clean = True
+            # use the mean of the remaining data as the candidate baseline
+            baseline_val = data_series.mean()
+            tmp_single_metric_with_baseline['metrics'][metric] = baseline_val
+            if baseline_val == 0:
+                break
+            for val in data_series.index:
+                data_row = pd.Series([data_series[val]], index=[metric])
+                details = []
+                categories = set()
+                summary_data_row = pd.Series(index=[metric], dtype=float)
+                violated_num = rule_op(data_row, tmp_single_metric_with_baseline, summary_data_row, details, categories)
+                # drop the value that violates the rule and recompute the mean in the next round
+                if violated_num:
+                    data_series = data_series.drop(val)
+                    clean = False
+        baseline = tmp_single_metric_with_baseline['metrics'][metric]
+        return baseline, data_series
+
+    def get_aggregate_data(self, raw_data_file, summary_rule_file):
+        r"""Aggregate raw data according to the summary rule file.
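+
+        Two aggregation modes are supported, selected by the 'aggregate' field of each rule: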
+        If the metric is aggregated by rank (:\d+), remove the rank info to generate the metric name and aggregate data.
+        If the metric is aggregated by a pattern in regex, aggregate the data and copy to all metrics which match this pattern.
+
+        Args:
+            raw_data_file (str): the file name of the raw data file
+            summary_rule_file (str): the file name of the summary rule file
+
+        Returns:
+            DataFrame: aggregated data
+        """
+        self.rs = ResultSummary()
+        rules = self.rs._preprocess(raw_data_file, summary_rule_file)
+        # parse rules for result summary
+        if not self.rs._parse_rules(rules):
+            return
+        aggregated_df = pd.DataFrame()
+        for rule in self.rs._sb_rules:
+            single_metric_rule = self.rs._sb_rules[rule]
+            metrics = list(single_metric_rule['metrics'].keys())
+            data_df_of_rule = self.rs._raw_data_df[metrics]
+            if self.rs._sb_rules[rule]['aggregate']:
+                # if aggregate is True, aggregate in ranks
+                if self.rs._sb_rules[rule]['aggregate'] is True:
+                    data_df_of_rule = data_analysis.aggregate(data_df_of_rule)
+                # if aggregate is not empty and is a pattern in regex, aggregate according to pattern
+                else:
+                    pattern = self.rs._sb_rules[rule]['aggregate']
+                    data_df_of_rule_with_short_name = data_analysis.aggregate(data_df_of_rule, pattern)
+                    data_df_of_rule = pd.DataFrame(columns=metrics)
+                    # restore the columns of data_df to full metric names
+                    for metric in metrics:
+                        short = ''
+                        match = re.search(pattern, metric)
+                        if match:
+                            metric_in_list = list(metric)
+                            for i in range(1, len(match.groups()) + 1):
+                                metric_in_list[match.start(i):match.end(i)] = '*'
+                            short = ''.join(metric_in_list)
+                        data_df_of_rule[metric] = data_df_of_rule_with_short_name[short]
+            aggregated_df = pd.concat([aggregated_df, data_df_of_rule], axis=1)
+        return aggregated_df
+
+    def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline):
+        """Generate the baseline in json format.
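+
+        For the 'mean' algorithm the baseline of a metric is simply the mean of its values;
+        for 'fix_threshold' the outliers are first removed by fix_threshold_outlier_detection
+        and the mean of the remaining values is used.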
+
+        Args:
+            algo (str): the algorithm to generate the baseline
+            aggregated_df (DataFrame): aggregated data
+            diagnosis_rule_file (str): the file name of the diagnosis rules which are used in the fix_threshold algorithm
+            baseline (dict): existing baseline of some metrics
+
+        Returns:
+            dict: baseline of the metrics defined in the diagnosis rule file for the fix_threshold algorithm,
+                or of the metrics defined in the summary rule file for mean
+        """
+        # re-organize metrics by benchmark names
+        self._benchmark_metrics_dict = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
+        if algo == 'mean':
+            mean_df = self._raw_data_df.mean()
+            for metric in self._raw_data_df.columns:
+                # skip metrics which already have an existing baseline
+                if metric in baseline:
+                    continue
+                baseline[metric] = mean_df[metric]
+        elif algo == 'fix_threshold':
+            # read diagnosis rules
+            rules = file_handler.read_rules(diagnosis_rule_file)
+            if not self._parse_rules_and_baseline(rules, baseline):
+                return baseline
+            else:
+                for rule in self._sb_rules:
+                    single_metric_rule = self._sb_rules[rule]
+                    metrics = list(single_metric_rule['metrics'].keys())
+                    function_name = self._sb_rules[rule]['function']
+                    rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name))
+                    outputs = Parallel(n_jobs=-1)(
+                        delayed(self.fix_threshold_outlier_detection)
+                        (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics
+                    )
+                    for index, out in enumerate(outputs):
+                        baseline[metrics[index]] = out[0]
+                        aggregated_df[metrics[index]] = out[1]
+        return baseline
+
+    def run(
+        self,
+        raw_data_file,
+        summary_rule_file,
+        output_dir,
+        algorithm='mean',
+        diagnosis_rule_file=None,
+        baseline_file=None,
+        digit=2
+    ):
+        """Export baseline to json file.
+
+        If diagnosis_rule_file is None, use mean of the data as baseline.
+        If diagnosis_rule_file is not None, use the rules in diagnosis_rule_file to execute fix_threshold algorithm.
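+        Existing values from baseline_file, if given, take precedence over newly generated ones.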
+
+        Args:
+            raw_data_file (str): the file name of the raw data file
+            summary_rule_file (str): the file name of the summary rule file
+            output_dir (str): the directory to save the baseline file
+            algorithm (str): the algorithm to generate the baseline
+            diagnosis_rule_file (str): the file name of the diagnosis rules which are used in the fix_threshold algorithm
+            baseline_file (str): the file name of the existing baseline file
+            digit (int): the number of digits after the decimal point
+        """
+        try:
+            # aggregate results from different devices
+            self._raw_data_df = self.get_aggregate_data(raw_data_file, summary_rule_file)
+            # read existing baseline
+            baseline = {}
+            if baseline_file:
+                baseline = file_handler.read_baseline(baseline_file)
+            # generate baseline according to the rules in diagnosis and fix threshold outlier detection method
+            baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline)
+            for metric in baseline:
+                val = baseline[metric]
+                if isinstance(self._raw_data_df[metric].iloc[0], float):
+                    baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
+                elif isinstance(self._raw_data_df[metric].iloc[0], int):
+                    baseline[metric] = int(val)
+                else:
+                    try:
+                        baseline[metric] = float(val)
+                    except Exception as e:
+                        logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e)))
+            baseline = json.dumps(baseline, indent=2, sort_keys=True)
+            # unquote the numeric values so that they are written as numbers, not strings
+            baseline = re.sub(r': "(\d+\.?\d*)"', r': \1', baseline)
+            with open(output_dir + '/baseline.json', mode='w') as f:
+                f.write(baseline)
+
+        except Exception as e:
+            logger.error('Analyzer: generate baseline failed, msg: {}'.format(str(e)))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--algo',
+        type=str,
+        default='fix_threshold',
+        required=False,
+        help='Algorithm to generate baseline, e.g., mean/fix_threshold.'
+    )
+    parser.add_argument(
+        '--input_dir',
+        type=str,
+        default=None,
+        required=False,
+        help='Input directory which stores the results-summary.jsonl.'
+    )
+    parser.add_argument(
+        '--diagnosis_rule_file', type=str, default=None, required=False, help='The input path of diagnosis rule file.'
+    )
+    parser.add_argument(
+        '--summary_rule_file', type=str, default=None, required=False, help='The input path of summary rule file.'
+    )
+    args = parser.parse_args()
+    folder = args.input_dir
+    if args.algo == 'mean':
+        # simply use mean, need result_summary rules to define how to aggregate the metrics.
+        print('Generate baseline using mean of the data.')
+        GenerateBaseline().run(folder + '/results-summary.jsonl', args.summary_rule_file, folder, 'mean')
+    elif args.algo == 'fix_threshold':
+        # use fix threshold method, need result_summary rules to define how to aggregate the metrics
+        # and diagnosis_rules.yaml to define the rules for the metrics.
+        print('Generate baseline using fix threshold algorithm, the threshold is defined in rules/diagnosis_rules.yaml.')
+        GenerateBaseline().run(
+            folder + '/results-summary.jsonl', args.summary_rule_file, folder, 'fix_threshold', args.diagnosis_rule_file
+        )
diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py
new file mode 100644
index 000000000..d64813366
--- /dev/null
+++ b/superbench/analyzer/generate_statistic.py
@@ -0,0 +1,200 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""A module for statistics generation."""
+
+import argparse
+import os
+
+import natsort as ns
+from joblib import Parallel, delayed
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from superbench.common.utils import logger
+from superbench.analyzer import file_handler
+from superbench.analyzer import data_analysis
+from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
+from generate_baseline import GenerateBaseline
+
+
+def plot_steps(data, title=None, save_path=None, show=True):
+    """Plot steps.
+
+    Args:
+        data (list): data to plot
+        title (str): title of the plot
+        save_path (str): path to save the plot
+        show (bool): whether to show the plot
+    """
+    plt.figure(figsize=(10, 6))
+    plt.scatter(range(0, len(data)), data)
+    if title:
+        plt.title(title)
+    plt.xlabel('Devices')
+    plt.ylabel('Value')
+    plt.ylim(0, max(data) * 1.1)
+    if save_path is not None:
+        # create the target directory if it does not exist yet
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+        plt.savefig(save_path)
+    if show:
+        plt.show()
+    plt.close()
+
+
+class GenerateStatistics(GenerateBaseline):
+    """GenerateStatistics class to generate statistics for raw data."""
+    def calculate_statistics(self, healthy_df):
+        """Calculate statistics for healthy data.
+
+        Args:
+            healthy_df (DataFrame): healthy data
+
+        Returns:
+            DataFrame: statistics for healthy data
+        """
+        stat_df = data_analysis.statistic(healthy_df)
+        stat_df.loc['(max-min)/max'] = (stat_df.loc['max'] - stat_df.loc['min']) / stat_df.loc['max']
+        stat_df = stat_df.drop(index=['1%', '5%', '95%', '99%'])
+        return stat_df
+
+    def output_excel(self, excel_file, stat_df, digit=2):
+        """Output excel file.
+
+        Args:
+            excel_file (str): excel file path
+            stat_df (DataFrame): statistics data
+            digit (int): the number of digits after the decimal point
+        """
+        try:
+            writer = pd.ExcelWriter(excel_file, engine='xlsxwriter')
+
+            for benchmark in self._benchmark_metrics_dict:
+                benchmark_df = stat_df[self._benchmark_metrics_dict[benchmark]]
+                # sort the metric columns in natural order
+                benchmark_df = benchmark_df.reindex(ns.natsorted(benchmark_df.columns), axis=1)
+                sheet_name = benchmark if len(benchmark) <= 30 else benchmark.split('-')[-1]
+                benchmark_df.to_excel(writer, sheet_name=sheet_name)
+                worksheet = writer.sheets[sheet_name]
+                # in the sheet the statistics are the rows and the metrics are the columns
+                col_start = 1
+                col_end = max(col_start, len(self._benchmark_metrics_dict[benchmark]))
+                statistics = list(benchmark_df.index)
+                row_index = statistics.index('(max-min)/max') + 1
+                workbook = writer.book
+                percent_format = workbook.add_format({'num_format': '0.00%'})
+                worksheet.conditional_format(
+                    row_index,
+                    col_start,
+                    row_index,
+                    col_end,  # first_row, first_col, last_row, last_col
+                    {
+                        'type': 'no_blanks',
+                        'format': percent_format
+                    }
+                )
+                num_format = f'0.{digit * "0"}'
+                for row_index in range(2, len(statistics)):
+                    round_format = workbook.add_format({'num_format': num_format})
+                    worksheet.conditional_format(
+                        row_index,
+                        col_start,
+                        row_index,
+                        col_end,  # first_row, first_col, last_row, last_col
+                        {
+                            'type': 'no_blanks',
+                            'format': round_format
+                        }
+                    )
+            writer.close()
+        except Exception as e:
+            logger.error('output excel failed: {}'.format(str(e)))
+
+    def run(self, raw_data_file, output_dir, diagnosis_rule_file=None, summary_rule_file=None, digit=2, plot=False):
+        """Run the statistics generation.
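+
+        The fix threshold outlier detection from GenerateBaseline is reused to drop outlier
+        values first, so the statistics and the Excel output reflect healthy data only.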
+
+        Args:
+            raw_data_file (str): raw data file path
+            output_dir (str): output directory
+            diagnosis_rule_file (str): diagnosis rule file path
+            summary_rule_file (str): summary rule file path
+            digit (int): the number of digits after the decimal point
+            plot (bool): whether to plot the data
+        """
+        try:
+            # aggregate results from different devices
+            self._raw_data_df = self.get_aggregate_data(raw_data_file, summary_rule_file)
+            # re-organize metrics by benchmark names
+            self._benchmark_metrics_dict = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
+            # no existing baseline is used when generating statistics
+            baseline = {}
+            # read diagnosis rules
+            aggregated_df = self._raw_data_df.copy()
+            rules = file_handler.read_rules(diagnosis_rule_file)
+            if not self._parse_rules_and_baseline(rules, baseline):
+                logger.error('parse rule failed')
+                return None
+            else:
+                for rule in self._sb_rules:
+                    single_metric_rule = self._sb_rules[rule]
+                    metrics = list(single_metric_rule['metrics'].keys())
+                    function_name = self._sb_rules[rule]['function']
+                    rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name))
+                    outputs = Parallel(n_jobs=-1)(
+                        delayed(self.fix_threshold_outlier_detection)
+                        (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics
+                    )
+                    for index, out in enumerate(outputs):
+                        if not out:
+                            logger.error('Analyzer: filter healthy nodes failed')
+                            return
+                        aggregated_df[metrics[index]] = out[1]
+                        if plot:
+                            plot_steps(
+                                out[1].tolist(),
+                                title=metrics[index],
+                                save_path=os.path.join(
+                                    output_dir, 'figures', metrics[index].replace('/', '_').replace(':', '_') + '.png'
+                                ),
+                                show=False
+                            )
+            stat_df = self.calculate_statistics(aggregated_df)
+            excel_file = os.path.join(output_dir, 'benchmark_stability_stat.xlsx')
+            self.output_excel(excel_file, stat_df, digit)
+
+        except Exception as e:
+            logger.error('Analyzer: generate statistics failed, msg: {}'.format(str(e)))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '--input_dir',
+        type=str,
+        default='rawdata/',
+        required=False,
+        help='Input directory which stores the results-summary.jsonl.'
+    )
+    parser.add_argument(
+        '--diagnosis_rule_file',
+        type=str,
+        default='rules/diagnosis_rules.yaml',
+        required=False,
+        help='The input path of diagnosis rule file.'
+    )
+    parser.add_argument(
+        '--summary_rule_file',
+        type=str,
+        default='rules/analysis_rules.yaml',
+        required=False,
+        help='The input path of summary rule file.'
+    )
+    args = parser.parse_args()
+
+    # use fix threshold method, need result_summary rules to define how to aggregate the metrics
+    # and diagnosis_rules.yaml to define the rules for the metrics.
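+    # example invocation (paths are illustrative):
+    #   python generate_statistic.py --input_dir outputs/ --diagnosis_rule_file rules/diagnosis_rules.yaml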
+    GenerateStatistics().run(
+        args.input_dir + '/results-summary.jsonl', args.input_dir, args.diagnosis_rule_file, args.summary_rule_file
+    )
diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml
new file mode 100644
index 000000000..3b330b5bb
--- /dev/null
+++ b/superbench/analyzer/rules/aggregation_rules.yaml
@@ -0,0 +1,77 @@
+# SuperBench rules
+version: v0.5
+superbench:
+  rules:
+    model-train-benchmarks:
+      statistics:
+        - mean
+      categories: model
+      metrics:
+        - model-benchmarks:.*/.*/.*_train_throughput
+        - gpt_models/.*/.*_train_throughput
+        - lstm_models/pytorch-lstm/.*_train_throughput
+        - bert_models/pytorch-bert-.*/.*_train_throughput
+        - resnet_models/pytorch-resnet\d*/.*_train_throughput
+        - vgg_models/pytorch-vgg\d*/.*_train_throughput
+        - densenet_models/.*/.*_train_throughput
+    model-inference-benchmarks:
+      statistics:
+        - mean
+      categories: model
+      aggregate: True
+      metrics:
+        - model-benchmarks:.*/.*/.*_inference_throughput:\d+
+    micro-aggregation:
+      statistics: mean
+      categories: MICRO1
+      aggregate: True
+      metrics:
+        - gemm-flops:*.*/.*ops
+        - mem-bw:*.*/.*_bw
+        - kernel-launch:*.*/.*_time
+        - computation-communication-overlap:*.*/.*_time
+        - cublas-function:*.*/.*_time
+        - cudnn-function:*.*/.*_time
+        - ort-inference/.*_time.*
+        - tensorrt-inference/.*_time.*
+        - cublaslt-gemm:*.*/.*ops
+        - dist-inference/.*step_times.*
+    micro-nonaggregation:
+      statistics: mean
+      categories: MICRO2
+      metrics:
+        - nccl-bw:*.*/allreduce_.*_busbw
+        - rccl-bw:*.*/allreduce_.*_busbw
+        - sharding-matmul:*.*/.*_time
+        - matmul:*.*/.*_time
+        - gpu-burn/.*_pass
+        - gpu-burn/.*_abort
+        - cpu-memory-bw-latency/.*_bw
+        - cpu-memory-bw-latency/.*_lat
+    gpu-copy-bw:
+      statistics: mean
+      categories: DTOH
+      metrics:
+        - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        #- gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/gpu(.*)_to_gpu(.*)_by_.*_bw
+    gpu-copy-bw1:
+      statistics: mean
+      categories: DTOH
+      metrics:
+        #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        - gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/gpu(.*)_to_cpu_by_sm_under_numa.*_bw
+    gpu-copy-bw2:
+      statistics: mean
+      categories: HTOD
+      metrics:
+        #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        - gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+    disk:
+      statistics: mean
+      categories: DISK
+      aggregate: disk-benchmark/nvme(\d+n1)_.*_iops
+      metrics:
+        - disk-benchmark/nvme(\d+n1)_.*_iops
diff --git a/superbench/analyzer/rules/analysis_rules.yaml b/superbench/analyzer/rules/analysis_rules.yaml
new file mode 100644
index 000000000..23f8e4c2a
--- /dev/null
+++ b/superbench/analyzer/rules/analysis_rules.yaml
@@ -0,0 +1,100 @@
+# SuperBench rules
+version: v0.5
+superbench:
+  rules:
+    model-benchmarks:
+      statistics:
+        - mean
+      categories: models
+      metrics:
+        - model-benchmarks:.*/.*/.*_train_throughput
+        - gpt_models/.*/.*_train_throughput
+        - lstm_models/pytorch-lstm/.*_train_throughput
+        - bert_models/pytorch-bert-.*/.*_train_throughput
+        - resnet_models/pytorch-resnet\d*/.*_train_throughput
+        - vgg_models/pytorch-vgg\d*/.*_train_throughput
+        - densenet_models/.*/.*_train_throughput
+    micro-flops:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+        - cublaslt-gemm/fp8.*_0_8192_8192_8192_flops
+        - gemm-flops:*.*/.*op
+    # micro-cublasltflops:
+    #   statistics: mean
+    #   categories: MICRO
+    #   aggregate: True
+    #   metrics:
+    #     - cublaslt-gemm/fp.*_.*_flops
+    # micro-cublasltbatchflops:
+    #   statistics: mean
+    #   categories: MICRO
+    #   aggregate: True
+    #   metrics:
+    #     - cublaslt-gemm:bmm/fp.*_.*_flops
+    micro-aggregation-with-aggregate:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+        - kernel-launch:*.*/.*_time
+        - dist-inference/.*step_times.*
+        - mem-bw:*.*/.*_bw
+        - computation-communication-overlap:*.*/.*_time
+    micro-aggregation-wo-aggregate:
+      statistics: mean
+      categories: MICRO
+      aggregate: False
+      metrics:
+        - nccl-bw:*.*/allreduce_.*_busbw
+        - sharding-matmul:*.*/.*_time
+        - matmul:*.*/.*_time
+        #- gpu-burn:*.*/.*_pass
+        - cpu-memory-bw-latency/.*_bw
+        - cpu-memory-bw-latency/.*_lat
+        #- cublas-function:*.*/.*_time
+        #- cudnn-function:*.*/.*_time
+        #- ort-inference/.*_time.*
+        #- tensorrt-inference/.*_time.*
+    gpu-copy-bw:
+      statistics: mean
+      categories: DTOH
+      metrics:
+        - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+        #- gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+      aggregate: gpu-copy-bw:perf/gpu(.*)_to_gpu(.*)_by_.*_bw
+    # gpu-copy-bw1:
+    #   statistics: mean
+    #   categories: DTOH
+    #   metrics:
+    #     #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+    #     - gpu-copy-bw:perf/gpu.*_to_cpu_by_sm_under_numa.*_bw
+    #   aggregate: gpu-copy-bw:perf/gpu(.*)_to_cpu_by_sm_under_numa.*_bw
+    # gpu-copy-bw2:
+    #   statistics: mean
+    #   categories: HTOD
+    #   metrics:
+    #     #- gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+    #     - gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+    #   aggregate: gpu-copy-bw:perf/cpu_to_gpu(.*)_by_sm_under_numa.*_bw
+    # # ib-loopback1:
+    # #   statistics: mean
+    # #   categories: RDMA
+    # #   metrics:
+    # #     #- ib-loopback/ib_write_8388608_ib.*_bw
+    # #     - ib-loopback/ib_write_.*_ib.*_bw
+    # #   aggregate: ib-loopback/ib_write_.*_ib(.*)_bw
+    # ib-loopback2:
+    #   statistics: mean
+    #   categories: RDMA
+    #   metrics:
+    #     #- ib-loopback/ib_write_8388608_ib.*_bw
+    #     - ib-loopback/ib_write_bw_.*:\d+
+    #   #aggregate: True
+    # disk:
+    #   statistics: mean
+    #   categories: DISK
+    #   aggregate: disk-benchmark/nvme(\d+n1)_.*_iops
+    #   metrics:
+    #     - disk-benchmark/nvme(\d+n1)_.*_iops
diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml
new file mode 100644
index 000000000..cbe7994bd
--- /dev/null
+++ b/superbench/analyzer/rules/diagnosis_rules.yaml
@@ -0,0 +1,237 @@
+# SuperBench rules
+version: v0.5
+superbench:
+  rules:
+    # Rule 0: If one test fails, label it as Not acceptable
+    failure_rule:
+      function: value
+      criteria: 'lambda x:x>0'
+      categories: FailedTest
+      metrics:
+        - kernel-launch:*.*/return_code
+        - mem-bw:*.*/return_code
+        - gemm-flops:*.*/return_code
+        - ib-loopback:*.*/return_code
+        - nccl-bw:*.*/return_code
+        - gpt_models/.*/return_code
+        - lstm_models/.*/return_code
+        - bert_models/.*/return_code
+        - resnet_models/.*/return_code
+        - vgg_models/.*/return_code
+        - densenet_models/.*/return_code
+        - model-benchmarks:.*/return_code:.*
+        #- cudnn-function:*.*/return_code  # 06:14
+        - cublaslt-gemm:*.*/return_code
+        - cublas-function:*.*/return_code  # 05:52
+        - matmul:*.*/return_code  # 00:25
+        - gpu-burn:*.*/return_code  # 15:08
+        # microbenchmark - communication
+        - cpu-memory-bw-latency:*.*/return_code  # 05:38
+        - gpu-copy-bw:*.*/return_code  # 08:44
+        - computation-communication-overlap:*.*/return_code  # 06:30
+        - sharding-matmul:*.*/return_code  # 00:24
+        # microbenchmark - storage
+        #- disk-benchmark  # 18:47
+        # model benchmark - inference
+        #- ort-inference:*.*/return_code  # 03:43
+        ##tensorrt-inference  # 02:03:33
+        - dist-inference:*.*/return_code
+    kernel_launch_rule_outlier:
+      function: value
+      criteria: 'lambda x:x<0.001'
+      categories: INVESTIGATING
+      metrics:
+        - kernel-launch:*.*/.*_time
+    kernel_launch_rule:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: INVESTIGATING
+      metrics:
+        - kernel-launch:*.*/.*_time
+    tensorrt_inference:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: TensorRT
+      metrics:
+        - tensorrt-inference/.*_time.*
+    # Rule 1: If TensorCore test suffers > 5% downgrade, label it as Not acceptable
+    tensor_core_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: TensorCore
+      metrics:
+        - gemm-flops:*.*/.*ops
+    cublaslt_gemm_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: CUBLASLT
+      metrics:
+        - cublaslt-gemm:*.*/.*ops
+    cublas:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: CUBLAS
+      metrics:
+        - cublas-function:*.*/.*_time
+    cudnn:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: CUDNN
+      metrics:
+        - cudnn-function:*.*/.*_time
+    matmul:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: MATMUL
+      metrics:
+        - matmul:*.*/.*_time
+        - sharding-matmul:*.*/.*_time
+    cpu1:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: CPU
+      metrics:
+        - cpu-memory-bw-latency/.*_lat
+    gpu_burn:
+      function: value
+      criteria: 'lambda x:x!=1'
+      categories: GPUBURN
+      metrics:
+        - gpu-burn:*.*/.*_pass
+    cpu:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: CPU
+      metrics:
+        - cpu-memory-bw-latency/.*_bw
+    # Rule 2: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as Not acceptable
+    mem_bw_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Mem
+      metrics:
+        - mem-bw:*.*/h2d_bw
+        - mem-bw:*.*/d2h_bw
+    gpu-copy:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: GPUCOPY
+      metrics:
+        - gpu-copy-bw:perf/.*gpu._to_gpu.*_write_by_.*_bw
+    # Rule 3: If ib-loopback test with 8M suffers > 5% downgrade, label it as Not acceptable
+    # rdma_loopback_rule:
+    #   function: variance
+    #   criteria: 'lambda x:x<-0.05'
+    #   categories: RDMA
+    #   metrics:
+    #     - ib-loopback:*.*/ib_write_bw_8388608
+    #     #- ib-loopback:*.*/ib_write_bw_.*
+    # Rule 4: If nccl-bw:default with 8GB suffers > 5% downgrade, label it as Not acceptable
+    nccl_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: NCCL
+      metrics:
+        - nccl-bw:*.*/allreduce_.*_busbw
+    overlap_rules:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: OVERLAP
+      metrics:
+        - computation-communication-overlap:*.*/.*_time
+    investigating_rules:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: DIST_INFERENCE
+      metrics:
+        - dist-inference/.*step_times
+    disk_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: DISK
+      metrics:
+        - disk-benchmark/nvme(\d+n1)_.*_iops
+    # Rule 5,6,7: If BERT or GPT-2 or LSTM suffers > 5% downgrade, label it as Not acceptable
+    model_throughput_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Model
+      metrics:
+        - gpt_models/.*/.*_train_throughput
+        - lstm_models/.*lstm/.*_train_throughput
+        - bert_models/.*bert-.*/.*_train_throughput
+        - model-benchmarks:.*/.*bert.*/.*_train_throughput
+        - model-benchmarks:.*/.*gpt.*/.*_train_throughput
+        - model-benchmarks:.*/.*lstm.*/.*_train_throughput
+    # Rule 8: If 2+ CNN models suffer > 5% downgrade, label it as Not acceptable
+    resnet_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
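+      # store the result of this rule instead of labeling it directly, so the
+      # multi_rules criteria below can combine it with the other CNN rules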
+      store: true
+      categories: CNN
+      metrics:
+        - resnet_models/.*resnet\d*/.*_train_throughput
+        - model-benchmarks:.*/.*resnet.*/.*_train_throughput
+    vgg_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - vgg_models/.*vgg\d*/.*_train_throughput
+        - model-benchmarks:.*/.*vgg.*/.*_train_throughput
+    densenet_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - densenet_models/.*densenet\d*/.*_train_throughput
+        - model-benchmarks:.*/.*densenet.*/.*_train_throughput
+    cnn_throughput_rule:
+      categories: CNN
+      function: multi_rules
+      criteria: 'lambda label:True if label["resnet_throughput"]+label["densenet_throughput"]>=2 else False'
+    vgg_throughput_rule:
+      categories: VGG
+      function: multi_rules
+      criteria: 'lambda label:True if label["vgg_throughput"]>=2 else False'
+    model_inference_throughput_rule:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Model
+      metrics:
+        - model-benchmarks:.*/.*bert.*/.*_inference_throughput
+        - model-benchmarks:.*/.*gpt.*/.*_inference_throughput
+        - model-benchmarks:.*/.*lstm.*/.*_inference_throughput
+    # Same as Rule 8, applied to inference: if 2+ CNN models suffer > 5% downgrade, label it as Not acceptable
+    resnet_inference_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - model-benchmarks:.*/.*resnet.*/.*_inference_throughput
+    vgg_inference_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - model-benchmarks:.*/.*vgg.*/.*_inference_throughput
+    densenet_inference_throughput:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      store: true
+      categories: CNN
+      metrics:
+        - model-benchmarks:.*/.*densenet.*/.*_inference_throughput
+    cnn_inference_throughput_rule:
+      categories: CNN
+      function: multi_rules
+      criteria: 'lambda label:True if label["resnet_inference_throughput"]+label["densenet_inference_throughput"]>=2 else False'
+    vgg_inference_throughput_rule:
+      categories: VGG
+      function: multi_rules
+      criteria: 'lambda label:True if label["vgg_inference_throughput"]>=2 else False'