From 9f4880cb8ef56b0ef900ad11d28b51b27994a526 Mon Sep 17 00:00:00 2001
From: guoshzhao
Date: Wed, 22 Nov 2023 14:42:32 +0800
Subject: [PATCH] Analyzer - Generate baseline given results from multiple nodes. (#575)

**Description**

Generate a baseline given results from multiple nodes.

**Major Revision**
- Add sub-command `sb result generate-baseline`
- Add UT and docs

---------

Co-authored-by: 454314380 <454314380@qq.com>
Co-authored-by: Yuting Jiang
---
 dockerfile/cuda11.1.1.dockerfile              |   3 +-
 docs/cli.md                                   |  47 +++++
 docs/user-tutorial/baseline-generation.md     |  49 +++++
 superbench/analyzer/__init__.py               |   6 +-
 superbench/analyzer/baseline_generation.py    | 194 ++++++++++++++++++
 superbench/analyzer/file_handler.py           |   2 +-
 superbench/cli/_commands.py                   |  13 ++
 superbench/cli/_help.py                       |  21 ++
 superbench/cli/_result_handler.py             |  43 ++++
 tests/analyzer/test_generate_baseline.py      |  76 +++++++
 ...est_generate_baseline_diagnosis_rules.yaml |  23 +++
 .../test_generate_baseline_results.jsonl      |   3 +
 .../test_generate_baseline_summary_rules.yaml |  11 +
 website/sidebars.js                           |   1 +
 14 files changed, 489 insertions(+), 3 deletions(-)
 create mode 100644 docs/user-tutorial/baseline-generation.md
 create mode 100644 superbench/analyzer/baseline_generation.py
 create mode 100644 tests/analyzer/test_generate_baseline.py
 create mode 100644 tests/analyzer/test_generate_baseline_diagnosis_rules.yaml
 create mode 100644 tests/analyzer/test_generate_baseline_results.jsonl
 create mode 100644 tests/analyzer/test_generate_baseline_summary_rules.yaml

diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
index 6b3a2acb2..060985db1 100644
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -149,7 +149,8 @@ ADD third_party third_party
 RUN make -C third_party cuda
 
 ADD . .
-RUN python3 -m pip install --no-cache-dir .[nvworker] && \
+RUN python3 -m pip install --upgrade setuptools==65.7 && \
+    python3 -m pip install --no-cache-dir .[nvworker] && \
     make cppbuild && \
     make postinstall && \
     rm -rf .git
diff --git a/docs/cli.md b/docs/cli.md
index 1f6b13a7a..b35595ccb 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -295,6 +295,53 @@ Run result summary and output the results in html format:
 sb result summary --data-file outputs/results-summary.jsonl --rule-file rule.yaml --output-file-format html
 ```
 
+### `sb result generate-baseline`
+
+Generate the baseline file automatically from multiple machines' results according to rules defined in the rule file.
+
+```bash title="SB CLI"
+sb result generate-baseline --data-file
+                            --summary-rule-file
+                            [--diagnosis-rule-file]
+                            [--baseline-file]
+                            [--decimal-place-value]
+                            [--output-dir]
+```
+
+#### Required arguments
+
+| Name                        | Description                |
+|-----------------------------|----------------------------|
+| `--data-file` `-d`          | Path to raw data file.     |
+| `--summary-rule-file` `-sr` | Path to summary rule file. |
+
+#### Optional arguments
+
+| Name                          | Default | Description                                                                   |
+|-------------------------------|---------|-------------------------------------------------------------------------------|
+| `--diagnosis-rule-file` `-dr` | `None`  | Path to diagnosis rule file.                                                  |
+| `--baseline-file` `-b`        | `None`  | Path to previous baseline file.                                               |
+| `--decimal-place-value`       | `2`     | Number of valid decimal places to show in output.                             |
+| `--output-dir`                | `None`  | Path to output directory; `outputs/{datetime}` will be used if not specified. |
+
+#### Global arguments
+
+| Name          | Default | Description        |
+|---------------|---------|--------------------|
+| `--help` `-h` | N/A     | Show help message. |
+
+#### Examples
+
+Run result generate-baseline to generate the baseline.json file:
+```bash title="SB CLI"
+sb result generate-baseline --data-file outputs/results-summary.jsonl --summary-rule-file summary-rule.yaml --diagnosis-rule-file diagnosis-rule.yaml
+```
+
+Run result generate-baseline and merge with a previous baseline:
+```bash title="SB CLI"
+sb result generate-baseline --data-file outputs/results-summary.jsonl --summary-rule-file summary-rule.yaml --diagnosis-rule-file diagnosis-rule.yaml --baseline-file previous-baseline.json
+```
+
 ### `sb run`
 
 Run the SuperBench benchmarks distributedly.
diff --git a/docs/user-tutorial/baseline-generation.md b/docs/user-tutorial/baseline-generation.md
new file mode 100644
index 000000000..8e020293e
--- /dev/null
+++ b/docs/user-tutorial/baseline-generation.md
@@ -0,0 +1,49 @@
+---
+id: baseline-generation
+---
+
+# Baseline Generation
+
+## Introduction
+
+This tool generates a baseline JSON file from the raw benchmark results of multiple machines.
+
+## Usage
+
+1. [Install SuperBench](../getting-started/installation.mdx) on the local machine.
+
+2. Prepare the raw data and rule files on the local machine.
+
+3. Generate the baseline file automatically using the `sb result generate-baseline` command. The detailed usage can be found in [SuperBench CLI](../cli.md).
+
+   ```bash
+   sb result generate-baseline --data-file ./results-summary.jsonl --summary-rule-file ./summary-rule.yaml --diagnosis-rule-file ./diagnosis-rule.yaml --output-dir ${output-dir}
+   ```
+
+4. Find the output result file named `baseline.json` under `${output_dir}`.
+
+## Input
+
+The input includes 4 files:
+
+- **Raw Data**: a JSONL file containing multiple nodes' results, generated automatically by the SuperBench runner.
+
+:::tip Tips
+The raw data file can be found at `${output-dir}/results-summary.jsonl` after each successful run.
+:::
+
+- **Summary Rule File**: a YAML file that defines how to generate the result summary, including how to classify the metrics and which statistical methods (P50, mean, etc.) are applied.
+
+- **Diagnosis Rule File (optional)**: a YAML file that defines each metric's rules to filter out defective machines for diagnosis; no machines are filtered if it is not specified.
+
+- **Previous Baseline File (optional)**: a baseline file in JSON format produced by a previous run, to be merged into the latest baseline.
+
+### Rule File
+
+The **Summary Rule File** has the same format as the rule file defined in [Result Summary](./result-summary.md).
+
+The **Diagnosis Rule File** has the same format as the rule file defined in [Data Diagnosis](./data-diagnosis.md).
+
+## Output
+
+The baseline file (`baseline.json`) generated from multiple machines' results will be placed under `${output_dir}`.
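
For illustration, a generated `baseline.json` for the no-merge case might look like the following. This is only a sketch using the metrics and values from the test fixtures included later in this patch; real values depend entirely on your machines and rules:

```json
{
  "kernel-launch/event_time": 0.0055,
  "kernel-launch/wall_time": 0.009,
  "mem-bw/d2h_bw": 26.22,
  "mem-bw/h2d_bw": 26.07
}
```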
diff --git a/superbench/analyzer/__init__.py b/superbench/analyzer/__init__.py
index f4e27944d..38ec50b93 100644
--- a/superbench/analyzer/__init__.py
+++ b/superbench/analyzer/__init__.py
@@ -8,5 +8,9 @@ from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
 from superbench.analyzer.summary_op import SummaryOp, SummaryType
 from superbench.analyzer.result_summary import ResultSummary
+from superbench.analyzer.baseline_generation import BaselineGeneration
 
-__all__ = ['DataDiagnosis', 'DiagnosisRuleType', 'RuleOp', 'RuleBase', 'SummaryOp', 'SummaryType', 'ResultSummary']
+__all__ = [
+    'DataDiagnosis', 'DiagnosisRuleType', 'RuleOp', 'RuleBase', 'SummaryOp', 'SummaryType', 'ResultSummary',
+    'BaselineGeneration'
+]
diff --git a/superbench/analyzer/baseline_generation.py b/superbench/analyzer/baseline_generation.py
new file mode 100644
index 000000000..b6844c0cb
--- /dev/null
+++ b/superbench/analyzer/baseline_generation.py
@@ -0,0 +1,194 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""A module for baseline generation."""
+
+from copy import deepcopy
+from pathlib import Path
+import json
+import re
+
+from joblib import Parallel, delayed
+import pandas as pd
+
+from superbench.common.utils import logger
+from superbench.analyzer import file_handler
+from superbench.analyzer import data_analysis
+from superbench.analyzer import DataDiagnosis
+from superbench.analyzer import ResultSummary
+from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
+
+
+class BaselineGeneration(DataDiagnosis):
+    """The class to generate baselines from raw data."""
+    def fix_threshold_outlier_detection(self, data_series, single_metric_with_baseline, metric, rule_op):
+        """Fix-threshold outlier detection algorithm.
+
+        Step 0: Put all data in the collection.
+        Step 1: Regenerate the collection.
+                Calculate the average of the collection as the baseline.
+                Remove all data that cannot pass the fixed threshold based on the new baseline.
+        Step 2: If no data has been removed in Step 1, go to Step 3; otherwise, go back to Step 1.
+        Step 3: Use the baseline and the fixed threshold for outlier detection.
+
+        Args:
+            data_series (pd.Series): data of the metric.
+            single_metric_with_baseline (dict): baseline of the single metric in 'metrics' in 2-layer dict format.
+            metric (str): the name of the metric to execute the algorithm.
+            rule_op (function): diagnosis rule op function.
+
+        Returns:
+            tuple: the baseline of the metric and the normal data of the metric.
+        """
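+        # Illustrative walk-through with hypothetical numbers: suppose the metric values
+        # across nodes are [10, 10, 10, 14] and the diagnosis rule is variance with
+        # criteria 'lambda x:x>0.05' (more than 5% above the baseline is an outlier).
+        #   Pass 1: baseline = mean = 11; 14 deviates by (14 - 11) / 11 ~= 0.27 > 0.05,
+        #           so 14 is dropped and the pass is marked not clean.
+        #   Pass 2: baseline = mean = 10; no remaining value violates the rule,
+        #           so the loop exits and (10, [10, 10, 10]) is returned.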
+ """ + if single_metric_with_baseline['metrics'][metric] \ + is not None and single_metric_with_baseline['metrics'][metric] != -1: + return single_metric_with_baseline['metrics'][metric] + tmp_single_metric_with_baseline = deepcopy(single_metric_with_baseline) + tmp_single_metric_with_baseline['metrics'] = {} + clean = False + while clean is False: + clean = True + baseline_val = data_series.mean() + for val in data_series.index: + tmp_single_metric_with_baseline['metrics'][metric] = baseline_val + if baseline_val == 0: + break + data_row = pd.Series([data_series[val]], index=[metric]) + details = [] + categories = set() + summary_data_row = pd.Series(index=[metric], dtype=float) + violated_num = rule_op(data_row, tmp_single_metric_with_baseline, summary_data_row, details, categories) + if violated_num: + data_series = data_series.drop(val) + clean = False + baseline = tmp_single_metric_with_baseline['metrics'][metric] + return baseline, data_series + + def get_aggregate_data(self, raw_data_file, summary_rule_file): + r"""Aggregate raw data according to the summary rule file. + + If the metric is aggregated by rank (:\d+), remove the rank info to generate the metric name and aggregate data. + If the metric is aggregated by regex pattern, aggregate the data and copy to all metrics matches this pattern. + + Args: + raw_data_file (str): the file name of the raw data file. + summary_rule_file (str): the file name of the summary rule file. + + Returns: + DataFrame: aggregated data + """ + self.rs = ResultSummary() + rules = self.rs._preprocess(raw_data_file, summary_rule_file) + # parse rules for result summary + if not self.rs._parse_rules(rules): + return + aggregated_df = pd.DataFrame() + for rule in self.rs._sb_rules: + single_metric_rule = self.rs._sb_rules[rule] + metrics = list(single_metric_rule['metrics'].keys()) + data_df_of_rule = self.rs._raw_data_df[metrics] + if self.rs._sb_rules[rule]['aggregate']: + # if aggregate is True, aggregate in ranks + if self.rs._sb_rules[rule]['aggregate'] is True: + data_df_of_rule = data_analysis.aggregate(data_df_of_rule) + # if aggregate is not empty and is a pattern in regex, aggregate according to pattern + else: + pattern = self.rs._sb_rules[rule]['aggregate'] + data_df_of_rule_with_short_name = data_analysis.aggregate(data_df_of_rule, pattern) + data_df_of_rule = pd.DataFrame(columns=metrics) + # restore the columns of data_fd to full metric names + for metric in metrics: + short = '' + match = re.search(pattern, metric) + if match: + metric_in_list = list(metric) + for i in range(1, len(match.groups()) + 1): + metric_in_list[match.start(i):match.end(i)] = '*' + short = ''.join(metric_in_list) + data_df_of_rule[metric] = data_df_of_rule_with_short_name[short] + aggregated_df = pd.concat([aggregated_df, data_df_of_rule], axis=1) + return aggregated_df + + def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline): + """Generate the baseline in json format. + + Args: + algo (str): the algorithm to generate the baseline. + aggregated_df (DataFrame): aggregated data. + diagnosis_rule_file (str): the file name of the diagnosis rules which used in fix_threshold algorithm. + baseline (dict): existing baseline of some metrics. + + Returns: + dict: baseline of metrics defined in diagnosis_rule_files for fix_threshold algorithm or + defined in rule_summary_files for mean. 
+ """ + # re-organize metrics by benchmark names + self._benchmark_metrics_dict = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns)) + if algo == 'mean': + mean_df = self._raw_data_df.mean() + for metric in self._raw_data_df.columns: + if metric in baseline: + return baseline[metric] + baseline[metric] = mean_df[metric] + elif algo == 'fix_threshold': + # read diagnosis rules + rules = file_handler.read_rules(diagnosis_rule_file) + if not self._parse_rules_and_baseline(rules, baseline): + return baseline + else: + for rule in self._sb_rules: + single_metric_rule = self._sb_rules[rule] + metrics = list(single_metric_rule['metrics'].keys()) + function_name = self._sb_rules[rule]['function'] + rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name)) + outputs = Parallel(n_jobs=-1)( + delayed(self.fix_threshold_outlier_detection) + (aggregated_df[metric], single_metric_rule, metric, rule_op) for metric in metrics + ) + for index, out in enumerate(outputs): + baseline[metrics[index]] = out[0] + aggregated_df[metrics[index]] = out[1] + return baseline + + def run( + self, raw_data_file, summary_rule_file, diagnosis_rule_file, pre_baseline_file, algorithm, output_dir, digit=2 + ): + """Export baseline to json file. + + Args: + raw_data_file (str): Path to raw data jsonl file. + summary_rule_file (str): the file name of the summary rule file. + diagnosis_rule_file (str): the file name of the diagnosis rules which used in fix_threshold algorithm. + pre_baseline_file (str): the file name of the previous baseline file. + algorithm (str): the algorithm to generate the baseline. + output_dir (str): the directory to save the baseline file. + digit (int): the number of digits after the decimal point. + """ + try: + # aggregate results from different devices + self._raw_data_df = self.get_aggregate_data(raw_data_file, summary_rule_file) + # read existing baseline + baseline = {} + if pre_baseline_file: + baseline = file_handler.read_baseline(pre_baseline_file) + # generate baseline accordint to rules in diagnosis and fix threshold outlier detection method + baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline) + for metric in baseline: + val = baseline[metric] + if metric in self._raw_data_df: + if isinstance(self._raw_data_df[metric].iloc[0], float): + baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val + elif isinstance(self._raw_data_df[metric].iloc[0], int): + baseline[metric] = int(val) + else: + try: + baseline[metric] = float(val) + except Exception as e: + logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e))) + baseline = json.dumps(baseline, indent=2, sort_keys=True) + baseline = re.sub(r': \"(\d+.?\d*)\"', r': \1', baseline) + with (Path(output_dir) / 'baseline.json').open('w') as f: + f.write(baseline) + + except Exception as e: + logger.error('Analyzer: generate baseline failed, msg: {}'.format(str(e))) diff --git a/superbench/analyzer/file_handler.py b/superbench/analyzer/file_handler.py index 2337435ea..f9f4065f9 100644 --- a/superbench/analyzer/file_handler.py +++ b/superbench/analyzer/file_handler.py @@ -39,7 +39,7 @@ def read_raw_data(raw_data_path): raw_data_df = raw_data_df.rename(raw_data_df['node']) raw_data_df = raw_data_df.drop(columns=['node']) except Exception as e: - logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data fomat - {}'.format(str(e))) + logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data format - 
diff --git a/superbench/analyzer/file_handler.py b/superbench/analyzer/file_handler.py
index 2337435ea..f9f4065f9 100644
--- a/superbench/analyzer/file_handler.py
+++ b/superbench/analyzer/file_handler.py
@@ -39,7 +39,7 @@ def read_raw_data(raw_data_path):
         raw_data_df = raw_data_df.rename(raw_data_df['node'])
         raw_data_df = raw_data_df.drop(columns=['node'])
     except Exception as e:
-        logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data fomat - {}'.format(str(e)))
+        logger.log_and_raise(exception=IOError, msg='Analyzer: invalid raw data format - {}'.format(str(e)))
 
     return raw_data_df
diff --git a/superbench/cli/_commands.py b/superbench/cli/_commands.py
index 2122034a3..e8ae9af05 100644
--- a/superbench/cli/_commands.py
+++ b/superbench/cli/_commands.py
@@ -31,6 +31,7 @@ def load_command_table(self, args):
         with CommandGroup(self, 'result', 'superbench.cli._result_handler#{}') as g:
             g.command('diagnosis', 'diagnosis_command_handler')
             g.command('summary', 'summary_command_handler')
+            g.command('generate-baseline', 'generate_baseline_command_handler')
         return super().load_command_table(args)
 
     def load_arguments(self, command):
@@ -77,6 +78,18 @@ def load_arguments(self, command):
         with ArgumentsContext(self, 'result') as ac:
             ac.argument('raw_data_file', options_list=('--data-file', '-d'), type=str, help='Path to raw data file.')
             ac.argument('rule_file', options_list=('--rule-file', '-r'), type=str, help='Path to rule file.')
+            ac.argument(
+                'summary_rule_file',
+                options_list=('--summary-rule-file', '-sr'),
+                type=str,
+                help='Path to summary rule file.'
+            )
+            ac.argument(
+                'diagnosis_rule_file',
+                options_list=('--diagnosis-rule-file', '-dr'),
+                type=str,
+                help='Path to diagnosis rule file.'
+            )
             ac.argument(
                 'baseline_file', options_list=('--baseline-file', '-b'), type=str, help='Path to baseline file.'
             )
diff --git a/superbench/cli/_help.py b/superbench/cli/_help.py
index fb7f87973..8f7b0874e 100644
--- a/superbench/cli/_help.py
+++ b/superbench/cli/_help.py
@@ -187,6 +187,27 @@
             --output-file-format html
 """.format(cli_name=CLI_NAME)
 
+helps['result generate-baseline'] = """
+    type: command
+    short-summary: >
+        Generate the baseline of benchmarking results from a JSONL file
+        according to rules defined in the rule file.
+    examples:
+        - name: run result generate-baseline to generate the baseline.json file
+          text: >
+            {cli_name} result generate-baseline
+            --data-file outputs/results-summary.jsonl
+            --summary-rule-file summary-rule.yaml
+            --diagnosis-rule-file diagnosis-rule.yaml
+        - name: run result generate-baseline and merge with a previous baseline
+          text: >
+            {cli_name} result generate-baseline
+            --data-file outputs/results-summary.jsonl
+            --summary-rule-file summary-rule.yaml
+            --diagnosis-rule-file diagnosis-rule.yaml
+            --baseline-file previous-baseline.json
+""".format(cli_name=CLI_NAME)
+
 
 class SuperBenchCLIHelp(CLIHelp):
     """SuperBench CLI help loader."""
diff --git a/superbench/cli/_result_handler.py b/superbench/cli/_result_handler.py
index 94515a9d3..7d90bc2b6 100644
--- a/superbench/cli/_result_handler.py
+++ b/superbench/cli/_result_handler.py
@@ -7,6 +7,7 @@
 
 from superbench.analyzer import DataDiagnosis
 from superbench.analyzer import ResultSummary
+from superbench.analyzer import BaselineGeneration
 from superbench.common.utils import create_sb_output_dir
 from superbench.cli._handler import check_argument_file
 
@@ -73,3 +74,45 @@ def summary_command_handler(raw_data_file, rule_file, output_dir=None, output_fi
         ResultSummary().run(raw_data_file, rule_file, sb_output_dir, output_file_format, decimal_place_value)
     except Exception as ex:
         raise RuntimeError('Failed to run summary command.') from ex
+
+
+def generate_baseline_command_handler(
+    raw_data_file,
+    summary_rule_file,
+    diagnosis_rule_file=None,
+    baseline_file=None,
+    output_dir=None,
+    decimal_place_value=2
+):
+    """Run result generate-baseline.
+
+    If diagnosis_rule_file is None, use the mean of the data as the baseline.
+    If diagnosis_rule_file is not None, use the rules in diagnosis_rule_file to run the fix_threshold algorithm.
+
+    Args:
+        raw_data_file (str): path to the raw data JSONL file.
+        summary_rule_file (str): the file name of the summary rule file.
+        diagnosis_rule_file (str): the file name of the diagnosis rule file used by the fix_threshold algorithm.
+        baseline_file (str): the file name of the previous baseline file to be merged with the current baseline.
+        output_dir (str): the directory to save the baseline file.
+        decimal_place_value (int): the number of digits after the decimal point.
+    """
+    try:
+        # Create output directory
+        sb_output_dir = create_sb_output_dir(output_dir)
+        # Check arguments
+        check_argument_file('raw_data_file', raw_data_file)
+        check_argument_file('rule_file', summary_rule_file)
+        algorithm = 'mean'
+        if diagnosis_rule_file:
+            algorithm = 'fix_threshold'
+            check_argument_file('rule_file', diagnosis_rule_file)
+        if baseline_file:
+            check_argument_file('baseline_file', baseline_file)
+        # Run result generate-baseline
+        BaselineGeneration().run(
+            raw_data_file, summary_rule_file, diagnosis_rule_file, baseline_file, algorithm, sb_output_dir,
+            decimal_place_value
+        )
+    except Exception as ex:
+        raise RuntimeError('Failed to run generate-baseline command.') from ex
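
The handler above is a thin wrapper over `BaselineGeneration.run()`. A minimal programmatic equivalent would be the sketch below; the file paths are placeholders:

```python
from superbench.analyzer import BaselineGeneration

# Supplying a diagnosis rule file selects the 'fix_threshold' algorithm,
# matching what generate_baseline_command_handler does above.
BaselineGeneration().run(
    'outputs/results-summary.jsonl',    # raw_data_file
    'summary-rule.yaml',                # summary_rule_file
    'diagnosis-rule.yaml',              # diagnosis_rule_file
    None,                               # pre_baseline_file: nothing to merge
    'fix_threshold',                    # algorithm
    'outputs',                          # output_dir where baseline.json is written
    digit=2,                            # decimal places kept in the output
)
```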
diff --git a/tests/analyzer/test_generate_baseline.py b/tests/analyzer/test_generate_baseline.py
new file mode 100644
index 000000000..27eacd420
--- /dev/null
+++ b/tests/analyzer/test_generate_baseline.py
@@ -0,0 +1,76 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for BaselineGeneration module."""
+
+import unittest
+import json
+from pathlib import Path
+
+from superbench.analyzer import BaselineGeneration
+import superbench.analyzer.file_handler as file_handler
+
+
+class TestBaselineGeneration(unittest.TestCase):
+    """Test for BaselineGeneration class."""
+    def setUp(self):
+        """Method called to prepare the test fixture."""
+        self.parent_path = Path(__file__).parent
+        self.test_raw_data = str(self.parent_path / 'test_generate_baseline_results.jsonl')
+        self.test_summary_rule_file = str(self.parent_path / 'test_generate_baseline_summary_rules.yaml')
+        self.test_diagnosis_rule_file = str(self.parent_path / 'test_generate_baseline_diagnosis_rules.yaml')
+        self.output_baseline_file = str(self.parent_path / 'baseline.json')
+        self.pre_baseline_file = str(self.parent_path / 'pre_baseline.json')
+
+    def tearDown(self):
+        """Method called after the test method has been called and the result recorded."""
+        for file in [self.output_baseline_file, self.pre_baseline_file]:
+            p = Path(file)
+            if p.is_file():
+                p.unlink()
+
+    def test_baseline_generation_run(self):
+        """Test for the run process of result generate-baseline."""
+        # Test - generate baseline without previous baseline
+        BaselineGeneration().run(
+            self.test_raw_data,
+            self.test_summary_rule_file,
+            self.test_diagnosis_rule_file,
+            None,
+            'fix_threshold',
+            str(self.parent_path),
+            digit=2
+        )
+        baseline = file_handler.read_baseline(self.output_baseline_file)
+        expected_baseline = {
+            'kernel-launch/event_time': 0.0055,
+            'kernel-launch/wall_time': 0.009,
+            'mem-bw/d2h_bw': 26.22,
+            'mem-bw/h2d_bw': 26.07
+        }
+        assert (expected_baseline == baseline)
+
+        # Test - generate baseline with previous baseline
+        pre_baseline = {'gemm-flops/FP32': 18318.4, 'gemm-flops/FP16': 33878}
+        with open(self.pre_baseline_file, 'w') as f:
+            json.dump(pre_baseline, f)
+
+        BaselineGeneration().run(
+            self.test_raw_data,
+            self.test_summary_rule_file,
+            self.test_diagnosis_rule_file,
+            self.pre_baseline_file,
+            'fix_threshold',
+            str(self.parent_path),
+            digit=2
+        )
+        baseline = file_handler.read_baseline(self.output_baseline_file)
+        expected_baseline = {
+            'kernel-launch/event_time': 0.0055,
+            'kernel-launch/wall_time': 0.009,
+            'mem-bw/d2h_bw': 26.22,
+            'mem-bw/h2d_bw': 26.07,
+            'gemm-flops/FP32': 18318.4,
+            'gemm-flops/FP16': 33878
+        }
+        assert (expected_baseline == baseline)
diff --git a/tests/analyzer/test_generate_baseline_diagnosis_rules.yaml b/tests/analyzer/test_generate_baseline_diagnosis_rules.yaml
new file mode 100644
index 000000000..2a8dbd9a9
--- /dev/null
+++ b/tests/analyzer/test_generate_baseline_diagnosis_rules.yaml
@@ -0,0 +1,23 @@
+# SuperBench rules
+version: v0.8
+superbench:
+  rules:
+    rule0:
+      function: variance
+      criteria: 'lambda x:x>0.05'
+      categories: KernelLaunch
+      metrics:
+      - kernel-launch:*.*/.*_time
+    rule1:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Mem
+      metrics:
+      - mem-bw:*.*/.*_bw
+    failure_rule:
+      function: value
+      criteria: 'lambda x:x>0'
+      categories: FailedTest
+      metrics:
+      - kernel-launch:*.*/return_code
+      - mem-bw:*.*/return_code
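
For context on the `variance` criteria above (a sketch, assuming the rule computes the relative deviation `(value - baseline) / baseline` as in the data diagnosis tutorial): `lambda x:x>0.05` flags latency values more than 5% above the baseline, while `lambda x:x<-0.05` flags bandwidth more than 5% below it.

```python
# Hypothetical samples checked against baselines like those generated by the test above.
baseline_time, observed_time = 0.0055, 0.0060
print((lambda x: x > 0.05)((observed_time - baseline_time) / baseline_time))    # True, ~9% slower

baseline_bw, observed_bw = 26.22, 26.0
print((lambda x: x < -0.05)((observed_bw - baseline_bw) / baseline_bw))         # False, <1% below
```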
diff --git a/tests/analyzer/test_generate_baseline_results.jsonl b/tests/analyzer/test_generate_baseline_results.jsonl
new file mode 100644
index 000000000..87437ecc2
--- /dev/null
+++ b/tests/analyzer/test_generate_baseline_results.jsonl
@@ -0,0 +1,3 @@
+{"node": "sb-validation-01", "kernel-launch/return_code:0": 0, "kernel-launch/return_code:1": 0, "kernel-launch/return_code:2": 0, "kernel-launch/return_code:3": 0, "kernel-launch/return_code:4": 0, "kernel-launch/return_code:5": 0, "kernel-launch/return_code:6": 0, "kernel-launch/return_code:7": 0, "kernel-launch/event_time:0": 0.00542, "kernel-launch/event_time:1": 0.00559, "kernel-launch/event_time:2": 0.0054, "kernel-launch/event_time:3": 0.00537, "kernel-launch/event_time:4": 0.00562, "kernel-launch/event_time:5": 0.00558, "kernel-launch/event_time:6": 0.00557, "kernel-launch/event_time:7": 0.00556, "kernel-launch/wall_time:0": 0.00883, "kernel-launch/wall_time:1": 0.00898, "kernel-launch/wall_time:2": 0.0093, "kernel-launch/wall_time:3": 0.00928, "kernel-launch/wall_time:4": 0.00873, "kernel-launch/wall_time:5": 0.0088, "kernel-launch/wall_time:6": 0.00877, "kernel-launch/wall_time:7": 0.00875, "mem-bw/return_code:0": 0, "mem-bw/return_code:1": 0, "mem-bw/return_code:2": 0, "mem-bw/return_code:3": 0, "mem-bw/return_code:4": 0, "mem-bw/return_code:5": 0, "mem-bw/return_code:6": 0, "mem-bw/return_code:7": 0, "mem-bw/h2d_bw:0": 26.1, "mem-bw/h2d_bw:1": 26.1, "mem-bw/h2d_bw:2": 26.2, "mem-bw/h2d_bw:3": 26.1, "mem-bw/h2d_bw:4": 26.1, "mem-bw/h2d_bw:5": 26.1, "mem-bw/h2d_bw:6": 26.1, "mem-bw/h2d_bw:7": 26.1, "mem-bw/d2h_bw:0": 26.3, "mem-bw/d2h_bw:1": 26.3, "mem-bw/d2h_bw:2": 26.3, "mem-bw/d2h_bw:3": 26.3, "mem-bw/d2h_bw:4": 26.3, "mem-bw/d2h_bw:5": 26.3, "mem-bw/d2h_bw:6": 26.3, "mem-bw/d2h_bw:7": 26.3}
+{"node": "sb-validation-02", "kernel-launch/return_code:0": 0, "kernel-launch/return_code:1": 0, "kernel-launch/return_code:2": 0, "kernel-launch/return_code:3": 0, "kernel-launch/return_code:4": 0, "kernel-launch/return_code:5": 0, "kernel-launch/return_code:6": 0, "kernel-launch/return_code:7": 0, "kernel-launch/event_time:0": 0.00542, "kernel-launch/event_time:1": 0.00531, "kernel-launch/event_time:2": 0.00538, "kernel-launch/event_time:3": 0.00544, "kernel-launch/event_time:4": 0.00559, "kernel-launch/event_time:5": 0.0055, "kernel-launch/event_time:6": 0.00557, "kernel-launch/event_time:7": 0.00562, "kernel-launch/wall_time:0": 0.00884, "kernel-launch/wall_time:1": 0.00927, "kernel-launch/wall_time:2": 0.0089, "kernel-launch/wall_time:3": 0.00933, "kernel-launch/wall_time:4": 0.00882, "kernel-launch/wall_time:5": 0.0093, "kernel-launch/wall_time:6": 0.00884, "kernel-launch/wall_time:7": 0.00884, "mem-bw/return_code:0": 0, "mem-bw/return_code:1": 0, "mem-bw/return_code:2": 0, "mem-bw/return_code:3": 0, "mem-bw/return_code:4": 0, "mem-bw/return_code:5": 0, "mem-bw/return_code:6": 0, "mem-bw/return_code:7": 0, "mem-bw/h2d_bw:0": 26.1, "mem-bw/h2d_bw:1": 26.1, "mem-bw/h2d_bw:2": 26.1, "mem-bw/h2d_bw:3": 26.2, "mem-bw/h2d_bw:4": 26.1, "mem-bw/h2d_bw:5": 26.1, "mem-bw/h2d_bw:6": 26.1, "mem-bw/h2d_bw:7": 26.1, "mem-bw/d2h_bw:0": 26.3, "mem-bw/d2h_bw:1": 26.3, "mem-bw/d2h_bw:2": 26.3, "mem-bw/d2h_bw:3": 26.3, "mem-bw/d2h_bw:4": 26.2, "mem-bw/d2h_bw:5": 26.2, "mem-bw/d2h_bw:6": 26.3, "mem-bw/d2h_bw:7": 26.3}
+{"node": "sb-validation-03", "kernel-launch/return_code:0": 0, "kernel-launch/return_code:1": 0, "kernel-launch/return_code:2": 0, "kernel-launch/return_code:3": 0, "kernel-launch/return_code:4": 0, "kernel-launch/return_code:5": 0, "kernel-launch/return_code:6": 0, "kernel-launch/return_code:7": 0, "kernel-launch/event_time:0": 0.00561, "kernel-launch/event_time:1": 0.00564, "kernel-launch/event_time:2": 0.00585, "kernel-launch/event_time:3": 0.00553, "kernel-launch/event_time:4": 0.00542, "kernel-launch/event_time:5": 0.00542, "kernel-launch/event_time:6": 0.00542, "kernel-launch/event_time:7": 0.00538, "kernel-launch/wall_time:0": 0.0089, "kernel-launch/wall_time:1": 0.00928, "kernel-launch/wall_time:2": 0.00954, "kernel-launch/wall_time:3": 0.00938, "kernel-launch/wall_time:4": 0.00885, "kernel-launch/wall_time:5": 0.00931, "kernel-launch/wall_time:6": 0.0088, "kernel-launch/wall_time:7": 0.00877, "mem-bw/return_code:0": 0, "mem-bw/return_code:1": 0, "mem-bw/return_code:2": 0, "mem-bw/return_code:3": 0, "mem-bw/return_code:4": 0, "mem-bw/return_code:5": 0, "mem-bw/return_code:6": 0, "mem-bw/return_code:7": 0, "mem-bw/h2d_bw:0": 26.0, "mem-bw/h2d_bw:1": 25.9, "mem-bw/h2d_bw:2": 25.8, "mem-bw/h2d_bw:3": 25.8, "mem-bw/h2d_bw:4": 26.1, "mem-bw/h2d_bw:5": 26.1, "mem-bw/h2d_bw:6": 26.1, "mem-bw/h2d_bw:7": 26.1, "mem-bw/d2h_bw:0": 26.1, "mem-bw/d2h_bw:1": 26.2, "mem-bw/d2h_bw:2": 26.0, "mem-bw/d2h_bw:3": 26.2, "mem-bw/d2h_bw:4": 26.1, "mem-bw/d2h_bw:5": 26.2, "mem-bw/d2h_bw:6": 25.9, "mem-bw/d2h_bw:7": 25.9}
diff --git a/tests/analyzer/test_generate_baseline_summary_rules.yaml b/tests/analyzer/test_generate_baseline_summary_rules.yaml
new file mode 100644
index 000000000..d82b2cb63
--- /dev/null
+++ b/tests/analyzer/test_generate_baseline_summary_rules.yaml
@@ -0,0 +1,11 @@
+# SuperBench rules
+version: v0.8
+superbench:
+  rules:
+    micro-aggregation:
+      statistics: mean
+      categories: MICRO
+      aggregate: True
+      metrics:
+      - mem-bw:*.*/.*_bw
+      - kernel-launch:*.*/.*_time
diff --git a/website/sidebars.js b/website/sidebars.js
index 391b150ab..44f42763a 100644
--- a/website/sidebars.js
+++ b/website/sidebars.js
@@ -33,6 +33,7 @@
       'user-tutorial/system-config',
       'user-tutorial/data-diagnosis',
      'user-tutorial/result-summary',
+      'user-tutorial/baseline-generation',
       'user-tutorial/monitor',
       'user-tutorial/container-images',
     ],