diff --git a/superbench/analyzer/generate_statistic.py b/superbench/analyzer/generate_statistic.py index 13cd146f9..80ccaa8f8 100644 --- a/superbench/analyzer/generate_statistic.py +++ b/superbench/analyzer/generate_statistic.py @@ -5,6 +5,7 @@ import argparse import os +import natsort as ns from joblib import Parallel, delayed import pandas as pd @@ -72,6 +73,8 @@ def output_excel(self, excel_file, stat_df, digit=2): for benchmark in self._benchmark_metrics_dict: benchmark_df = stat_df[self._benchmark_metrics_dict[benchmark]] + # Sort the columns in natural order (the equivalent of R's mixedsort(names(benchmark_df))) so numeric parts of metric names compare by value. + benchmark_df = benchmark_df.reindex(ns.natsorted(benchmark_df.columns), axis=1) sheet_name = benchmark if len(benchmark) <= 30 else benchmark.split('-')[-1] benchmark_df.to_excel(writer, sheet_name=sheet_name) worksheet = writer.sheets[sheet_name] diff --git a/superbench/analyzer/rules/aggregation_rules.yaml b/superbench/analyzer/rules/aggregation_rules.yaml index db8e7a87d..3b330b5bb 100644 --- a/superbench/analyzer/rules/aggregation_rules.yaml +++ b/superbench/analyzer/rules/aggregation_rules.yaml @@ -2,7 +2,7 @@ version: v0.5 superbench: rules: - model-benchmarks: + model-train-benchmarks: statistics: - mean categories: model @@ -14,6 +14,13 @@ superbench: - resnet_models/pytorch-resnet\d*/.*_train_throughput - vgg_models/pytorch-vgg\d*/.*_train_throughput - densenet_models/.*/.*_train_throughput + model-inference-benchmarks: + statistics: + - mean + categories: model + aggregate: True + metrics: + - model-benchmarks:.*/.*/.*_inference_throughput:\d+ micro-aggregation: statistics: mean categories: MICRO1 diff --git a/superbench/analyzer/rules/diagnosis_rules.yaml b/superbench/analyzer/rules/diagnosis_rules.yaml index 8ece570ab..3771566d3 100644 --- a/superbench/analyzer/rules/diagnosis_rules.yaml +++ b/superbench/analyzer/rules/diagnosis_rules.yaml @@ -191,4 +191,41 @@ superbench: categories: VGG function: multi_rules criteria: 'lambda label:True if 
label["vgg_throughput"]>=2 else False' - + model_inference_throughput_rule: + function: variance + criteria: 'lambda x:x<-0.05' + categories: Model + metrics: + - model-benchmarks:.*/.*bert.*/.*_inference_throughput + - model-benchmarks:.*/.*gpt.*/.*_inference_throughput + - model-benchmarks:.*/.*lstm.*/.*_inference_throughput + # Rule 8: If 2+ CNN models suffer > 5% downgrade, label it as Not acceptable + resnet_inference_throughput: + function: variance + criteria: 'lambda x:x<-0.05' + store: true + categories: CNN + metrics: + - model-benchmarks:.*/.*resnet.*/.*_inference_throughput + vgg_inference_throughput: + function: variance + criteria: 'lambda x:x<-0.05' + store: true + categories: CNN + metrics: + - model-benchmarks:.*/.*vgg.*/.*_inference_throughput + densenet_inference_throughput: + function: variance + criteria: 'lambda x:x<-0.05' + store: true + categories: CNN + metrics: + - model-benchmarks:.*/.*densenet.*/.*_inference_throughput + cnn_inference_throughput_rule: + categories: CNN + function: multi_rules + criteria: 'lambda label:True if label["resnet_inference_throughput"]+label["densenet_inference_throughput"]>=2 else False' + vgg_inference_throughput_rule: + categories: VGG + function: multi_rules + criteria: 'lambda label:True if label["vgg_inference_throughput"]>=2 else False'