Bug: Executor - Fix executor for Benchmark Execution Without Explicit Framework Field (#636)

**Description**
Fix the executor so that benchmark execution no longer fails when a benchmark's configuration omits an explicit `frameworks` field; the executor now falls back to `Framework.NONE.value` for such entries.
RyoYang authored Aug 20, 2024
1 parent 7af75df commit 96cc4d9
Showing 4 changed files with 68 additions and 24 deletions.
41 changes: 20 additions & 21 deletions superbench/executor/executor.py
@@ -228,29 +228,16 @@ def exec(self):
logger.warning('Monitor can not support CPU platform.')

benchmark_real_name = benchmark_name.split(':')[0]
if 'frameworks' in benchmark_config:
for framework in benchmark_config.frameworks or [Framework.NONE.value]:
if benchmark_real_name == 'model-benchmarks' or (
':' not in benchmark_name and benchmark_name.endswith('_models')
):
for model in benchmark_config.models:
full_name = f'{benchmark_name}/{framework}-{model}'
logger.info('Executor is going to execute %s.', full_name)
context = BenchmarkRegistry.create_benchmark_context(
model,
platform=self.__get_platform(),
framework=Framework(framework.lower()),
parameters=self.__get_arguments(
{} if 'parameters' not in benchmark_config else benchmark_config.parameters
)
)
result = self.__exec_benchmark(full_name, context)
benchmark_results.append(result)
else:
full_name = benchmark_name
frameworks = benchmark_config.get('frameworks', [Framework.NONE.value])
for framework in frameworks:
if benchmark_real_name == 'model-benchmarks' or (
':' not in benchmark_name and benchmark_name.endswith('_models')
):
for model in benchmark_config.models:
full_name = f'{benchmark_name}/{framework}-{model}'
logger.info('Executor is going to execute %s.', full_name)
context = BenchmarkRegistry.create_benchmark_context(
benchmark_real_name,
model,
platform=self.__get_platform(),
framework=Framework(framework.lower()),
parameters=self.__get_arguments(
@@ -259,6 +246,18 @@
)
result = self.__exec_benchmark(full_name, context)
benchmark_results.append(result)
else:
full_name = benchmark_name
logger.info('Executor is going to execute %s.', full_name)
context = BenchmarkRegistry.create_benchmark_context(
benchmark_real_name,
platform=self.__get_platform(),
framework=Framework(framework.lower()),
parameters=self.
__get_arguments({} if 'parameters' not in benchmark_config else benchmark_config.parameters)
)
result = self.__exec_benchmark(full_name, context)
benchmark_results.append(result)

if monitor:
monitor.stop()
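The core of the executor fix is visible in this hunk: instead of iterating `benchmark_config.frameworks` directly, the loop now reads `benchmark_config.get('frameworks', [Framework.NONE.value])`, so a benchmark entry that omits the field still gets exactly one pass with the `none` framework. Below is a minimal sketch of that fallback pattern; the `Framework` enum and the plain-dict example entries are simplified stand-ins for SuperBench's real `Framework` enum and OmegaConf-backed benchmark config.

```python
from enum import Enum


class Framework(Enum):
    """Simplified stand-in for superbench.benchmarks.Framework."""
    NONE = 'none'
    PYTORCH = 'pytorch'


def resolve_frameworks(benchmark_config):
    """Return the frameworks to iterate over, defaulting when the field is absent."""
    return benchmark_config.get('frameworks', [Framework.NONE.value])


# Entry with an explicit frameworks field.
print(resolve_frameworks({'frameworks': ['pytorch'], 'models': ['resnet50']}))  # ['pytorch']
# Entry without the field still yields one iteration, using the 'none' framework.
print(resolve_frameworks({'parameters': {'num_warmup': 16}}))                   # ['none']
```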
4 changes: 2 additions & 2 deletions superbench/runner/runner.py
@@ -84,7 +84,7 @@ def __validate_sb_config(self): # noqa: C901
if 'proc_num' not in mode:
self._sb_benchmarks[name].modes[idx].proc_num = 8
elif mode.name == 'mpi':
if 'machinefile' not in mode:
if 'mca' not in mode:
self._sb_benchmarks[name].modes[idx].mca = {
'pml': 'ob1',
'btl': '^openib',
@@ -448,7 +448,7 @@ def _run_proc(self, benchmark_name, mode, vars):
mode.env.update({'SB_MODE_SERIAL_INDEX': mode.serial_index, 'SB_MODE_PARALLEL_INDEX': mode.parallel_index})
logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank)

timeout = self._sb_benchmarks[benchmark_name].get('timeout', 60)
timeout = self._sb_benchmarks[benchmark_name].get('timeout', None)
if isinstance(timeout, int):
timeout = max(timeout, 60)

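Two behavioral changes sit in runner.py: the MPI-mode validation now checks for a missing `mca` field (rather than `machinefile`) before filling in default `mca` settings, and `_run_proc` now treats a missing `timeout` as no timeout at all instead of defaulting to 60 seconds, while still clamping explicit integer values to a 60-second floor. A rough sketch of the new timeout resolution, assuming `None` is passed through downstream to mean no enforced timeout:

```python
def resolve_timeout(benchmark_config: dict):
    """Resolve a benchmark's timeout the way the updated _run_proc lines above do."""
    timeout = benchmark_config.get('timeout', None)
    if isinstance(timeout, int):
        # Explicit integer timeouts are never allowed below 60 seconds.
        timeout = max(timeout, 60)
    return timeout


assert resolve_timeout({'timeout': 120}) == 120   # large values pass through unchanged
assert resolve_timeout({'timeout': 30}) == 60     # small values are raised to the floor
assert resolve_timeout({}) is None                # missing field -> no timeout is applied
```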
4 changes: 3 additions & 1 deletion tests/executor/test_executor.py
@@ -166,5 +166,7 @@ def test_exec_default_benchmarks(self, mock_launch_benchmark):
self.assertTrue(p.is_dir())
self.assertTrue((p / 'results.json').is_file())
with (p / 'results.json').open() as f:
for result in json.load(f):
results = json.load(f)
self.assertTrue(len(results) > 0)
for result in results:
self.assertIn(benchmark_name, result['name'])
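The test tightening here guards against a silent gap: the old version looped over `json.load(f)` and asserted on each item, so an empty `results.json` produced zero assertions and a spurious pass. A small illustration of that vacuous-loop pitfall:

```python
import json

# A for-loop over an empty list runs its body zero times, so per-item
# assertions are vacuously satisfied when results.json contains '[]'.
results = json.loads('[]')
checked = 0
for result in results:
    checked += 1            # never executed for an empty payload
print(checked)              # 0 -- the old test would still have passed here

# The added guard (self.assertTrue(len(results) > 0)) turns this case into a failure.
```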
43 changes: 43 additions & 0 deletions tests/runner/test_runner.py
@@ -41,6 +41,22 @@ def test_set_logger(self):
expected_log_file = Path(self.runner._sb_output_dir) / 'sb-run.log'
self.assertTrue(expected_log_file.is_file())

def test_validate_sb_config(self):
"""Test validate_sb_config."""
self.runner._SuperBenchRunner__validate_sb_config()
self.assertIn('env', self.runner._sb_config.superbench)
for name in self.runner._sb_benchmarks:
self.assertIn('modes', self.runner._sb_config.superbench.benchmarks[name])
for mode in self.runner._sb_config.superbench.benchmarks[name].modes:
self.assertIn('env', mode)
if mode.name == 'local':
self.assertIn('proc_num', mode)
self.assertIn('prefix', mode)
if mode.name == 'torch.distributed':
self.assertIn('proc_num', mode)
if mode.name == 'mpi':
self.assertIn('mca', mode)

def test_get_failure_count(self):
"""Test get_failure_count."""
self.assertEqual(0, self.runner.get_failure_count())
@@ -410,3 +426,30 @@ def test_generate_metric_name(self):
test_case['run_count'], test_case['curr_rank'], test_case['curr_run']
), test_case['expected']
)

def test_run_proc_timeout(self):
"""Test run_proc_ timeout."""
self.runner._sb_benchmarks = {
'benchmark1': {
'timeout': 120
},
'benchmark2': {
'timeout': None
},
'benchmark3': {
'timeout': 30
},
}

test_cases = [
('benchmark1', 120),
('benchmark2', None),
('benchmark3', 60),
]

for benchmark_name, expected_timeout in test_cases:
with self.subTest(benchmark_name=benchmark_name):
timeout = self.runner._sb_benchmarks[benchmark_name].get('timeout', None)
if isinstance(timeout, int):
timeout = max(timeout, 60)
self.assertEqual(timeout, expected_timeout)
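The new `test_validate_sb_config` exercises the default back-filling that `__validate_sb_config` performs per mode, including the `mca` defaults whose guard was corrected in the runner.py hunk above. A rough sketch of that step for an `mpi` mode entry, using a plain dict instead of the runner's OmegaConf config; only the two `mca` entries visible in the diff are reproduced, and the rest of the default dictionary is elided just as it is in the truncated hunk.

```python
def fill_mpi_defaults(mode: dict) -> dict:
    """Back-fill default mca settings for an 'mpi' mode entry (sketch only)."""
    if mode.get('name') == 'mpi' and 'mca' not in mode:
        mode['mca'] = {
            'pml': 'ob1',
            'btl': '^openib',
            # ...remaining defaults elided, as in the truncated hunk above
        }
    return mode


print(fill_mpi_defaults({'name': 'mpi'}))               # defaults injected
print(fill_mpi_defaults({'name': 'mpi', 'mca': {}}))    # an explicit value is left untouched
```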
