From 43d162298633811cdfb97544c0c6167d17fce37a Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 2 Apr 2024 15:06:57 +0800 Subject: [PATCH 01/30] add llmsuite Signed-off-by: Yu Wu --- python/fate_test/_parser.py | 67 ++++++++ python/fate_test/scripts/_utils.py | 12 +- python/fate_test/scripts/cli.py | 2 + python/fate_test/scripts/llmsuite_cli.py | 206 +++++++++++++++++++++++ 4 files changed, 286 insertions(+), 1 deletion(-) create mode 100644 python/fate_test/scripts/llmsuite_cli.py diff --git a/python/fate_test/_parser.py b/python/fate_test/_parser.py index 55e2aa3..8c18958 100644 --- a/python/fate_test/_parser.py +++ b/python/fate_test/_parser.py @@ -344,7 +344,74 @@ def load(path: Path): suite = PerformanceSuite(dataset, pipeline_jobs, path) return suite +""" +class LlmJob(object): + def __init__(self, job_name: str, script_path: Path, conf_path: Path, + loader: str, loader_conf_path: Path, tasks: typing.List[str], include_path: Path): + self.job_name = job_name + self.script_path = script_path + self.conf_path = conf_path + self.loader = loader + self.loader_conf_path = loader_conf_path + self.tasks = tasks + self.include_path = include_path + + +class LlmPair(object): + def __init__( + self, pair_name: str, jobs: typing.List[LlmJob] + ): + self.pair_name = pair_name + self.jobs = jobs + + +class LlmSuite(object): + def __init__( + self, pairs: typing.List[LlmPair], path: Path + ): + self.pairs = pairs + self.path = path + + @staticmethod + def load(path: Path): + with path.open("r") as f: + testsuite_config = yaml.safe_load(f) + + pairs = [] + for pair_name, pair_configs in testsuite_config.items(): + jobs = [] + for job_name, job_configs in pair_configs.items(): + script_path = path.parent.joinpath(job_configs["script"]).resolve() + if job_configs.get("conf"): + conf_path = path.parent.joinpath(job_configs["conf"]).resolve() + else: + conf_path = "" + loader = job_configs.get("loader", "") + if job_configs.get("loader_conf"): + loader_conf_path = 
path.parent.joinpath(job_configs["loader_conf"]).resolve() + else: + loader_conf_path = "" + tasks = job_configs.get("tasks", []) + include_path = job_configs.get("include_path", "") + if include_path and not os.path.isabs(include_path): + include_path = path.parent.joinpath(job_configs["include_path"]).resolve() + + jobs.append( + LlmJob( + job_name=job_name, script_path=script_path, conf_path=conf_path, + loader=loader, loader_conf_path=loader_conf_path, tasks=tasks, include_path=include_path + ) + ) + + pairs.append( + LlmPair( + pair_name=pair_name, jobs=jobs + ) + ) + suite = LlmSuite(pairs=pairs, path=path) + return suite +""" def non_success_summary(): status = {} for job in _config.non_success_jobs: diff --git a/python/fate_test/scripts/_utils.py b/python/fate_test/scripts/_utils.py index b3f60e3..576d83c 100644 --- a/python/fate_test/scripts/_utils.py +++ b/python/fate_test/scripts/_utils.py @@ -6,6 +6,7 @@ from pathlib import Path import click +from fate_llm.utils import LlmSuite from fate_test._client import Clients from fate_test._config import Config @@ -19,7 +20,7 @@ def _big_data_task(includes, guest_data_size, host_data_size, guest_feature_num, from fate_test.scripts import generate_mock_data def _find_testsuite_files(path): - suffix = ["testsuite.yaml", "benchmark.yaml", "performance.yaml"] + suffix = ["testsuite.yaml", "benchmark.yaml", "performance.yaml", "llmsuite.yaml"] if isinstance(path, str): path = Path(path) if path.is_file(): @@ -85,6 +86,8 @@ def _find_testsuite_files(path): suite = BenchmarkSuite.load(suite_path.resolve()) elif suite_type == "performance": suite = PerformanceSuite.load(suite_path.resolve()) + elif suite_type == "llmsuite": + suite = LlmSuite.load(suite_path.resolve()) else: raise ValueError(f"Unsupported suite type: {suite_type}. 
Only accept type 'testsuite' or 'benchmark'.") except Exception as e: @@ -207,3 +210,10 @@ def _update_data_config(suite, partitions=None): for data in suite.dataset: data.config['partitions'] = partitions data.partitions = partitions + + +def _obtain_model_output_path(config, job_id, task_name, client, role, party_id): + + output_path = os.path.join(config.data_base_dir, "fate_flow", + "model", job_id, role, party_id, task_name, "0", "output", "output_model") + return output_path diff --git a/python/fate_test/scripts/cli.py b/python/fate_test/scripts/cli.py index f59bd6c..48bcfaf 100644 --- a/python/fate_test/scripts/cli.py +++ b/python/fate_test/scripts/cli.py @@ -20,6 +20,7 @@ from fate_test.scripts.benchmark_cli import run_benchmark from fate_test.scripts.config_cli import config_group from fate_test.scripts.data_cli import data_group +from fate_test.scripts.llmsuite_cli import run_llmsuite # from fate_test.scripts.flow_test_cli import flow_group from fate_test.scripts.performance_cli import run_task # from fate_test.scripts.quick_test_cli import unittest_group @@ -32,6 +33,7 @@ "performance": run_task, "benchmark-quality": run_benchmark, "data": data_group, + "llmsuite": run_llmsuite # "unittest": unittest_group } diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py new file mode 100644 index 0000000..2c94fe2 --- /dev/null +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -0,0 +1,206 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import time +import uuid +from datetime import timedelta +from inspect import signature + +import click +import yaml +from fate_llm.scripts.eval_cli import run_job_eval + +from fate_test._client import Clients +from fate_test._config import Config +from fate_test._io import LOGGER, echo +from fate_test.scripts._options import SharedOptions +from fate_test.scripts._utils import _load_testsuites, _load_module_from_script +from fate_test.utils import extract_job_status + +""" +@click.option('-uj', '--update-job-parameters', default="{}", type=str, + help="a json string that represents mapping for replacing fields in job conf, example format: "'{job_name: param_name1: param_val1, param_name2=param_val2}'") +""" +@click.command("llmsuite") +@click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, + metavar="", + help="include *llmsuite.yaml under these paths") +@click.option('-e', '--exclude', type=click.Path(exists=True), multiple=True, + help="exclude *llmsuite.yaml under these paths") +@click.option('-a', '--algorithm-suite', type=str, multiple=True, + help="run built-in algorithm suite, if given, ignore include/exclude") +@click.option('-p', '--task-cores', type=int, help="processors per node") +@click.option('-m', '--timeout', type=int, + help="maximum running time of job") +@click.option("-g", '--glob', type=str, + help="glob string to filter sub-directory of path specified by ") +@click.option("--provider", type=str, + help="Select the fate version, for example: fate@2.0-beta") +@click.option('-c', '--eval-config', optional=True, type=click.Path(exists=True), help='Path to FATE Llm evaluation config. 
' + 'If none, use default config.') +@click.option('--skip-evaluate', is_flag=True, default=False, + help="skip evaluation after training model") +@SharedOptions.get_shared_options(hidden=True) +@click.pass_context +def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_cores, timeout, eval_config, skip_evaluate, **kwargs): + """ + process llmsuite + """ + ctx.obj.update(**kwargs) + ctx.obj.post_process() + config_inst = ctx.obj["config"] + if ctx.obj["engine_run"][0] is not None: + config_inst.update_conf(engine_run=dict(ctx.obj["engine_run"])) + if task_cores is not None: + config_inst.update_conf(task_cores=task_cores) + if timeout is not None: + config_inst.update_conf(timeout=timeout) + + + namespace = ctx.obj["namespace"] + yes = ctx.obj["yes"] + data_namespace_mangling = ctx.obj["namespace_mangling"] + # prepare output dir and json hooks + # _add_replace_hook(replace) + echo.welcome() + echo.echo(f"llmsuite namespace: {namespace}", fg='red') + echo.echo("loading llmsuites:") + if algorithm_suite: + #@todo: find built-in llmsuite path + algorithm_suite_path = [None] + suites = _load_testsuites(includes=algorithm_suite_path, excludes=None, glob=None, provider=provider, + suffix="llmsuite.yaml", suite_type="llmsuite") + else: + suites = _load_testsuites(includes=include, excludes=exclude, glob=glob, provider=provider, + suffix="llmsuite.yaml", suite_type="llmsuite") + for suite in suites: + echo.echo(f"\tllm groups({len(suite.pairs)}) {suite.path}") + if not yes and not click.confirm("running?"): + return + + echo.stdout_newline() + # with Clients(config_inst) as client: + client = Clients(config_inst) + + for i, suite in enumerate(suites): + # noinspection PyBroadException + try: + start = time.time() + echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') + os.environ['enable_pipeline_job_info_callback'] = '1' + try: + if eval_config: + config = {} + if eval_config is not None: + with 
eval_config.open("r") as f: + config.update(yaml.safe_load(f)) + eval_conf = config + else: + from fate_llm.utils.config import default_eval_config + eval_conf = default_eval_config() + _run_llmsuite_pairs(config_inst, suite, namespace, data_namespace_mangling, client, + skip_evaluate, eval_conf) + except Exception as e: + raise RuntimeError(f"exception occur while running benchmark jobs for {suite.path}") from e + + echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') + except Exception: + exception_id = uuid.uuid1() + echo.echo(f"exception in {suite.path}, exception_id={exception_id}") + LOGGER.exception(f"exception id: {exception_id}") + finally: + echo.stdout_newline() + # non_success_summary() + echo.farewell() + echo.echo(f"llmsuite namespace: {namespace}", fg='red') + + +@LOGGER.catch +def _run_llmsuite_pairs(config: Config, suite, namespace: str, + data_namespace_mangling: bool, clients: Clients, skip_evaluate: bool, eval_conf: dict): + client = clients['guest_0'] + guest_party_id = config.parties.role_to_party("guest")[0] + # pipeline demo goes here + pair_n = len(suite.pairs) + # fate_base = config.fate_base + # PYTHONPATH = os.environ.get('PYTHONPATH') + ":" + os.path.join(fate_base, "python") + # os.environ['PYTHONPATH'] = PYTHONPATH + suite_results = dict() + for i, pair in enumerate(suite.pairs): + echo.echo(f"Running [{i + 1}/{pair_n}] group: {pair.pair_name}") + job_n = len(pair.jobs) + # time_dict = dict() + job_results = dict() + for j, job in enumerate(pair.jobs): + echo.echo(f"Running [{j + 1}/{job_n}] job: {job.job_name}") + + def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None): + exception_id = str(uuid.uuid1()) + # suite.update_status(job_name=job_name, job_id=job_id, exception_id=exception_id, status=status, + # event=event, time_elapsed=time_elapsed) + echo.file(f"exception({exception_id}), error message:\n{err_msg}") + + job_name, script_path, conf_path = 
job.job_name, job.script_path, job.conf_path + param = Config.load_from_file(conf_path) + mod = _load_module_from_script(script_path) + input_params = signature(mod.main).parameters + + try: + # @todo: add update status api to suite + _run_mod(mod, input_params, config, param, namespace, data_namespace_mangling) + job_info = os.environ.get("pipeline_job_info") + job_id, status, time_elapsed, event = extract_job_status(job_info, client, guest_party_id) + """suite.update_status(job_name=job_name, job_id=job_id, status=status, + time_elapsed=time_elapsed, + event=event)""" + if not skip_evaluate: + # @todo: load model with flow api & record evaluate result + job.pretrained_model_path, job.heft_path = None, None + result = run_job_eval(job, eval_conf) + job_results[job_name] = result + os.environ.pop("pipeline_job_info") + + except Exception as e: + job_info = os.environ.get("pipeline_job_info") + if job_info is None: + job_id, status, time_elapsed, event = None, 'failed', None, None + else: + job_id, status, time_elapsed, event = extract_job_status(job_info, client, + guest_party_id) + _raise(e, job_id=job_id, status=status, event=event, time_elapsed=time_elapsed) + os.environ.pop("pipeline_job_info") + continue + suite_results[pair.pair_name] = job_results + for job_name, result in job_results.items(): + echo.echo(f"Job: {job_name}") + echo.echo(result) + # todo: record time elapse + + +def _run_mod(mod, input_params, config, param, namespace, data_namespace_mangling): + if len(input_params) == 1: + mod.main(param=param) + elif len(input_params) == 2: + mod.main(config=config, param=param) + # pipeline script + elif len(input_params) == 3: + if data_namespace_mangling: + mod.main(config=config, param=param, namespace=f"_{namespace}") + else: + mod.main(config=config, param=param) + else: + mod.main() From 72120b4e9e222365e1a6f86648bb714820c3bdb6 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 3 Apr 2024 16:53:07 +0800 Subject: [PATCH 02/30] add non success job 
summary for llmsuite Signed-off-by: Yu Wu --- python/fate_test/_io.py | 4 + python/fate_test/_parser.py | 79 ++----------- python/fate_test/scripts/_utils.py | 9 +- python/fate_test/scripts/llmsuite_cli.py | 136 ++++++++++++++--------- 4 files changed, 105 insertions(+), 123 deletions(-) diff --git a/python/fate_test/_io.py b/python/fate_test/_io.py index edfaeee..47ee682 100644 --- a/python/fate_test/_io.py +++ b/python/fate_test/_io.py @@ -32,6 +32,10 @@ def echo(cls, message, **kwargs): click.secho(message, **kwargs) click.secho(message, file=cls._file, **kwargs) + @classmethod + def sep_line(cls): + click.secho("-------------------------------------------------") + @classmethod def file(cls, message, **kwargs): click.secho(message, file=cls._file, **kwargs) diff --git a/python/fate_test/_parser.py b/python/fate_test/_parser.py index 8c18958..da4918c 100644 --- a/python/fate_test/_parser.py +++ b/python/fate_test/_parser.py @@ -344,74 +344,7 @@ def load(path: Path): suite = PerformanceSuite(dataset, pipeline_jobs, path) return suite -""" -class LlmJob(object): - def __init__(self, job_name: str, script_path: Path, conf_path: Path, - loader: str, loader_conf_path: Path, tasks: typing.List[str], include_path: Path): - self.job_name = job_name - self.script_path = script_path - self.conf_path = conf_path - self.loader = loader - self.loader_conf_path = loader_conf_path - self.tasks = tasks - self.include_path = include_path - - -class LlmPair(object): - def __init__( - self, pair_name: str, jobs: typing.List[LlmJob] - ): - self.pair_name = pair_name - self.jobs = jobs - - -class LlmSuite(object): - def __init__( - self, pairs: typing.List[LlmPair], path: Path - ): - self.pairs = pairs - self.path = path - - @staticmethod - def load(path: Path): - with path.open("r") as f: - testsuite_config = yaml.safe_load(f) - - pairs = [] - for pair_name, pair_configs in testsuite_config.items(): - jobs = [] - for job_name, job_configs in pair_configs.items(): - script_path = 
path.parent.joinpath(job_configs["script"]).resolve() - if job_configs.get("conf"): - conf_path = path.parent.joinpath(job_configs["conf"]).resolve() - else: - conf_path = "" - loader = job_configs.get("loader", "") - if job_configs.get("loader_conf"): - loader_conf_path = path.parent.joinpath(job_configs["loader_conf"]).resolve() - else: - loader_conf_path = "" - tasks = job_configs.get("tasks", []) - include_path = job_configs.get("include_path", "") - if include_path and not os.path.isabs(include_path): - include_path = path.parent.joinpath(job_configs["include_path"]).resolve() - jobs.append( - LlmJob( - job_name=job_name, script_path=script_path, conf_path=conf_path, - loader=loader, loader_conf_path=loader_conf_path, tasks=tasks, include_path=include_path - ) - ) - - pairs.append( - LlmPair( - pair_name=pair_name, jobs=jobs - ) - ) - suite = LlmSuite(pairs=pairs, path=path) - return suite - -""" def non_success_summary(): status = {} for job in _config.non_success_jobs: @@ -476,3 +409,15 @@ def _hook(d): return d return _hook + + +def record_non_success_jobs(suite, suite_file=None): + for status in suite.get_final_status().values(): + if isinstance(status.status, str) and status.status != "success": + status.suite_file = suite_file + _config.non_success_jobs.append(status) + if isinstance(status.status, list): + for job_status in status.status: + if job_status.status != "success": + status.suite_file = suite_file + _config.non_success_jobs.append(status) diff --git a/python/fate_test/scripts/_utils.py b/python/fate_test/scripts/_utils.py index 576d83c..0b6117a 100644 --- a/python/fate_test/scripts/_utils.py +++ b/python/fate_test/scripts/_utils.py @@ -12,7 +12,8 @@ from fate_test._config import Config from fate_test._flow_client import DataProgress, UploadDataResponse, QueryJobResponse from fate_test._io import echo, LOGGER, set_logger -from fate_test._parser import Testsuite, BenchmarkSuite, PerformanceSuite, DATA_LOAD_HOOK, CONF_LOAD_HOOK, DSL_LOAD_HOOK 
+from fate_test._parser import (Testsuite, BenchmarkSuite, PerformanceSuite, FinalStatus, + DATA_LOAD_HOOK, CONF_LOAD_HOOK, DSL_LOAD_HOOK) def _big_data_task(includes, guest_data_size, host_data_size, guest_feature_num, host_feature_num, host_data_type, @@ -88,6 +89,12 @@ def _find_testsuite_files(path): suite = PerformanceSuite.load(suite_path.resolve()) elif suite_type == "llmsuite": suite = LlmSuite.load(suite_path.resolve()) + suite_status = {} + for pair in suite.pairs: + for job in pair.jobs: + if not job.evaluate_only: + suite_status[f"{pair.pair_name}-{job.job_name}"] = FinalStatus(f"{pair.pair_name}-{job.job_name}") + suite._final_status = suite_status else: raise ValueError(f"Unsupported suite type: {suite_type}. Only accept type 'testsuite' or 'benchmark'.") except Exception as e: diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 2c94fe2..67ffba5 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -23,10 +23,12 @@ import click import yaml from fate_llm.scripts.eval_cli import run_job_eval +from fate_llm.utils.llm_evaluator import aggregate_table from fate_test._client import Clients from fate_test._config import Config from fate_test._io import LOGGER, echo +from fate_test._parser import record_non_success_jobs, non_success_summary from fate_test.scripts._options import SharedOptions from fate_test.scripts._utils import _load_testsuites, _load_module_from_script from fate_test.utils import extract_job_status @@ -50,8 +52,8 @@ help="glob string to filter sub-directory of path specified by ") @click.option("--provider", type=str, help="Select the fate version, for example: fate@2.0-beta") -@click.option('-c', '--eval-config', optional=True, type=click.Path(exists=True), help='Path to FATE Llm evaluation config. ' - 'If none, use default config.') +@click.option('--eval-config', type=click.Path(exists=True), + help='Path to FATE Llm evaluation config. 
If none, use default config.') @click.option('--skip-evaluate', is_flag=True, default=False, help="skip evaluation after training model") @SharedOptions.get_shared_options(hidden=True) @@ -80,7 +82,7 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co echo.echo(f"llmsuite namespace: {namespace}", fg='red') echo.echo("loading llmsuites:") if algorithm_suite: - #@todo: find built-in llmsuite path + # @todo: find built-in llmsuite path algorithm_suite_path = [None] suites = _load_testsuites(includes=algorithm_suite_path, excludes=None, glob=None, provider=provider, suffix="llmsuite.yaml", suite_type="llmsuite") @@ -103,19 +105,17 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') os.environ['enable_pipeline_job_info_callback'] = '1' try: - if eval_config: - config = {} - if eval_config is not None: - with eval_config.open("r") as f: - config.update(yaml.safe_load(f)) - eval_conf = config - else: + if not eval_config: from fate_llm.utils.config import default_eval_config - eval_conf = default_eval_config() + eval_config = default_eval_config() + + eval_config_dict = {} + with eval_config.open("r") as f: + eval_config_dict.update(yaml.safe_load(f)) _run_llmsuite_pairs(config_inst, suite, namespace, data_namespace_mangling, client, - skip_evaluate, eval_conf) + skip_evaluate, eval_config_dict) except Exception as e: - raise RuntimeError(f"exception occur while running benchmark jobs for {suite.path}") from e + raise RuntimeError(f"exception occur while running llmsuite jobs for {suite.path}") from e echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') except Exception: @@ -124,14 +124,17 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co LOGGER.exception(f"exception id: {exception_id}") finally: echo.stdout_newline() - # 
non_success_summary() + suite_file = str(suite.path).split("/")[-1] + record_non_success_jobs(suite, suite_file) + non_success_summary() echo.farewell() echo.echo(f"llmsuite namespace: {namespace}", fg='red') @LOGGER.catch def _run_llmsuite_pairs(config: Config, suite, namespace: str, - data_namespace_mangling: bool, clients: Clients, skip_evaluate: bool, eval_conf: dict): + data_namespace_mangling: bool, clients: Clients, skip_evaluate: bool, eval_conf: dict, + output_path: str = None): client = clients['guest_0'] guest_party_id = config.parties.role_to_party("guest")[0] # pipeline demo goes here @@ -150,57 +153,80 @@ def _run_llmsuite_pairs(config: Config, suite, namespace: str, def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None): exception_id = str(uuid.uuid1()) - # suite.update_status(job_name=job_name, job_id=job_id, exception_id=exception_id, status=status, - # event=event, time_elapsed=time_elapsed) + suite.update_status(job_name=job_name, job_id=job_id, exception_id=exception_id, status=status, + event=event, time_elapsed=time_elapsed) echo.file(f"exception({exception_id}), error message:\n{err_msg}") - - job_name, script_path, conf_path = job.job_name, job.script_path, job.conf_path - param = Config.load_from_file(conf_path) - mod = _load_module_from_script(script_path) - input_params = signature(mod.main).parameters - - try: - # @todo: add update status api to suite - _run_mod(mod, input_params, config, param, namespace, data_namespace_mangling) - job_info = os.environ.get("pipeline_job_info") - job_id, status, time_elapsed, event = extract_job_status(job_info, client, guest_party_id) - """suite.update_status(job_name=job_name, job_id=job_id, status=status, - time_elapsed=time_elapsed, - event=event)""" + # evaluate_only + if job.evaluate_only and not skip_evaluate: + job_results[job.job_name] = run_job_eval(job, eval_conf) + # run pipeline job then evaluate + else: + job_name, script_path, conf_path = job.job_name, 
job.script_path, job.conf_path + param = Config.load_from_file(conf_path) + mod = _load_module_from_script(script_path) + input_params = signature(mod.main).parameters + + try: + # todo: add update status api to suite + # pipeline should return pretrained model path + pretrained_model_path = _run_mod(mod, input_params, config, param, + namespace, data_namespace_mangling) + job.pretrained_model_path = pretrained_model_path + job_info = os.environ.get("pipeline_job_info") + job_id, status, time_elapsed, event = extract_job_status(job_info, client, guest_party_id) + suite.update_status(job_name=job_name, job_id=job_id, status=status, + time_elapsed=time_elapsed, + event=event) + + except Exception as e: + job_info = os.environ.get("pipeline_job_info") + if job_info is None: + job_id, status, time_elapsed, event = None, 'failed', None, None + else: + job_id, status, time_elapsed, event = extract_job_status(job_info, client, + guest_party_id) + _raise(e, job_id=job_id, status=status, event=event, time_elapsed=time_elapsed) + os.environ.pop("pipeline_job_info") + continue if not skip_evaluate: - # @todo: load model with flow api & record evaluate result - job.pretrained_model_path, job.heft_path = None, None - result = run_job_eval(job, eval_conf) - job_results[job_name] = result + model_task_name = "nn_0" + if job.model_task_name: + model_task_name = job.model_task_name + peft_path = os.path.join(config.fate_base, "fate_flow", "model", job_id, + "guest", guest_party_id, model_task_name, + "0", "output", "output_model", "model_directory") + job.peft_path = peft_path + try: + result = run_job_eval(job, eval_conf) + job_results[job_name] = result + except Exception as e: + _raise(f"evaluate failed: {e}") os.environ.pop("pipeline_job_info") - - except Exception as e: - job_info = os.environ.get("pipeline_job_info") - if job_info is None: - job_id, status, time_elapsed, event = None, 'failed', None, None - else: - job_id, status, time_elapsed, event = 
extract_job_status(job_info, client, - guest_party_id) - _raise(e, job_id=job_id, status=status, event=event, time_elapsed=time_elapsed) - os.environ.pop("pipeline_job_info") - continue suite_results[pair.pair_name] = job_results - for job_name, result in job_results.items(): - echo.echo(f"Job: {job_name}") - echo.echo(result) - # todo: record time elapse + suite_writers = aggregate_table(suite_results) + for pair_name, pair_writer in suite_writers.items(): + echo.sep_line() + echo.echo(f"Pair: {pair_name}") + echo.sep_line() + echo.echo(pair_writer.dumps()) + echo.stdout_newline() + + if output_path: + with open(output_path, 'w') as f: + for pair_name, pair_writer in suite_writers.items(): + pair_writer.dumps(f) def _run_mod(mod, input_params, config, param, namespace, data_namespace_mangling): if len(input_params) == 1: - mod.main(param=param) + return mod.main(param=param) elif len(input_params) == 2: - mod.main(config=config, param=param) + return mod.main(config=config, param=param) # pipeline script elif len(input_params) == 3: if data_namespace_mangling: - mod.main(config=config, param=param, namespace=f"_{namespace}") + return mod.main(config=config, param=param, namespace=f"_{namespace}") else: - mod.main(config=config, param=param) + return mod.main(config=config, param=param) else: - mod.main() + return mod.main() From 7ff4bfa2db0d82a3631aab8efebdd9667a3ed7b5 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 20 May 2024 14:14:53 +0800 Subject: [PATCH 03/30] update import path for fate llm evaluate Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 67ffba5..c5fab89 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -22,8 +22,8 @@ import click import yaml -from fate_llm.scripts.eval_cli import run_job_eval -from 
fate_llm.utils.llm_evaluator import aggregate_table +from fate_llm.evaluate.scripts.eval_cli import run_job_eval +from fate_llm.evaluate.utils.llm_evaluator import aggregate_table from fate_test._client import Clients from fate_test._config import Config @@ -106,7 +106,7 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co os.environ['enable_pipeline_job_info_callback'] = '1' try: if not eval_config: - from fate_llm.utils.config import default_eval_config + from fate_llm.evaluate.utils.config import default_eval_config eval_config = default_eval_config() eval_config_dict = {} From b59678a727596a93438269352a2d78c8c8b64703 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 20 May 2024 14:20:06 +0800 Subject: [PATCH 04/30] update import path for fate llm evaluate Signed-off-by: Yu Wu --- python/fate_test/scripts/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fate_test/scripts/_utils.py b/python/fate_test/scripts/_utils.py index 0b6117a..56e7b28 100644 --- a/python/fate_test/scripts/_utils.py +++ b/python/fate_test/scripts/_utils.py @@ -6,7 +6,6 @@ from pathlib import Path import click -from fate_llm.utils import LlmSuite from fate_test._client import Clients from fate_test._config import Config @@ -88,6 +87,7 @@ def _find_testsuite_files(path): elif suite_type == "performance": suite = PerformanceSuite.load(suite_path.resolve()) elif suite_type == "llmsuite": + from fate_llm.evaluate.utils import LlmSuite suite = LlmSuite.load(suite_path.resolve()) suite_status = {} for pair in suite.pairs: From 5d021504bc51b04026e694deffd54018eca72490 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 20 May 2024 15:52:49 +0800 Subject: [PATCH 05/30] allow optionally import fate-llm in fate-test scripts add cli to control optional import of extra packages Signed-off-by: Yu Wu --- python/fate_test/scripts/cli.py | 8 ++++++-- python/fate_test/scripts/config_cli.py | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 3 
deletions(-) diff --git a/python/fate_test/scripts/cli.py b/python/fate_test/scripts/cli.py index 48bcfaf..b01e6ce 100644 --- a/python/fate_test/scripts/cli.py +++ b/python/fate_test/scripts/cli.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import os import click @@ -20,7 +21,6 @@ from fate_test.scripts.benchmark_cli import run_benchmark from fate_test.scripts.config_cli import config_group from fate_test.scripts.data_cli import data_group -from fate_test.scripts.llmsuite_cli import run_llmsuite # from fate_test.scripts.flow_test_cli import flow_group from fate_test.scripts.performance_cli import run_task # from fate_test.scripts.quick_test_cli import unittest_group @@ -33,10 +33,14 @@ "performance": run_task, "benchmark-quality": run_benchmark, "data": data_group, - "llmsuite": run_llmsuite + # "unittest": unittest_group } +if os.environ.get("INCLUDE_FATE_LLM", None): + from fate_test.scripts.llmsuite_cli import run_llmsuite + commands["llmsuite"] = run_llmsuite + commands_alias = { "bq": "benchmark-quality", "bp": "performance" diff --git a/python/fate_test/scripts/config_cli.py b/python/fate_test/scripts/config_cli.py index 55f0b4c..4cfcfdb 100644 --- a/python/fate_test/scripts/config_cli.py +++ b/python/fate_test/scripts/config_cli.py @@ -13,10 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - +import os from pathlib import Path import click + from fate_test._client import Clients from fate_test._config import create_config, default_config, parse_config from fate_test.scripts._options import SharedOptions @@ -77,3 +78,16 @@ def _config(ctx, **kwargs): click.echo(f"[X]connection fail, role is {r}, exception is {e.args}") else: click.echo(f"[✓]connection {address} ok, fate version is {version}, role is {r}") + + +@config_group.command(name="enable") +@click.option('-i', '--include', required=True, type=str, multiple=True, + help="packages to be loaded in FATE-Test scripts") +def _enable(include): + """ + allow import of extra packages, currently only for FATE-Llm + """ + for p in include: + if isinstance(p, str) and p.lower() == "fate-llm": + os.environ['INCLUDE_FATE_LLM'] = "1" + click.echo(f"FATE-Test will allow import {include}.") From 86ea69294c109da12a157b1df7b77d2e840d67b1 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 21 May 2024 15:33:53 +0800 Subject: [PATCH 06/30] use template for loading fate-llm trained model Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index c5fab89..d7b6e14 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -22,8 +22,6 @@ import click import yaml -from fate_llm.evaluate.scripts.eval_cli import run_job_eval -from fate_llm.evaluate.utils.llm_evaluator import aggregate_table from fate_test._client import Clients from fate_test._config import Config @@ -135,6 +133,7 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co def _run_llmsuite_pairs(config: Config, suite, namespace: str, data_namespace_mangling: bool, clients: Clients, skip_evaluate: bool, eval_conf: dict, output_path: str = None): + from fate_llm.evaluate.scripts.eval_cli import run_job_eval 
client = clients['guest_0'] guest_party_id = config.parties.role_to_party("guest")[0] # pipeline demo goes here @@ -192,9 +191,16 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) model_task_name = "nn_0" if job.model_task_name: model_task_name = job.model_task_name - peft_path = os.path.join(config.fate_base, "fate_flow", "model", job_id, + from lm_eval.utils import apply_template + peft_path = apply_template(job.peft_path_format, + {"fate_base": config.fate_base, + "job_id": job_id, + "party_id": guest_party_id, + "model_task_name": model_task_name} + ) + """peft_path = os.path.join(config.fate_base, "fate_flow", "model", job_id, "guest", guest_party_id, model_task_name, - "0", "output", "output_model", "model_directory") + "0", "output", "output_model", "model_directory")""" job.peft_path = peft_path try: result = run_job_eval(job, eval_conf) @@ -203,6 +209,8 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) _raise(f"evaluate failed: {e}") os.environ.pop("pipeline_job_info") suite_results[pair.pair_name] = job_results + + from fate_llm.evaluate.utils.llm_evaluator import aggregate_table suite_writers = aggregate_table(suite_results) for pair_name, pair_writer in suite_writers.items(): echo.sep_line() From a81f944f0074d377ff42febb3158e89c2c3dea1b Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 21 May 2024 20:12:40 +0800 Subject: [PATCH 07/30] fix record Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index d7b6e14..851de9e 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -152,7 +152,7 @@ def _run_llmsuite_pairs(config: Config, suite, namespace: str, def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None): exception_id = str(uuid.uuid1()) - 
suite.update_status(job_name=job_name, job_id=job_id, exception_id=exception_id, status=status, + suite.update_status(pair_name=pair.pair_name, job_name=job_name, job_id=job_id, exception_id=exception_id, status=status, event=event, time_elapsed=time_elapsed) echo.file(f"exception({exception_id}), error message:\n{err_msg}") # evaluate_only @@ -173,7 +173,7 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) job.pretrained_model_path = pretrained_model_path job_info = os.environ.get("pipeline_job_info") job_id, status, time_elapsed, event = extract_job_status(job_info, client, guest_party_id) - suite.update_status(job_name=job_name, job_id=job_id, status=status, + suite.update_status(pair_name=pair.pair_name, job_name=job_name, job_id=job_id, status=status, time_elapsed=time_elapsed, event=event) From 57613124c905e1c7ff737068a4c5d1bfd3aba98b Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 21 May 2024 20:51:03 +0800 Subject: [PATCH 08/30] fix apply template Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 851de9e..081a9b8 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -194,7 +194,7 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) from lm_eval.utils import apply_template peft_path = apply_template(job.peft_path_format, {"fate_base": config.fate_base, - "job_id": job_id, + "job_id": job_id[0], "party_id": guest_party_id, "model_task_name": model_task_name} ) From c4e3341019356ab16726a10435f6dd2f8b712b9c Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 22 May 2024 10:08:01 +0800 Subject: [PATCH 09/30] add init tasks when using llmsuite Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 081a9b8..b6406bb 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -95,7 +95,8 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co echo.stdout_newline() # with Clients(config_inst) as client: client = Clients(config_inst) - + from fate_llm.evaluate.utils import llm_evaluator + llm_evaluator.init_tasks() for i, suite in enumerate(suites): # noinspection PyBroadException try: From 16d01b76f982682c164a6f377dca76e277420d04 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 22 May 2024 17:21:30 +0800 Subject: [PATCH 10/30] add pellm suite to algorithm default suites Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index b6406bb..6a6e5b9 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -80,9 +80,15 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co echo.echo(f"llmsuite namespace: {namespace}", fg='red') echo.echo("loading llmsuites:") if algorithm_suite: - # @todo: find built-in llmsuite path - algorithm_suite_path = [None] - suites = _load_testsuites(includes=algorithm_suite_path, excludes=None, glob=None, provider=provider, + algorithm_suite_path_dict = {"pellm": os.path.join(ctx.obj.get("fate_base"), "fate_llm", "examples")} + suite_paths = [] + for alg in algorithm_suite: + algorithm_suite_path = algorithm_suite_path_dict.get(alg, None) + if algorithm_suite_path is None: + echo.echo(f"algorithm suite {alg} not found", fg='red') + else: + suite_paths.append(algorithm_suite_path) + suites = _load_testsuites(includes=suite_paths, excludes=None, glob=None, provider=provider, suffix="llmsuite.yaml", suite_type="llmsuite") else: 
suites = _load_testsuites(includes=include, excludes=exclude, glob=glob, provider=provider, @@ -137,7 +143,6 @@ def _run_llmsuite_pairs(config: Config, suite, namespace: str, from fate_llm.evaluate.scripts.eval_cli import run_job_eval client = clients['guest_0'] guest_party_id = config.parties.role_to_party("guest")[0] - # pipeline demo goes here pair_n = len(suite.pairs) # fate_base = config.fate_base # PYTHONPATH = os.environ.get('PYTHONPATH') + ":" + os.path.join(fate_base, "python") @@ -167,14 +172,14 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) input_params = signature(mod.main).parameters try: - # todo: add update status api to suite # pipeline should return pretrained model path pretrained_model_path = _run_mod(mod, input_params, config, param, namespace, data_namespace_mangling) job.pretrained_model_path = pretrained_model_path job_info = os.environ.get("pipeline_job_info") job_id, status, time_elapsed, event = extract_job_status(job_info, client, guest_party_id) - suite.update_status(pair_name=pair.pair_name, job_name=job_name, job_id=job_id, status=status, + suite.update_status(pair_name=pair.pair_name, job_name=job_name, + job_id=job_id, status=status, time_elapsed=time_elapsed, event=event) @@ -183,8 +188,7 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) if job_info is None: job_id, status, time_elapsed, event = None, 'failed', None, None else: - job_id, status, time_elapsed, event = extract_job_status(job_info, client, - guest_party_id) + job_id, status, time_elapsed, event = extract_job_status(job_info, client, guest_party_id) _raise(e, job_id=job_id, status=status, event=event, time_elapsed=time_elapsed) os.environ.pop("pipeline_job_info") continue From a96827f97af7b4076a61d9694270b41aa6bb49bd Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 22 May 2024 17:22:31 +0800 Subject: [PATCH 11/30] use consts to record include_fate_llm var Signed-off-by: Yu Wu --- 
python/fate_test/_config.py | 2 +- python/fate_test/fate_test_config.yaml | 4 ++-- python/fate_test/scripts/cli.py | 4 ++-- python/fate_test/scripts/config_cli.py | 4 ++-- python/fate_test/utils.py | 3 +++ 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python/fate_test/_config.py b/python/fate_test/_config.py index d3a6012..e5b2009 100644 --- a/python/fate_test/_config.py +++ b/python/fate_test/_config.py @@ -44,7 +44,7 @@ # directory where FATE code locates, default installation location={FATE}/fate # python/ml -> $fate_base/python/ml -fate_base: path(FATE)/fate +fate_base: path(FATE)/ # whether to delete data in suites after all jobs done clean_data: true diff --git a/python/fate_test/fate_test_config.yaml b/python/fate_test/fate_test_config.yaml index 7bb641f..f086391 100644 --- a/python/fate_test/fate_test_config.yaml +++ b/python/fate_test/fate_test_config.yaml @@ -19,7 +19,7 @@ all_examples_data_config: examples/data/upload_config/all_examples_data_testsuit # directory where FATE code locates, default installation location={FATE}/fate # python/ml -> $fate_base/python/ml -fate_base: path(FATE)/fate +fate_base: path(FATE)/ # whether to delete data in suites after all jobs done clean_data: true @@ -34,4 +34,4 @@ services: - flow_services: - { address: 127.0.0.1:9380, parties: [ '9999', '10000' ] } serving_setting: - address: 127.0.0.1:8059 \ No newline at end of file + address: 127.0.0.1:8059 diff --git a/python/fate_test/scripts/cli.py b/python/fate_test/scripts/cli.py index b01e6ce..ac3f821 100644 --- a/python/fate_test/scripts/cli.py +++ b/python/fate_test/scripts/cli.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -import os import click @@ -37,7 +36,8 @@ # "unittest": unittest_group } -if os.environ.get("INCLUDE_FATE_LLM", None): +from fate_test import utils +if utils.INCLUDE_FATE_LLM: from fate_test.scripts.llmsuite_cli import run_llmsuite commands["llmsuite"] = run_llmsuite diff --git a/python/fate_test/scripts/config_cli.py b/python/fate_test/scripts/config_cli.py index 4cfcfdb..87a86a7 100644 --- a/python/fate_test/scripts/config_cli.py +++ b/python/fate_test/scripts/config_cli.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import os from pathlib import Path import click +from fate_test import utils from fate_test._client import Clients from fate_test._config import create_config, default_config, parse_config from fate_test.scripts._options import SharedOptions @@ -89,5 +89,5 @@ def _enable(include): """ for p in include: if isinstance(p, str) and p.lower() == "fate-llm": - os.environ['INCLUDE_FATE_LLM'] = "1" + utils.INCLUDE_FATE_LLM = '1' click.echo(f"FATE-Test will allow import {include}.") diff --git a/python/fate_test/utils.py b/python/fate_test/utils.py index 12a9e39..677c757 100644 --- a/python/fate_test/utils.py +++ b/python/fate_test/utils.py @@ -32,6 +32,9 @@ RELATIVE = "relative" ABSOLUTE = "absolute" +DEFAULT_INCLUDE_FATE_LLM = None +INCLUDE_FATE_LLM = os.getenv("INCLUDE_FATE_LLM") or DEFAULT_INCLUDE_FATE_LLM + class TxtStyle: TRUE_VAL = Fore.GREEN From ebf1843677e7d04c304f97eb57dea630ca6ef235 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 22 May 2024 20:36:58 +0800 Subject: [PATCH 12/30] add doc Signed-off-by: Yu Wu --- doc/fate_test.md | 14 +++- doc/fate_test_command.md | 151 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) diff --git a/doc/fate_test.md b/doc/fate_test.md index 746dfbd..071e9b7 100644 --- a/doc/fate_test.md +++ b/doc/fate_test.md @@ -88,4 +88,16 @@ shown in last step ```bash fate_test data generate -i -ng 10000 -fg 
10 -fh 10 -m 1.0 --upload-data fate_test performance -i --skip-data - ``` \ No newline at end of file + ``` + +- [llm-suite](./fate_test_command.md#fate-llmsuite): used for running FATE-Llm testsuites, collection of FATE-Llm jobs and/or evaluations + + Before running llmsuite for the first time, make sure to install FATE-Llm and allow its import in FATE-Test scripts: + + ```bash + fate_test config include fate-llm + ``` + + ```bash + fate_test llmsuite -i + ``` diff --git a/doc/fate_test_command.md b/doc/fate_test_command.md index dabe682..21acc0c 100644 --- a/doc/fate_test_command.md +++ b/doc/fate_test_command.md @@ -867,3 +867,154 @@ fate_test data --help data after generate and upload dataset in testsuites *path1* + +## FATE Llmsuite + +FATE Llmsuite is used for running a collection of FATE-Llm jobs in sequence and then evaluate them on user-specified tasks. +It also allows users to compare the results of different llm jobs. + +### command options + +```bash +fate_test llmsuite --help +``` + +1. include: + + ```bash + fate_test llmsuite -i + ``` + + will run llm testsuites in + *path1* + +2. exclude: + + ```bash + fate_test llmsuite -i -e -e ... + ``` + + will run llm testsuites in *path1* but not in *path2* and *path3* + +3. glob: + + ```bash + fate_test llmsuite -i -g "hetero*" + ``` + + will run llm testsuites in sub directory start with *hetero* of + *path1* + +4. algorithm-suite: + + ```bash + fate_test llmsuite -a pellm' + ``` + + will run built-in 'pellm' llm testsuite, which will train and evaluate a FATE-Llm model and a zero-shot model + +5. timeout: + + ```bash + fate_test llmsuite -i -m 3600 + ``` + + will run llm testsuites in *path1* and timeout when job does not finish + within 3600s; if tasks need more time, use a larger threshold + +6. task-cores + + ```bash + fate_test llmsuite -i -p 4 + ``` + + will run llm testsuites in *path1* with script config "task-cores" set to 4 + +7. 
eval-config: + + ```bash + fate_test llmsuite -i --eval-config + ``` + + will run llm testsuites in *path1* with evaluation configuration set to *path2* + +8. skip-evaluate: + + ```bash + fate_test llmsuite -i --skip-evaluate + ``` + + will run llm testsuites in *path1* without running evaluation + +9. provider: + + ```bash + fate_test llmsuite -i --provider + ``` + + will run llm testsuites in *path1* with FATE provider set to *provider_name* + +10. yes: + + ```bash + fate_test llmsuite -i --yes + ``` + + will run llm testsuites in *path1* directly, skipping double check + + +### FATE-Llm job configuration + +Configuration of jobs should be specified in a llm testsuite whose +file name ends with "\*llmsuite.yaml". For llm testsuite example, +please refer [here](https://github.com/FederatedAI/FATE-LLM). + +A FATE-Llm testsuite includes the following elements: + +- job group: each group includes arbitrary number of jobs with paths + to corresponding script and configuration + + - job: name of evaluation job to be run, must be unique within each group + list + + - script: path to [testing script](#testing-script), should be + relative to testsuite, optional for evaluation-only jobs + - conf: path to job configuration file for script, should be + relative to testsuite, optional for evaluation-only jobs + - pretrained: path to pretrained model, should be relative to + testsuite, optional for jobs needed to run FATE-Llm training job, where the + script should return path to the pretrained model + - peft: path to peft file, should be relative to testsuite, + optional for jobs needed to run FATE-Llm training job + - tasks: list of tasks to be evaluated, optional for jobs skipping evaluation + - include_path: should be specified if tasks are user-defined + - eval_conf: path to evaluation configuration file, should be + relative to testsuite; if not provided, will use default conf + + ```yaml + bloom_lora: + pretrained: "/data/cephfs/llm/models/bloom-560m" + script: 
"./test_bloom_lora.py" + conf: "./bloom_lora_config.yaml" + peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory" + tasks: + - "dolly-15k" + + ``` + + - + + ```yaml + hetero_nn_sshe_binary_0: + bloom_lora: + pretrained: "/data/cephfs/llm/models/bloom-560m" + script: "./test_bloom_lora.py" + conf: "./bloom_lora_config.yaml" + peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory" + tasks: + - "dolly-15k" + bloom_zero_shot: + pretrained: "/data/cephfs/llm/models/bloom-560m" + tasks: + - "dolly-15k" + ``` From f61546c18cbfd46054b91b3e1726921efae1e763 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 22 May 2024 20:44:39 +0800 Subject: [PATCH 13/30] add doc Signed-off-by: Yu Wu --- doc/fate_test_command.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/fate_test_command.md b/doc/fate_test_command.md index 21acc0c..55f0138 100644 --- a/doc/fate_test_command.md +++ b/doc/fate_test_command.md @@ -1002,7 +1002,7 @@ A FATE-Llm testsuite includes the following elements: ``` - - +- llm suite ```yaml hetero_nn_sshe_binary_0: From c36998a68679a3f6ac41da17223cd545adce1c49 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 24 May 2024 16:34:47 +0800 Subject: [PATCH 14/30] edit doc Signed-off-by: Yu Wu --- doc/fate_test_command.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/fate_test_command.md b/doc/fate_test_command.md index 55f0138..2b5be41 100644 --- a/doc/fate_test_command.md +++ b/doc/fate_test_command.md @@ -978,10 +978,11 @@ A FATE-Llm testsuite includes the following elements: list - script: path to [testing script](#testing-script), should be - relative to testsuite, optional for evaluation-only jobs + relative to testsuite, optional for evaluation-only jobs; + note that pretrained model, if available, should be returned at the end of the script - 
conf: path to job configuration file for script, should be relative to testsuite, optional for evaluation-only jobs - - pretrained: path to pretrained model, should be relative to + - pretrained: path to pretrained model, should be either model name from Huggingface or relative path to testsuite, optional for jobs needed to run FATE-Llm training job, where the script should return path to the pretrained model - peft: path to peft file, should be relative to testsuite, @@ -993,7 +994,7 @@ A FATE-Llm testsuite includes the following elements: ```yaml bloom_lora: - pretrained: "/data/cephfs/llm/models/bloom-560m" + pretrained: "models/bloom-560m" script: "./test_bloom_lora.py" conf: "./bloom_lora_config.yaml" peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory" @@ -1007,14 +1008,14 @@ A FATE-Llm testsuite includes the following elements: ```yaml hetero_nn_sshe_binary_0: bloom_lora: - pretrained: "/data/cephfs/llm/models/bloom-560m" + pretrained: "bloom-560m" script: "./test_bloom_lora.py" conf: "./bloom_lora_config.yaml" peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory" tasks: - "dolly-15k" bloom_zero_shot: - pretrained: "/data/cephfs/llm/models/bloom-560m" + pretrained: "bloom-560m" tasks: - "dolly-15k" ``` From bec9bec9ca121ba455face76c7c9f115d6e07ba7 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 28 May 2024 17:34:47 +0800 Subject: [PATCH 15/30] edit doc Signed-off-by: Yu Wu --- doc/fate_test_command.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/fate_test_command.md b/doc/fate_test_command.md index 2b5be41..971511c 100644 --- a/doc/fate_test_command.md +++ b/doc/fate_test_command.md @@ -908,7 +908,7 @@ fate_test llmsuite --help 4. 
algorithm-suite: ```bash - fate_test llmsuite -a pellm' + fate_test llmsuite -a "pellm" ``` will run built-in 'pellm' llm testsuite, which will train and evaluate a FATE-Llm model and a zero-shot model From 91e9e54e330cd36f59b6243651a885a2221d3e4e Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 30 May 2024 15:39:23 +0800 Subject: [PATCH 16/30] lazy import FATE-Test subcommands rename subcommands entry point functions Signed-off-by: Yu Wu --- python/fate_test/_config.py | 2 +- ...chmark_cli.py => benchmark_quality_cli.py} | 4 +- python/fate_test/scripts/cli.py | 60 ++++++++++--------- python/fate_test/scripts/llmsuite_cli.py | 6 +- python/fate_test/scripts/performance_cli.py | 2 +- .../{testsuite_cli.py => suite_cli.py} | 9 --- python/setup.py | 2 +- 7 files changed, 38 insertions(+), 47 deletions(-) rename python/fate_test/scripts/{benchmark_cli.py => benchmark_quality_cli.py} (97%) rename python/fate_test/scripts/{testsuite_cli.py => suite_cli.py} (94%) diff --git a/python/fate_test/_config.py b/python/fate_test/_config.py index e5b2009..e00b82d 100644 --- a/python/fate_test/_config.py +++ b/python/fate_test/_config.py @@ -49,7 +49,7 @@ # whether to delete data in suites after all jobs done clean_data: true -# participating parties' id and correponding flow service ip & port information +# participating parties' id and corresponding flow service ip & port information parties: guest: ['9999'] host: ['10000', '9999'] diff --git a/python/fate_test/scripts/benchmark_cli.py b/python/fate_test/scripts/benchmark_quality_cli.py similarity index 97% rename from python/fate_test/scripts/benchmark_cli.py rename to python/fate_test/scripts/benchmark_quality_cli.py index fa6b155..c171ebf 100644 --- a/python/fate_test/scripts/benchmark_cli.py +++ b/python/fate_test/scripts/benchmark_quality_cli.py @@ -45,8 +45,8 @@ @click.option("--enable-clean-data", "clean_data", flag_value=True, default=None) @SharedOptions.get_shared_options(hidden=True) @click.pass_context -def 
run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, storage_tag, history_tag, match_details, - task_cores, timeout, **kwargs): +def run_benchmark_quality(ctx, include, exclude, glob, skip_data, tol, clean_data, storage_tag, history_tag, match_details, + task_cores, timeout, **kwargs): """ process benchmark suite, alias: bq """ diff --git a/python/fate_test/scripts/cli.py b/python/fate_test/scripts/cli.py index ac3f821..56df9ac 100644 --- a/python/fate_test/scripts/cli.py +++ b/python/fate_test/scripts/cli.py @@ -14,32 +14,11 @@ # limitations under the License. # +import os + import click from fate_test.scripts._options import SharedOptions -from fate_test.scripts.benchmark_cli import run_benchmark -from fate_test.scripts.config_cli import config_group -from fate_test.scripts.data_cli import data_group -# from fate_test.scripts.flow_test_cli import flow_group -from fate_test.scripts.performance_cli import run_task -# from fate_test.scripts.quick_test_cli import unittest_group -# from fate_test.scripts.secure_protocol_cli import secure_protocol_group -from fate_test.scripts.testsuite_cli import run_suite - -commands = { - "config": config_group, - "suite": run_suite, - "performance": run_task, - "benchmark-quality": run_benchmark, - "data": data_group, - - # "unittest": unittest_group -} - -from fate_test import utils -if utils.INCLUDE_FATE_LLM: - from fate_test.scripts.llmsuite_cli import run_llmsuite - commands["llmsuite"] = run_llmsuite commands_alias = { "bq": "benchmark-quality", @@ -48,16 +27,39 @@ class MultiCLI(click.MultiCommand): + def __init__(self, *args, **kwargs): + super(MultiCLI, self).__init__(*args, **kwargs) + self.plugin_folder = os.path.dirname(__file__) + """self._commands = { + "config": config_group, + "suite": run_suite, + "performance": run_task, + "benchmark-quality": run_benchmark, + "data": data_group} + self._load_extra_commands() + + def _load_extra_commands(self): + from fate_test.scripts.llmsuite_cli import 
run_llmsuite + self._commands["llmsuite"] = run_llmsuite""" def list_commands(self, ctx): - return list(commands) + rv = [] + for filename in os.listdir(self.plugin_folder): + if filename.endswith("_cli.py"): + rv.append(filename[:-7]) + rv.sort() + print(f"rv: {rv}") + return rv def get_command(self, ctx, name): - if name not in commands and name in commands_alias: - name = commands_alias[name] - if name not in commands: - ctx.fail("No such command '{}'.".format(name)) - return commands[name] + name = commands_alias.get(name, name).replace("-", "_") + ns = {} + fn = os.path.join(self.plugin_folder, name + "_cli.py") + with open(fn) as f: + code = compile(f.read(), fn, 'exec') + eval(code, ns, ns) + command_name = f"{name}_group" if name in ["data", "config"] else f"run_{name}" + return ns[command_name] @click.command(cls=MultiCLI, help="A collection of useful tools to running FATE's test.", diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 6a6e5b9..1a6b0b1 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -31,10 +31,7 @@ from fate_test.scripts._utils import _load_testsuites, _load_module_from_script from fate_test.utils import extract_job_status -""" -@click.option('-uj', '--update-job-parameters', default="{}", type=str, - help="a json string that represents mapping for replacing fields in job conf, example format: "'{job_name: param_name1: param_val1, param_name2=param_val2}'") -""" + @click.command("llmsuite") @click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, metavar="", @@ -101,6 +98,7 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co echo.stdout_newline() # with Clients(config_inst) as client: client = Clients(config_inst) + print(f"\n called import llm evaluator\n") from fate_llm.evaluate.utils import llm_evaluator llm_evaluator.init_tasks() for i, suite in enumerate(suites): 
diff --git a/python/fate_test/scripts/performance_cli.py b/python/fate_test/scripts/performance_cli.py index 2f0d151..cc5afc7 100644 --- a/python/fate_test/scripts/performance_cli.py +++ b/python/fate_test/scripts/performance_cli.py @@ -54,7 +54,7 @@ @click.option("--disable-clean-data", "clean_data", flag_value=False, default=None) @SharedOptions.get_shared_options(hidden=True) @click.pass_context -def run_task(ctx, job_type, include, timeout, epochs, +def run_performance(ctx, job_type, include, timeout, epochs, max_depth, num_trees, task_cores, storage_tag, history_tag, skip_data, clean_data, **kwargs): """ Test the performance of big data tasks, alias: bp diff --git a/python/fate_test/scripts/testsuite_cli.py b/python/fate_test/scripts/suite_cli.py similarity index 94% rename from python/fate_test/scripts/testsuite_cli.py rename to python/fate_test/scripts/suite_cli.py index 82194de..7235c7d 100644 --- a/python/fate_test/scripts/testsuite_cli.py +++ b/python/fate_test/scripts/suite_cli.py @@ -30,15 +30,6 @@ from fate_test.scripts._utils import _load_testsuites, _upload_data, _delete_data, _load_module_from_script from fate_test.utils import extract_job_status -""" -@click.option('-uj', '--update-job-parameters', default="{}", type=JSON_STRING, - help="a json string represents mapping for replacing fields in conf.job_parameters") -@click.option('-uc', '--update-component-parameters', default="{}", type=JSON_STRING, - help="a json string represents mapping for replacing fields in conf.component_parameters") -@click.option('-m', '--timeout', type=int, default=3600, help="maximun running time of job") -@click.option('-p', '--task-cores', type=int, help="processors per node") -""" - @click.command("suite") @click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, metavar="", diff --git a/python/setup.py b/python/setup.py index d26a151..de698d5 100644 --- a/python/setup.py +++ b/python/setup.py @@ -20,7 +20,7 @@ setup_kwargs = { 
"name": "fate-test", - "version": "2.1.0", + "version": "2.2.0", "description": "test tools for FATE", "long_description": 'FATE Test\n=========\n\nA collection of useful tools to running FATE\'s test.\n\n.. image:: images/tutorial.gif\n :align: center\n :alt: tutorial\n\nquick start\n-----------\n\n1. (optional) create virtual env\n\n .. code-block:: bash\n\n python -m venv venv\n source venv/bin/activate\n pip install -U pip\n\n\n2. install fate_test\n\n .. code-block:: bash\n\n pip install fate_test\n fate_test --help\n\n\n3. edit default fate_test_config.yaml\n\n .. code-block:: bash\n\n # edit priority config file with system default editor\n # filling some field according to comments\n fate_test config edit\n\n4. configure FATE-Pipeline and FATE-Flow Commandline server setting\n\n.. code-block:: bash\n\n # configure FATE-Pipeline server setting\n pipeline init --port 9380 --ip 127.0.0.1\n # configure FATE-Flow Commandline server setting\n flow init --port 9380 --ip 127.0.0.1\n\n5. run some fate_test suite\n\n .. code-block:: bash\n\n fate_test suite -i \n\n\n6. run some fate_test benchmark\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i \n\n7. useful logs or exception will be saved to logs dir with namespace shown in last step\n\ndevelop install\n---------------\nIt is more convenient to use the editable mode during development: replace step 2 with flowing steps\n\n.. code-block:: bash\n\n pip install -e ${FATE}/python/fate_client && pip install -e ${FATE}/python/fate_test\n\n\n\ncommand types\n-------------\n\n- suite: used for running testsuites, collection of FATE jobs\n\n .. code-block:: bash\n\n fate_test suite -i \n\n\n- benchmark-quality used for comparing modeling quality between FATE and other machine learning systems\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i \n\n\n\nconfiguration by examples\n--------------------------\n\n1. 
no need ssh tunnel:\n\n - 9999, service: service_a\n - 10000, service: service_b\n\n and both service_a, service_b can be requested directly:\n\n .. code-block:: yaml\n\n work_mode: 1 # 0 for standalone, 1 for cluster\n data_base_dir: \n parties:\n guest: [10000]\n host: [9999, 10000]\n arbiter: [9999]\n services:\n - flow_services:\n - {address: service_a, parties: [9999]}\n - {address: service_b, parties: [10000]}\n\n2. need ssh tunnel:\n\n - 9999, service: service_a\n - 10000, service: service_b\n\n service_a, can be requested directly while service_b don\'t,\n but you can request service_b in other node, say B:\n\n .. code-block:: yaml\n\n work_mode: 0 # 0 for standalone, 1 for cluster\n data_base_dir: \n parties:\n guest: [10000]\n host: [9999, 10000]\n arbiter: [9999]\n services:\n - flow_services:\n - {address: service_a, parties: [9999]}\n - flow_services:\n - {address: service_b, parties: [10000]}\n ssh_tunnel: # optional\n enable: true\n ssh_address: :\n ssh_username: \n ssh_password: # optional\n ssh_priv_key: "~/.ssh/id_rsa"\n\n\nTestsuite\n---------\n\nTestsuite is used for running a collection of jobs in sequence. Data used for jobs could be uploaded before jobs are\nsubmitted, and are cleaned when jobs finished. This tool is useful for FATE\'s release test.\n\ncommand options\n~~~~~~~~~~~~~~~\n\n.. code-block:: bash\n\n fate_test suite --help\n\n1. include:\n\n .. code-block:: bash\n\n fate_test suite -i \n\n will run testsuites in *path1*\n\n2. exclude:\n\n .. code-block:: bash\n\n fate_test suite -i -e -e ...\n\n will run testsuites in *path1* but not in *path2* and *path3*\n\n3. glob:\n\n .. code-block:: bash\n\n fate_test suite -i -g "hetero*"\n\n will run testsuites in sub directory start with *hetero* of *path1*\n\n4. replace:\n\n .. code-block:: bash\n\n fate_test suite -i -r \'{"maxIter": 5}\'\n\n will find all key-value pair with key "maxIter" in `data conf` or `conf` or `dsl` and replace the value with 5\n\n\n5. skip-data:\n\n .. 
code-block:: bash\n\n fate_test suite -i --skip-data\n\n will run testsuites in *path1* without uploading data specified in *benchmark.json*.\n\n\n6. yes:\n\n .. code-block:: bash\n\n fate_test suite -i --yes\n\n will run testsuites in *path1* directly, skipping double check\n\n7. skip-dsl-jobs:\n\n .. code-block:: bash\n\n fate_test suite -i --skip-dsl-jobs\n\n will run testsuites in *path1* but skip all *tasks* in testsuites. It\'s would be useful when only pipeline tasks needed.\n\n8. skip-pipeline-jobs:\n\n .. code-block:: bash\n\n fate_test suite -i --skip-pipeline-jobs\n\n will run testsuites in *path1* but skip all *pipeline tasks* in testsuites. It\'s would be useful when only dsl tasks needed.\n\n\nBenchmark Quality\n------------------\n\nBenchmark-quality is used for comparing modeling quality between FATE\nand other machine learning systems. Benchmark produces a metrics comparison\nsummary for each benchmark job group.\n\n.. code-block:: bash\n\n fate_test benchmark-quality -i examples/benchmark_quality/hetero_linear_regression\n\n.. 
code-block:: bash\n\n +-------+--------------------------------------------------------------+\n | Data | Name |\n +-------+--------------------------------------------------------------+\n | train | {\'guest\': \'motor_hetero_guest\', \'host\': \'motor_hetero_host\'} |\n | test | {\'guest\': \'motor_hetero_guest\', \'host\': \'motor_hetero_host\'} |\n +-------+--------------------------------------------------------------+\n +------------------------------------+--------------------+--------------------+-------------------------+---------------------+\n | Model Name | explained_variance | r2_score | root_mean_squared_error | mean_squared_error |\n +------------------------------------+--------------------+--------------------+-------------------------+---------------------+\n | local-linear_regression-regression | 0.9035168452250094 | 0.9035070863155368 | 0.31340413289880553 | 0.09822215051805216 |\n | FATE-linear_regression-regression | 0.903146386539082 | 0.9031411831961411 | 0.3139977881119483 | 0.09859461093919596 |\n +------------------------------------+--------------------+--------------------+-------------------------+---------------------+\n +-------------------------+-----------+\n | Metric | All Match |\n +-------------------------+-----------+\n | explained_variance | True |\n | r2_score | True |\n | root_mean_squared_error | True |\n | mean_squared_error | True |\n +-------------------------+-----------+\n\ncommand options\n~~~~~~~~~~~~~~~\n\nuse the following command to show help message\n\n.. code-block:: bash\n\n fate_test benchmark-quality --help\n\n1. include:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i \n\n will run benchmark testsuites in *path1*\n\n2. exclude:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i -e -e ...\n\n will run benchmark testsuites in *path1* but not in *path2* and *path3*\n\n3. glob:\n\n .. 
code-block:: bash\n\n fate_test benchmark-quality -i -g "hetero*"\n\n will run benchmark testsuites in sub directory start with *hetero* of *path1*\n\n4. tol:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i -t 1e-3\n\n will run benchmark testsuites in *path1* with absolute tolerance of difference between metrics set to 0.001.\n If absolute difference between metrics is smaller than *tol*, then metrics are considered\n almost equal. Check benchmark testsuite `writing guide <#benchmark-testsuite>`_ on setting alternative tolerance.\n\n5. skip-data:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i --skip-data\n\n will run benchmark testsuites in *path1* without uploading data specified in *benchmark.json*.\n\n\n6. yes:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i --yes\n\n will run benchmark testsuites in *path1* directly, skipping double check\n\n\nbenchmark testsuite\n~~~~~~~~~~~~~~~~~~~\n\nConfiguration of jobs should be specified in a benchmark testsuite whose file name ends\nwith "\\*benchmark.json". For benchmark testsuite example,\nplease refer `here <../../examples/benchmark_quality>`_.\n\nA benchmark testsuite includes the following elements:\n\n- data: list of local data to be uploaded before running FATE jobs\n\n - file: path to original data file to be uploaded, should be relative to testsuite or FATE installation path\n - head: whether file includes header\n - partition: number of partition for data storage\n - table_name: table name in storage\n - namespace: table namespace in storage\n - role: which role to upload the data, as specified in fate_test.config;\n naming format is: "{role_type}_{role_index}", index starts at 0\n\n .. 
code-block:: json\n\n "data": [\n {\n "file": "examples/data/motor_hetero_host.csv",\n "head": 1,\n "partition": 8,\n "table_name": "motor_hetero_host",\n "namespace": "experiment",\n "role": "host_0"\n }\n ]\n\n- job group: each group includes arbitrary number of jobs with paths to corresponding script and configuration\n\n - job: name of job to be run, must be unique within each group list\n\n - script: path to `testing script <#testing-script>`_, should be relative to testsuite\n - conf: path to job configuration file for script, should be relative to testsuite\n\n .. code-block:: json\n\n "local": {\n "script": "./local-linr.py",\n "conf": "./linr_config.yaml"\n }\n\n - compare_setting: additional setting for quality metrics comparison, currently only takes ``relative_tol``\n\n If metrics *a* and *b* satisfy *abs(a-b) <= max(relative_tol \\* max(abs(a), abs(b)), absolute_tol)*\n (from `math module `_),\n they are considered almost equal. In the below example, metrics from "local" and "FATE" jobs are\n considered almost equal if their relative difference is smaller than\n *0.05 \\* max(abs(local_metric), abs(pipeline_metric)*.\n\n .. code-block:: json\n\n "linear_regression-regression": {\n "local": {\n "script": "./local-linr.py",\n "conf": "./linr_config.yaml"\n },\n "FATE": {\n "script": "./fate-linr.py",\n "conf": "./linr_config.yaml"\n },\n "compare_setting": {\n "relative_tol": 0.01\n }\n }\n\n\ntesting script\n~~~~~~~~~~~~~~\n\nAll job scripts need to have ``Main`` function as an entry point for executing jobs; scripts should\nreturn two dictionaries: first with data information key-value pairs: {data_type}: {data_name_dictionary};\nthe second contains {metric_name}: {metric_value} key-value pairs for metric comparison.\n\nBy default, the final data summary shows the output from the job named "FATE"; if no such job exists,\ndata information returned by the first job is shown. 
For clear presentation, we suggest that user follow\nthis general `guideline <../../examples/data/README.md#data-set-naming-rule>`_ for data set naming. In the case of multi-host\ntask, consider numbering host as such:\n\n::\n\n {\'guest\': \'default_credit_homo_guest\',\n \'host_1\': \'default_credit_homo_host_1\',\n \'host_2\': \'default_credit_homo_host_2\'}\n\nReturned quality metrics of the same key are to be compared.\nNote that only **real-value** metrics can be compared.\n\n- FATE script: ``Main`` always has three inputs:\n\n - config: job configuration, `JobConfig <../fate_client/pipeline/utils/tools.py#L64>`_ object loaded from "fate_test_config.yaml"\n - param: job parameter setting, dictionary loaded from "conf" file specified in benchmark testsuite\n - namespace: namespace suffix, user-given *namespace* or generated timestamp string when using *namespace-mangling*\n\n- non-FATE script: ``Main`` always has one input:\n\n - param: job parameter setting, dictionary loaded from "conf" file specified in benchmark testsuite\n\n\ndata\n----\n\n`Data` sub-command is used for upload or delete dataset in suite\'s.\n\ncommand options\n~~~~~~~~~~~~~~~\n\n.. code-block:: bash\n\n fate_test data --help\n\n1. include:\n\n .. code-block:: bash\n\n fate_test data [upload|delete] -i \n\n will upload/delete dataset in testsuites in *path1*\n\n2. exclude:\n\n .. code-block:: bash\n\n fate_test data [upload|delete] -i -e -e ...\n\n will upload/delete dataset in testsuites in *path1* but not in *path2* and *path3*\n\n3. glob:\n\n .. code-block:: bash\n\n fate_test data [upload|delete] -i -g "hetero*"\n\n will upload/delete dataset in testsuites in sub directory start with *hetero* of *path1*\n\n\nfull command options\n---------------------\n\n.. 
click:: fate_test.scripts.cli:cli\n :prog: fate_test\n :show-nested:\n', "author": "FederatedAI", From dbd26783a0bb75ed587856565f88957dee8dc2ec Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 30 May 2024 17:06:49 +0800 Subject: [PATCH 17/30] clean up code, fix typo edit doc Signed-off-by: Yu Wu --- doc/fate_test.md | 2 +- doc/fate_test_command.md | 4 ++-- python/fate_test/scripts/cli.py | 12 ------------ python/fate_test/scripts/config_cli.py | 25 ++++++++++++------------ python/fate_test/scripts/llmsuite_cli.py | 1 - 5 files changed, 16 insertions(+), 28 deletions(-) diff --git a/doc/fate_test.md b/doc/fate_test.md index 071e9b7..c80efaf 100644 --- a/doc/fate_test.md +++ b/doc/fate_test.md @@ -90,7 +90,7 @@ shown in last step fate_test performance -i --skip-data ``` -- [llm-suite](./fate_test_command.md#fate-llmsuite): used for running FATE-Llm testsuites, collection of FATE-Llm jobs and/or evaluations +- [llm-suite](./fate_test_command.md#llmsuite): used for running FATE-Llm testsuites, collection of FATE-Llm jobs and/or evaluations Before running llmsuite for the first time, make sure to install FATE-Llm and allow its import in FATE-Test scripts: diff --git a/doc/fate_test_command.md b/doc/fate_test_command.md index 971511c..4e9e1fb 100644 --- a/doc/fate_test_command.md +++ b/doc/fate_test_command.md @@ -868,9 +868,9 @@ fate_test data --help *path1* -## FATE Llmsuite +## Llmsuite -FATE Llmsuite is used for running a collection of FATE-Llm jobs in sequence and then evaluate them on user-specified tasks. +Llmsuite is used for running a collection of FATE-Llm jobs in sequence and then evaluate them on user-specified tasks. It also allows users to compare the results of different llm jobs. 
### command options diff --git a/python/fate_test/scripts/cli.py b/python/fate_test/scripts/cli.py index 56df9ac..1c4358e 100644 --- a/python/fate_test/scripts/cli.py +++ b/python/fate_test/scripts/cli.py @@ -30,17 +30,6 @@ class MultiCLI(click.MultiCommand): def __init__(self, *args, **kwargs): super(MultiCLI, self).__init__(*args, **kwargs) self.plugin_folder = os.path.dirname(__file__) - """self._commands = { - "config": config_group, - "suite": run_suite, - "performance": run_task, - "benchmark-quality": run_benchmark, - "data": data_group} - self._load_extra_commands() - - def _load_extra_commands(self): - from fate_test.scripts.llmsuite_cli import run_llmsuite - self._commands["llmsuite"] = run_llmsuite""" def list_commands(self, ctx): rv = [] @@ -48,7 +37,6 @@ def list_commands(self, ctx): if filename.endswith("_cli.py"): rv.append(filename[:-7]) rv.sort() - print(f"rv: {rv}") return rv def get_command(self, ctx, name): diff --git a/python/fate_test/scripts/config_cli.py b/python/fate_test/scripts/config_cli.py index 87a86a7..1f07a3e 100644 --- a/python/fate_test/scripts/config_cli.py +++ b/python/fate_test/scripts/config_cli.py @@ -17,7 +17,6 @@ import click -from fate_test import utils from fate_test._client import Clients from fate_test._config import create_config, default_config, parse_config from fate_test.scripts._options import SharedOptions @@ -80,14 +79,16 @@ def _config(ctx, **kwargs): click.echo(f"[✓]connection {address} ok, fate version is {version}, role is {r}") -@config_group.command(name="enable") -@click.option('-i', '--include', required=True, type=str, multiple=True, - help="packages to be loaded in FATE-Test scripts") -def _enable(include): - """ - allow import of extra packages, currently only for FATE-Llm - """ - for p in include: - if isinstance(p, str) and p.lower() == "fate-llm": - utils.INCLUDE_FATE_LLM = '1' - click.echo(f"FATE-Test will allow import {include}.") +"""@config_group.command(name="set-extra-command") 
+@SharedOptions.get_shared_options(hidden=True) +@click.argument('enable', required=True, type=click.BOOL) +@click.pass_context +def _enable(ctx, enable, **kwargs): +""" +""" + allow extra commands, currently only FATE-Llm + + ctx.obj.update(**kwargs) + ctx.obj.update(include_fate_llm=enable) + os.environ["INCLUDE_FATE_LLM"] = '1' if enable else '0' + click.echo(f"Extra command {'enabled' if enable else 'disabled'}.")""" diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 1a6b0b1..2ae4531 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -98,7 +98,6 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co echo.stdout_newline() # with Clients(config_inst) as client: client = Clients(config_inst) - print(f"\n called import llm evaluator\n") from fate_llm.evaluate.utils import llm_evaluator llm_evaluator.init_tasks() for i, suite in enumerate(suites): From 81be3dc2260d65b3155384c6f5d01fd80066b791 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 3 Jun 2024 17:41:37 +0800 Subject: [PATCH 18/30] fix loading default config from invalid path Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 2ae4531..a4ba894 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -21,7 +21,6 @@ from inspect import signature import click -import yaml from fate_test._client import Clients from fate_test._config import Config @@ -107,15 +106,21 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') os.environ['enable_pipeline_job_info_callback'] = '1' try: + # eval_config_dict = {} if not eval_config: 
from fate_llm.evaluate.utils.config import default_eval_config eval_config = default_eval_config() - - eval_config_dict = {} - with eval_config.open("r") as f: - eval_config_dict.update(yaml.safe_load(f)) + if not os.path.exists(eval_config): + """eval_config = os.path.abspath(eval_config) + eval_config_dict = {} + with eval_config.open("r") as f: + eval_config_dict.update(yaml.safe_load(f))""" + eval_config = None + + """_run_llmsuite_pairs(config_inst, suite, namespace, data_namespace_mangling, client, + skip_evaluate, eval_config_dict)""" _run_llmsuite_pairs(config_inst, suite, namespace, data_namespace_mangling, client, - skip_evaluate, eval_config_dict) + skip_evaluate, eval_config) except Exception as e: raise RuntimeError(f"exception occur while running llmsuite jobs for {suite.path}") from e @@ -135,7 +140,7 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co @LOGGER.catch def _run_llmsuite_pairs(config: Config, suite, namespace: str, - data_namespace_mangling: bool, clients: Clients, skip_evaluate: bool, eval_conf: dict, + data_namespace_mangling: bool, clients: Clients, skip_evaluate: bool, eval_conf: str, output_path: str = None): from fate_llm.evaluate.scripts.eval_cli import run_job_eval client = clients['guest_0'] From 12ceea611cce7eb3e3c16529593a87e16879979e Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 6 Jun 2024 17:21:06 +0800 Subject: [PATCH 19/30] tidy up printout message Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index a4ba894..1f0a054 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -90,7 +90,7 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co suites = _load_testsuites(includes=include, excludes=exclude, glob=glob, provider=provider, 
suffix="llmsuite.yaml", suite_type="llmsuite") for suite in suites: - echo.echo(f"\tllm groups({len(suite.pairs)}) {suite.path}") + echo.echo(f"\tllm suite count: ({len(suite.pairs)}) from {suite.path}") if not yes and not click.confirm("running?"): return @@ -165,6 +165,7 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) echo.file(f"exception({exception_id}), error message:\n{err_msg}") # evaluate_only if job.evaluate_only and not skip_evaluate: + echo.echo(f"Evaluating job: {job.job_name} with tasks: {job.tasks}") job_results[job.job_name] = run_job_eval(job, eval_conf) # run pipeline job then evaluate else: @@ -209,6 +210,7 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) "guest", guest_party_id, model_task_name, "0", "output", "output_model", "model_directory")""" job.peft_path = peft_path + echo.echo(f"Evaluating job: {job.job_name} with tasks: {job.tasks}") try: result = run_job_eval(job, eval_conf) job_results[job_name] = result From 5e61c9fd9009a695754cf1b7a2186369ad694f6e Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 6 Jun 2024 17:49:33 +0800 Subject: [PATCH 20/30] redirect default data upload path to updated yaml file Signed-off-by: Yu Wu --- python/fate_test/_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/fate_test/_config.py b/python/fate_test/_config.py index e00b82d..ebf5479 100644 --- a/python/fate_test/_config.py +++ b/python/fate_test/_config.py @@ -36,11 +36,11 @@ # st_config_directory: examples/flow_test_template/hetero_lr/flow_test_config.yaml # directory stores testsuite file with min_test data sets to upload, -# default location={FATE}/examples/data/upload_config/min_test_data_testsuite.json -min_test_data_config: examples/data/upload_config/min_test_data_testsuite.json +# default location={FATE}/examples/data/upload_config/min_test_data_testsuite.yaml +min_test_data_config: examples/data/upload_config/min_test_data_testsuite.yaml 
# directory stores testsuite file with all example data sets to upload, -# default location={FATE}/examples/data/upload_config/all_examples_data_testsuite.json -all_examples_data_config: examples/data/upload_config/all_examples_data_testsuite.json +# default location={FATE}/examples/data/upload_config/all_examples_data_testsuite.yaml +all_examples_data_config: examples/data/upload_config/all_examples_data_testsuite.yaml # directory where FATE code locates, default installation location={FATE}/fate # python/ml -> $fate_base/python/ml From e2973b391dbc518268b75ae6fef93a8861b36ee7 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 6 Jun 2024 17:49:54 +0800 Subject: [PATCH 21/30] redirect default data upload path to updated yaml file Signed-off-by: Yu Wu --- python/fate_test/fate_test_config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/fate_test/fate_test_config.yaml b/python/fate_test/fate_test_config.yaml index f086391..92c1d63 100644 --- a/python/fate_test/fate_test_config.yaml +++ b/python/fate_test/fate_test_config.yaml @@ -11,11 +11,11 @@ performance_template_directory: examples/benchmark_performance/ flow_test_config_directory: examples/flow_test_template/hetero_lr/flow_test_config.yaml # directory stores testsuite file with min_test data sets to upload, -# default location={FATE}/examples/data/upload_config/min_test_data_testsuite.json -min_test_data_config: examples/data/upload_config/min_test_data_testsuite.json +# default location={FATE}/examples/data/upload_config/min_test_data_testsuite.yaml +min_test_data_config: examples/data/upload_config/min_test_data_testsuite.yaml # directory stores testsuite file with all example data sets to upload, -# default location={FATE}/examples/data/upload_config/all_examples_data_testsuite.json -all_examples_data_config: examples/data/upload_config/all_examples_data_testsuite.json +# default location={FATE}/examples/data/upload_config/all_examples_data_testsuite.yaml 
+all_examples_data_config: examples/data/upload_config/all_examples_data_testsuite.yaml # directory where FATE code locates, default installation location={FATE}/fate # python/ml -> $fate_base/python/ml From 85ec8f09e0050b1a86005bc32fd3560c63b0d895 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 7 Jun 2024 10:59:21 +0800 Subject: [PATCH 22/30] clean up code Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 10 ---------- python/fate_test/scripts/performance_cli.py | 2 -- python/fate_test/scripts/suite_cli.py | 2 -- 3 files changed, 14 deletions(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 1f0a054..2e74606 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -111,14 +111,7 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co from fate_llm.evaluate.utils.config import default_eval_config eval_config = default_eval_config() if not os.path.exists(eval_config): - """eval_config = os.path.abspath(eval_config) - eval_config_dict = {} - with eval_config.open("r") as f: - eval_config_dict.update(yaml.safe_load(f))""" eval_config = None - - """_run_llmsuite_pairs(config_inst, suite, namespace, data_namespace_mangling, client, - skip_evaluate, eval_config_dict)""" _run_llmsuite_pairs(config_inst, suite, namespace, data_namespace_mangling, client, skip_evaluate, eval_config) except Exception as e: @@ -206,9 +199,6 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) "party_id": guest_party_id, "model_task_name": model_task_name} ) - """peft_path = os.path.join(config.fate_base, "fate_flow", "model", job_id, - "guest", guest_party_id, model_task_name, - "0", "output", "output_model", "model_directory")""" job.peft_path = peft_path echo.echo(f"Evaluating job: {job.job_name} with tasks: {job.tasks}") try: diff --git a/python/fate_test/scripts/performance_cli.py 
b/python/fate_test/scripts/performance_cli.py index cc5afc7..ea550fb 100644 --- a/python/fate_test/scripts/performance_cli.py +++ b/python/fate_test/scripts/performance_cli.py @@ -70,8 +70,6 @@ def run_performance(ctx, job_type, include, timeout, epochs, config_inst.update_conf(timeout=timeout) if ctx.obj["engine_run"][0] is not None: config_inst.update_conf(engine_run=dict(ctx.obj["engine_run"])) - """if ctx.obj["auto_increasing_sid"] is not None: - config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" namespace = ctx.obj["namespace"] yes = ctx.obj["yes"] data_namespace_mangling = ctx.obj["namespace_mangling"] diff --git a/python/fate_test/scripts/suite_cli.py b/python/fate_test/scripts/suite_cli.py index 7235c7d..f7eaa4b 100644 --- a/python/fate_test/scripts/suite_cli.py +++ b/python/fate_test/scripts/suite_cli.py @@ -70,8 +70,6 @@ def run_suite(ctx, include, exclude, glob, if timeout is not None: config_inst.update_conf(timeout=timeout) - """if ctx.obj["auto_increasing_sid"] is not None: - config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" if clean_data is None: clean_data = config_inst.clean_data namespace = ctx.obj["namespace"] From 1afa00840ebd75feded89442a9282cfde446f44b Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 7 Jun 2024 16:37:52 +0800 Subject: [PATCH 23/30] allow algorithm option by making include optional Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 2e74606..f327767 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -32,7 +32,7 @@ @click.command("llmsuite") -@click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, +@click.option('-i', '--include', required=False, type=click.Path(exists=True), multiple=True, metavar="", help="include 
*llmsuite.yaml under these paths") @click.option('-e', '--exclude', type=click.Path(exists=True), multiple=True, @@ -76,7 +76,8 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co echo.echo(f"llmsuite namespace: {namespace}", fg='red') echo.echo("loading llmsuites:") if algorithm_suite: - algorithm_suite_path_dict = {"pellm": os.path.join(ctx.obj.get("fate_base"), "fate_llm", "examples")} + algorithm_suite_path_dict = {"pellm": os.path.join(config_inst.fate_base, "fate_llm", "examples", "pellm")} + # algorithm_suite_path_dict = {"pellm": os.path.join(config_inst.fate_base,"examples", "pellm")} suite_paths = [] for alg in algorithm_suite: algorithm_suite_path = algorithm_suite_path_dict.get(alg, None) @@ -84,14 +85,16 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co echo.echo(f"algorithm suite {alg} not found", fg='red') else: suite_paths.append(algorithm_suite_path) - suites = _load_testsuites(includes=suite_paths, excludes=None, glob=None, provider=provider, + suites = _load_testsuites(includes=suite_paths, excludes=[], glob=None, provider=provider, suffix="llmsuite.yaml", suite_type="llmsuite") - else: + elif len(include) > 0: suites = _load_testsuites(includes=include, excludes=exclude, glob=glob, provider=provider, suffix="llmsuite.yaml", suite_type="llmsuite") - for suite in suites: - echo.echo(f"\tllm suite count: ({len(suite.pairs)}) from {suite.path}") - if not yes and not click.confirm("running?"): + for suite in suites: + echo.echo(f"\tllm suite count: ({len(suite.pairs)}) from {suite.path}") + if not yes and not click.confirm("running?"): + return + else: return echo.stdout_newline() From 36966aef16f74b611b7d6a2de3208d85011695f8 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 11 Jun 2024 11:18:01 +0800 Subject: [PATCH 24/30] edit doc Signed-off-by: Yu Wu --- doc/fate_test.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/fate_test.md b/doc/fate_test.md index 
c80efaf..8428d41 100644 --- a/doc/fate_test.md +++ b/doc/fate_test.md @@ -9,7 +9,7 @@ A collection of useful tools to running FATE tests and PipeLine tasks. ```bash pip install -e python/fate_test ``` -2. edit default fate\_test\_config.yaml +2. edit default fate\_test\_config.yaml; edit path to fate base/data base accordingly ```bash # edit priority config file with system default editor From 0e4940824c13ce244d9e5dceb727c6bff775c579 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 11 Jun 2024 17:55:37 +0800 Subject: [PATCH 25/30] load binding table in llmsuite config; add bind table option to fate-test llmsuite subcommand Signed-off-by: Yu Wu --- python/fate_test/_flow_client.py | 85 +++++++----------------- python/fate_test/_parser.py | 16 ++--- python/fate_test/scripts/_utils.py | 52 ++++++++++++++- python/fate_test/scripts/llmsuite_cli.py | 16 ++++- 4 files changed, 97 insertions(+), 72 deletions(-) diff --git a/python/fate_test/_flow_client.py b/python/fate_test/_flow_client.py index 6a29d19..c997bcb 100644 --- a/python/fate_test/_flow_client.py +++ b/python/fate_test/_flow_client.py @@ -41,6 +41,29 @@ def __init__(self, def set_address(self, address): self.address = address + def bind_table(self, data: Data, callback=None): + conf = data.config + conf['file'] = os.path.join(str(self._data_base_dir), conf.get('file')) + path = Path(conf.get('file')) + if not path.exists(): + raise Exception('The file is obtained from the fate flow client machine, but it does not exist, ' + f'please check the path: {path}') + response = self._client.table.bind_path(file=str(path), + namespace=data.namespace, + name=data.table_name) + try: + if callback is not None: + callback(response) + status = str(response['message']).lower() + else: + status = response["message"] + code = response["code"] + if code != 0: + raise RuntimeError(f"Return code {code} != 0, bind path failed") + except BaseException: + raise ValueError(f"Bind path failed, response={response}") + return status + def 
transform_local_file_to_dataframe(self, data: Data, callback=None, output_path=None): #data_warehouse = self.upload_data(data, callback, output_path) #status = self.transform_to_dataframe(data.namespace, data.table_name, data_warehouse, callback) @@ -82,44 +105,6 @@ def upload_file_and_convert_to_dataframe(self, data: Data, callback=None, output self._awaiting(job_id, "local", 0) return status - """def upload_data(self, data: Data, callback=None, output_path=None): - response, file_path = self._upload_data(data, output_path=output_path) - try: - if callback is not None: - callback(response) - code = response["code"] - if code != 0: - raise ValueError(f"Return code {code}!=0") - - namespace = response["data"]["namespace"] - name = response["data"]["name"] - job_id = response["job_id"] - except BaseException: - raise ValueError(f"Upload data fails, response={response}") - # self.monitor_status(job_id, role=self.role, party_id=self.party_id) - self._awaiting(job_id, "local", 0) - - return dict(namespace=namespace, name=name) - - def transform_to_dataframe(self, namespace, table_name, data_warehouse, callback=None): - response = self._client.data.dataframe_transformer(namespace=namespace, - name=table_name, - data_warehouse=data_warehouse) - - try: - if callback is not None: - callback(response) - status = self._awaiting(response["job_id"], "local", 0) - status = str(status).lower() - else: - status = response["retmsg"] - - except Exception as e: - raise RuntimeError(f"upload data failed") from e - job_id = response["job_id"] - self._awaiting(job_id, "local", 0) - return status""" - def delete_data(self, data: Data): try: table_name = data.config['table_name'] if data.config.get( @@ -154,27 +139,6 @@ def _awaiting(self, job_id, role, party_id, callback=None): callback(response) time.sleep(1) - """def _upload_data(self, data, output_path=None, verbose=0, destroy=1): - conf = data.config - # if conf.get("engine", {}) != "PATH": - if output_path is not None: - 
conf['file'] = os.path.join(os.path.abspath(output_path), os.path.basename(conf.get('file'))) - else: - if _config.data_switch is not None: - conf['file'] = os.path.join(str(self._cache_directory), os.path.basename(conf.get('file'))) - else: - conf['file'] = os.path.join(str(self._data_base_dir), conf.get('file')) - path = Path(conf.get('file')) - if not path.exists(): - raise Exception('The file is obtained from the fate flow client machine, but it does not exist, ' - f'please check the path: {path}') - response = self._client.data.upload(file=str(path), - head=data.head, - meta=data.meta, - extend_sid=data.extend_sid, - partitions=data.partitions) - return response, conf["file"]""" - def _output_data_table(self, job_id, role, party_id, task_name): response = self._client.output.data_table(job_id, role=role, party_id=party_id, task_name=task_name) if response.get("code") is not None: @@ -223,7 +187,7 @@ def get_version(self): """def _add_notes(self, job_id, role, party_id, notes): data = dict(job_id=job_id, role=role, party_id=party_id, notes=notes) response = AddNotesResponse(self._post(url='job/update', json=data)) - return response""" + return response def _table_bind(self, data): response = self._post(url='table/bind', json=data) @@ -235,6 +199,7 @@ def _table_bind(self, data): except Exception as e: raise RuntimeError(f"table bind error: {response}") from e return response + """ class Status(object): diff --git a/python/fate_test/_parser.py b/python/fate_test/_parser.py index da4918c..901bd63 100644 --- a/python/fate_test/_parser.py +++ b/python/fate_test/_parser.py @@ -19,7 +19,6 @@ from pathlib import Path import prettytable -# import json from ruamel import yaml from fate_test import _config @@ -62,19 +61,20 @@ def _chain_hooks(hook_funcs, d): class Data(object): - def __init__(self, config: dict, role_str: str): + def __init__(self, config: dict, role_str: str, for_upload=True): self.config = config self.file = config.get("file", "") - self.meta = 
config.get("meta", {}) - self.partitions = config.get("partitions", 4) - self.head = config.get("head", True) - self.extend_sid = config.get("extend_sid", True) self.namespace = config.get("namespace", "") self.table_name = config.get("table_name", "") self.role_str = role_str + if for_upload: + self.meta = config.get("meta", {}) + self.partitions = config.get("partitions", 4) + self.head = config.get("head", True) + self.extend_sid = config.get("extend_sid", True) @staticmethod - def load(config, path: Path): + def load(config, path: Path, for_upload=True): kwargs = {} for field_name in config.keys(): if field_name not in ["file", "role"]: @@ -86,7 +86,7 @@ def load(config, path: Path): else: kwargs["file"] = file_path role_str = config.get("role") if config.get("role") != "guest" else "guest_0" - return Data(config=kwargs, role_str=role_str) + return Data(config=kwargs, role_str=role_str, for_upload=for_upload) def update(self, config: Config): if config.extend_sid is not None: diff --git a/python/fate_test/scripts/_utils.py b/python/fate_test/scripts/_utils.py index 56e7b28..50177fe 100644 --- a/python/fate_test/scripts/_utils.py +++ b/python/fate_test/scripts/_utils.py @@ -9,10 +9,10 @@ from fate_test._client import Clients from fate_test._config import Config -from fate_test._flow_client import DataProgress, UploadDataResponse, QueryJobResponse +from fate_test._flow_client import DataProgress, UploadDataResponse, QueryJobResponse, Status from fate_test._io import echo, LOGGER, set_logger from fate_test._parser import (Testsuite, BenchmarkSuite, PerformanceSuite, FinalStatus, - DATA_LOAD_HOOK, CONF_LOAD_HOOK, DSL_LOAD_HOOK) + DATA_LOAD_HOOK, CONF_LOAD_HOOK, DSL_LOAD_HOOK, Data) def _big_data_task(includes, guest_data_size, host_data_size, guest_feature_num, host_feature_num, host_data_type, @@ -87,8 +87,18 @@ def _find_testsuite_files(path): elif suite_type == "performance": suite = PerformanceSuite.load(suite_path.resolve()) elif suite_type == "llmsuite": + 
from ruamel import yaml from fate_llm.evaluate.utils import LlmSuite suite = LlmSuite.load(suite_path.resolve()) + # add data, if any provided + with suite_path.resolve().open("r") as f: + suite_config = yaml.safe_load(f) + dataset = [] + for d in suite_config.get("data"): + d = DATA_LOAD_HOOK.hook(d) + dataset.append(Data.load(d, suite_path, for_upload=False)) + suite.dataset = dataset + # add job status suite_status = {} for pair in suite.pairs: for job in pair.jobs: @@ -104,6 +114,44 @@ def _find_testsuite_files(path): return suites +@LOGGER.catch +def _bind_data(clients: Clients, suite, config: Config): + with click.progressbar(length=len(suite.dataset), + label="dataset", + show_eta=False, + show_pos=True, + width=24) as bar: + for i, data in enumerate(suite.dataset): + data.update(config) + data_progress = DataProgress(f"{data.role_str}<-{data.namespace}.{data.table_name}") + + def update_bar(n_step): + bar.item_show_func = lambda x: data_progress.show() + time.sleep(0.1) + bar.update(n_step) + + def _call_back(resp): + if isinstance(resp, Status): + echo.file(f"[table] bind: {resp}") + update_bar(0) + + try: + echo.stdout_newline() + status = clients[data.role_str].bind_table(data,_call_back) + time.sleep(1) + if status != 'success': + raise RuntimeError(f"binding {i + 1}th data for {suite.path} {status}") + bar.update(1) + + except Exception: + exception_id = str(uuid.uuid1()) + echo.file(f"exception({exception_id})") + LOGGER.exception(f"exception id: {exception_id}") + echo.echo(f"bind {i + 1}th data {data.config} to {data.role_str} fail, exception_id: {exception_id}") + # raise RuntimeError(f"exception uploading {i + 1}th data") from e + + + @LOGGER.catch def _upload_data(clients: Clients, suite, config: Config, output_path=None, **kwargs): if kwargs.get("partitions") is not None: diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index f327767..0f080df 100644 --- a/python/fate_test/scripts/llmsuite_cli.py 
+++ b/python/fate_test/scripts/llmsuite_cli.py @@ -27,7 +27,7 @@ from fate_test._io import LOGGER, echo from fate_test._parser import record_non_success_jobs, non_success_summary from fate_test.scripts._options import SharedOptions -from fate_test.scripts._utils import _load_testsuites, _load_module_from_script +from fate_test.scripts._utils import _load_testsuites, _load_module_from_script, _bind_data from fate_test.utils import extract_job_status @@ -50,9 +50,14 @@ help='Path to FATE Llm evaluation config. If none, use default config.') @click.option('--skip-evaluate', is_flag=True, default=False, help="skip evaluation after training model") +@click.option("--skip-data", is_flag=True, default=False, + help="skip binding table specified in llmsuite") +@click.option("--data-only", is_flag=True, default=False, + help="bind data only") @SharedOptions.get_shared_options(hidden=True) @click.pass_context -def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_cores, timeout, eval_config, skip_evaluate, **kwargs): +def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_cores, timeout, eval_config, skip_evaluate, + skip_data, data_only, **kwargs): """ process llmsuite """ @@ -108,6 +113,13 @@ def run_llmsuite(ctx, include, exclude, algorithm_suite, glob, provider, task_co start = time.time() echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') os.environ['enable_pipeline_job_info_callback'] = '1' + if not skip_data: + try: + _bind_data(client, suite, config_inst) + except Exception as e: + raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e + if data_only: + continue try: # eval_config_dict = {} if not eval_config: From 4f706ca20faf5f8495c3206a5226c0dd0fa219e8 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 11 Jun 2024 19:02:11 +0800 Subject: [PATCH 26/30] fix bind table api Signed-off-by: Yu Wu --- python/fate_test/_flow_client.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fate_test/_flow_client.py b/python/fate_test/_flow_client.py index c997bcb..49f6bdd 100644 --- a/python/fate_test/_flow_client.py +++ b/python/fate_test/_flow_client.py @@ -48,7 +48,7 @@ def bind_table(self, data: Data, callback=None): if not path.exists(): raise Exception('The file is obtained from the fate flow client machine, but it does not exist, ' f'please check the path: {path}') - response = self._client.table.bind_path(file=str(path), + response = self._client.table.bind_path(path=str(path), namespace=data.namespace, name=data.table_name) try: From 64edd3ca3ebe943679e58a7f98597276bf403b8b Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 11 Jun 2024 19:39:01 +0800 Subject: [PATCH 27/30] fix empty dataset for llmsuite Signed-off-by: Yu Wu --- python/fate_test/scripts/_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/fate_test/scripts/_utils.py b/python/fate_test/scripts/_utils.py index 50177fe..1f0552f 100644 --- a/python/fate_test/scripts/_utils.py +++ b/python/fate_test/scripts/_utils.py @@ -94,7 +94,7 @@ def _find_testsuite_files(path): with suite_path.resolve().open("r") as f: suite_config = yaml.safe_load(f) dataset = [] - for d in suite_config.get("data"): + for d in suite_config.get("data", {}): d = DATA_LOAD_HOOK.hook(d) dataset.append(Data.load(d, suite_path, for_upload=False)) suite.dataset = dataset @@ -116,6 +116,8 @@ def _find_testsuite_files(path): @LOGGER.catch def _bind_data(clients: Clients, suite, config: Config): + if not suite.dataset: + return with click.progressbar(length=len(suite.dataset), label="dataset", show_eta=False, From 48e0e74e5241453251e5ac9f5f05075c44829a82 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 20 Jun 2024 20:13:34 +0800 Subject: [PATCH 28/30] only assign 'failed' status to job if error occurs when running pipeline module Signed-off-by: Yu Wu --- python/fate_test/scripts/llmsuite_cli.py | 78 +++++++++++++----------- 1 file 
changed, 41 insertions(+), 37 deletions(-) diff --git a/python/fate_test/scripts/llmsuite_cli.py b/python/fate_test/scripts/llmsuite_cli.py index 0f080df..1201151 100644 --- a/python/fate_test/scripts/llmsuite_cli.py +++ b/python/fate_test/scripts/llmsuite_cli.py @@ -168,8 +168,9 @@ def _run_llmsuite_pairs(config: Config, suite, namespace: str, def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None): exception_id = str(uuid.uuid1()) - suite.update_status(pair_name=pair.pair_name, job_name=job_name, job_id=job_id, exception_id=exception_id, status=status, - event=event, time_elapsed=time_elapsed) + if status is not None: + suite.update_status(pair_name=pair.pair_name, job_name=job_name, job_id=job_id, exception_id=exception_id, status=status, + event=event, time_elapsed=time_elapsed) echo.file(f"exception({exception_id}), error message:\n{err_msg}") # evaluate_only if job.evaluate_only and not skip_evaluate: @@ -177,50 +178,53 @@ def _raise(err_msg, status="failed", job_id=None, event=None, time_elapsed=None) job_results[job.job_name] = run_job_eval(job, eval_conf) # run pipeline job then evaluate else: - job_name, script_path, conf_path = job.job_name, job.script_path, job.conf_path - param = Config.load_from_file(conf_path) - mod = _load_module_from_script(script_path) - input_params = signature(mod.main).parameters - try: - # pipeline should return pretrained model path - pretrained_model_path = _run_mod(mod, input_params, config, param, - namespace, data_namespace_mangling) - job.pretrained_model_path = pretrained_model_path - job_info = os.environ.get("pipeline_job_info") - job_id, status, time_elapsed, event = extract_job_status(job_info, client, guest_party_id) - suite.update_status(pair_name=pair.pair_name, job_name=job_name, - job_id=job_id, status=status, - time_elapsed=time_elapsed, - event=event) + job_name, script_path, conf_path = job.job_name, job.script_path, job.conf_path + param = Config.load_from_file(conf_path) + mod = 
_load_module_from_script(script_path) + input_params = signature(mod.main).parameters - except Exception as e: - job_info = os.environ.get("pipeline_job_info") - if job_info is None: - job_id, status, time_elapsed, event = None, 'failed', None, None - else: + try: + # pipeline should return pretrained model path + pretrained_model_path = _run_mod(mod, input_params, config, param, + namespace, data_namespace_mangling) + job.pretrained_model_path = pretrained_model_path + job_info = os.environ.get("pipeline_job_info") job_id, status, time_elapsed, event = extract_job_status(job_info, client, guest_party_id) - _raise(e, job_id=job_id, status=status, event=event, time_elapsed=time_elapsed) - os.environ.pop("pipeline_job_info") + suite.update_status(pair_name=pair.pair_name, job_name=job_name, + job_id=job_id, status=status, + time_elapsed=time_elapsed, + event=event) + except Exception as e: + job_info = os.environ.get("pipeline_job_info") + if job_info is None: + job_id, status, time_elapsed, event = None, 'failed', None, None + else: + job_id, status, time_elapsed, event = extract_job_status(job_info, client, guest_party_id) + _raise(e, job_id=job_id, status=status, event=event, time_elapsed=time_elapsed) + os.environ.pop("pipeline_job_info") + continue + except Exception as e: + _raise(f"pipeline failed: {e}", status="not submitted") continue if not skip_evaluate: - model_task_name = "nn_0" - if job.model_task_name: - model_task_name = job.model_task_name - from lm_eval.utils import apply_template - peft_path = apply_template(job.peft_path_format, - {"fate_base": config.fate_base, - "job_id": job_id[0], - "party_id": guest_party_id, - "model_task_name": model_task_name} - ) - job.peft_path = peft_path - echo.echo(f"Evaluating job: {job.job_name} with tasks: {job.tasks}") try: + model_task_name = "nn_0" + if job.model_task_name: + model_task_name = job.model_task_name + from lm_eval.utils import apply_template + peft_path = apply_template(job.peft_path_format, + 
{"fate_base": config.fate_base, + "job_id": job_id[0], + "party_id": guest_party_id, + "model_task_name": model_task_name} + ) + job.peft_path = peft_path + echo.echo(f"Evaluating job: {job.job_name} with tasks: {job.tasks}") result = run_job_eval(job, eval_conf) job_results[job_name] = result except Exception as e: - _raise(f"evaluate failed: {e}") + _raise(f"evaluate failed: {e}", status=None) os.environ.pop("pipeline_job_info") suite_results[pair.pair_name] = job_results From 2233cd0aa55d8bd65a20b50a93bed9fe41502cd5 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 25 Jun 2024 10:24:34 +0800 Subject: [PATCH 29/30] update version Signed-off-by: Yu Wu --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index de698d5..4484593 100644 --- a/python/setup.py +++ b/python/setup.py @@ -20,7 +20,7 @@ setup_kwargs = { "name": "fate-test", - "version": "2.2.0", + "version": "2.1.1", "description": "test tools for FATE", "long_description": 'FATE Test\n=========\n\nA collection of useful tools to running FATE\'s test.\n\n.. image:: images/tutorial.gif\n :align: center\n :alt: tutorial\n\nquick start\n-----------\n\n1. (optional) create virtual env\n\n .. code-block:: bash\n\n python -m venv venv\n source venv/bin/activate\n pip install -U pip\n\n\n2. install fate_test\n\n .. code-block:: bash\n\n pip install fate_test\n fate_test --help\n\n\n3. edit default fate_test_config.yaml\n\n .. code-block:: bash\n\n # edit priority config file with system default editor\n # filling some field according to comments\n fate_test config edit\n\n4. configure FATE-Pipeline and FATE-Flow Commandline server setting\n\n.. code-block:: bash\n\n # configure FATE-Pipeline server setting\n pipeline init --port 9380 --ip 127.0.0.1\n # configure FATE-Flow Commandline server setting\n flow init --port 9380 --ip 127.0.0.1\n\n5. run some fate_test suite\n\n .. code-block:: bash\n\n fate_test suite -i \n\n\n6. 
run some fate_test benchmark\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i \n\n7. useful logs or exception will be saved to logs dir with namespace shown in last step\n\ndevelop install\n---------------\nIt is more convenient to use the editable mode during development: replace step 2 with flowing steps\n\n.. code-block:: bash\n\n pip install -e ${FATE}/python/fate_client && pip install -e ${FATE}/python/fate_test\n\n\n\ncommand types\n-------------\n\n- suite: used for running testsuites, collection of FATE jobs\n\n .. code-block:: bash\n\n fate_test suite -i \n\n\n- benchmark-quality used for comparing modeling quality between FATE and other machine learning systems\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i \n\n\n\nconfiguration by examples\n--------------------------\n\n1. no need ssh tunnel:\n\n - 9999, service: service_a\n - 10000, service: service_b\n\n and both service_a, service_b can be requested directly:\n\n .. code-block:: yaml\n\n work_mode: 1 # 0 for standalone, 1 for cluster\n data_base_dir: \n parties:\n guest: [10000]\n host: [9999, 10000]\n arbiter: [9999]\n services:\n - flow_services:\n - {address: service_a, parties: [9999]}\n - {address: service_b, parties: [10000]}\n\n2. need ssh tunnel:\n\n - 9999, service: service_a\n - 10000, service: service_b\n\n service_a, can be requested directly while service_b don\'t,\n but you can request service_b in other node, say B:\n\n .. code-block:: yaml\n\n work_mode: 0 # 0 for standalone, 1 for cluster\n data_base_dir: \n parties:\n guest: [10000]\n host: [9999, 10000]\n arbiter: [9999]\n services:\n - flow_services:\n - {address: service_a, parties: [9999]}\n - flow_services:\n - {address: service_b, parties: [10000]}\n ssh_tunnel: # optional\n enable: true\n ssh_address: :\n ssh_username: \n ssh_password: # optional\n ssh_priv_key: "~/.ssh/id_rsa"\n\n\nTestsuite\n---------\n\nTestsuite is used for running a collection of jobs in sequence. 
Data used for jobs could be uploaded before jobs are\nsubmitted, and are cleaned when jobs finished. This tool is useful for FATE\'s release test.\n\ncommand options\n~~~~~~~~~~~~~~~\n\n.. code-block:: bash\n\n fate_test suite --help\n\n1. include:\n\n .. code-block:: bash\n\n fate_test suite -i \n\n will run testsuites in *path1*\n\n2. exclude:\n\n .. code-block:: bash\n\n fate_test suite -i -e -e ...\n\n will run testsuites in *path1* but not in *path2* and *path3*\n\n3. glob:\n\n .. code-block:: bash\n\n fate_test suite -i -g "hetero*"\n\n will run testsuites in sub directory start with *hetero* of *path1*\n\n4. replace:\n\n .. code-block:: bash\n\n fate_test suite -i -r \'{"maxIter": 5}\'\n\n will find all key-value pair with key "maxIter" in `data conf` or `conf` or `dsl` and replace the value with 5\n\n\n5. skip-data:\n\n .. code-block:: bash\n\n fate_test suite -i --skip-data\n\n will run testsuites in *path1* without uploading data specified in *benchmark.json*.\n\n\n6. yes:\n\n .. code-block:: bash\n\n fate_test suite -i --yes\n\n will run testsuites in *path1* directly, skipping double check\n\n7. skip-dsl-jobs:\n\n .. code-block:: bash\n\n fate_test suite -i --skip-dsl-jobs\n\n will run testsuites in *path1* but skip all *tasks* in testsuites. It\'s would be useful when only pipeline tasks needed.\n\n8. skip-pipeline-jobs:\n\n .. code-block:: bash\n\n fate_test suite -i --skip-pipeline-jobs\n\n will run testsuites in *path1* but skip all *pipeline tasks* in testsuites. It\'s would be useful when only dsl tasks needed.\n\n\nBenchmark Quality\n------------------\n\nBenchmark-quality is used for comparing modeling quality between FATE\nand other machine learning systems. Benchmark produces a metrics comparison\nsummary for each benchmark job group.\n\n.. code-block:: bash\n\n fate_test benchmark-quality -i examples/benchmark_quality/hetero_linear_regression\n\n.. 
code-block:: bash\n\n +-------+--------------------------------------------------------------+\n | Data | Name |\n +-------+--------------------------------------------------------------+\n | train | {\'guest\': \'motor_hetero_guest\', \'host\': \'motor_hetero_host\'} |\n | test | {\'guest\': \'motor_hetero_guest\', \'host\': \'motor_hetero_host\'} |\n +-------+--------------------------------------------------------------+\n +------------------------------------+--------------------+--------------------+-------------------------+---------------------+\n | Model Name | explained_variance | r2_score | root_mean_squared_error | mean_squared_error |\n +------------------------------------+--------------------+--------------------+-------------------------+---------------------+\n | local-linear_regression-regression | 0.9035168452250094 | 0.9035070863155368 | 0.31340413289880553 | 0.09822215051805216 |\n | FATE-linear_regression-regression | 0.903146386539082 | 0.9031411831961411 | 0.3139977881119483 | 0.09859461093919596 |\n +------------------------------------+--------------------+--------------------+-------------------------+---------------------+\n +-------------------------+-----------+\n | Metric | All Match |\n +-------------------------+-----------+\n | explained_variance | True |\n | r2_score | True |\n | root_mean_squared_error | True |\n | mean_squared_error | True |\n +-------------------------+-----------+\n\ncommand options\n~~~~~~~~~~~~~~~\n\nuse the following command to show help message\n\n.. code-block:: bash\n\n fate_test benchmark-quality --help\n\n1. include:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i \n\n will run benchmark testsuites in *path1*\n\n2. exclude:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i -e -e ...\n\n will run benchmark testsuites in *path1* but not in *path2* and *path3*\n\n3. glob:\n\n .. 
code-block:: bash\n\n fate_test benchmark-quality -i -g "hetero*"\n\n will run benchmark testsuites in sub directory start with *hetero* of *path1*\n\n4. tol:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i -t 1e-3\n\n will run benchmark testsuites in *path1* with absolute tolerance of difference between metrics set to 0.001.\n If absolute difference between metrics is smaller than *tol*, then metrics are considered\n almost equal. Check benchmark testsuite `writing guide <#benchmark-testsuite>`_ on setting alternative tolerance.\n\n5. skip-data:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i --skip-data\n\n will run benchmark testsuites in *path1* without uploading data specified in *benchmark.json*.\n\n\n6. yes:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i --yes\n\n will run benchmark testsuites in *path1* directly, skipping double check\n\n\nbenchmark testsuite\n~~~~~~~~~~~~~~~~~~~\n\nConfiguration of jobs should be specified in a benchmark testsuite whose file name ends\nwith "\\*benchmark.json". For benchmark testsuite example,\nplease refer `here <../../examples/benchmark_quality>`_.\n\nA benchmark testsuite includes the following elements:\n\n- data: list of local data to be uploaded before running FATE jobs\n\n - file: path to original data file to be uploaded, should be relative to testsuite or FATE installation path\n - head: whether file includes header\n - partition: number of partition for data storage\n - table_name: table name in storage\n - namespace: table namespace in storage\n - role: which role to upload the data, as specified in fate_test.config;\n naming format is: "{role_type}_{role_index}", index starts at 0\n\n .. 
code-block:: json\n\n "data": [\n {\n "file": "examples/data/motor_hetero_host.csv",\n "head": 1,\n "partition": 8,\n "table_name": "motor_hetero_host",\n "namespace": "experiment",\n "role": "host_0"\n }\n ]\n\n- job group: each group includes arbitrary number of jobs with paths to corresponding script and configuration\n\n - job: name of job to be run, must be unique within each group list\n\n - script: path to `testing script <#testing-script>`_, should be relative to testsuite\n - conf: path to job configuration file for script, should be relative to testsuite\n\n .. code-block:: json\n\n "local": {\n "script": "./local-linr.py",\n "conf": "./linr_config.yaml"\n }\n\n - compare_setting: additional setting for quality metrics comparison, currently only takes ``relative_tol``\n\n If metrics *a* and *b* satisfy *abs(a-b) <= max(relative_tol \\* max(abs(a), abs(b)), absolute_tol)*\n (from `math module `_),\n they are considered almost equal. In the below example, metrics from "local" and "FATE" jobs are\n considered almost equal if their relative difference is smaller than\n *0.05 \\* max(abs(local_metric), abs(pipeline_metric)*.\n\n .. code-block:: json\n\n "linear_regression-regression": {\n "local": {\n "script": "./local-linr.py",\n "conf": "./linr_config.yaml"\n },\n "FATE": {\n "script": "./fate-linr.py",\n "conf": "./linr_config.yaml"\n },\n "compare_setting": {\n "relative_tol": 0.01\n }\n }\n\n\ntesting script\n~~~~~~~~~~~~~~\n\nAll job scripts need to have ``Main`` function as an entry point for executing jobs; scripts should\nreturn two dictionaries: first with data information key-value pairs: {data_type}: {data_name_dictionary};\nthe second contains {metric_name}: {metric_value} key-value pairs for metric comparison.\n\nBy default, the final data summary shows the output from the job named "FATE"; if no such job exists,\ndata information returned by the first job is shown. 
For clear presentation, we suggest that user follow\nthis general `guideline <../../examples/data/README.md#data-set-naming-rule>`_ for data set naming. In the case of multi-host\ntask, consider numbering host as such:\n\n::\n\n {\'guest\': \'default_credit_homo_guest\',\n \'host_1\': \'default_credit_homo_host_1\',\n \'host_2\': \'default_credit_homo_host_2\'}\n\nReturned quality metrics of the same key are to be compared.\nNote that only **real-value** metrics can be compared.\n\n- FATE script: ``Main`` always has three inputs:\n\n - config: job configuration, `JobConfig <../fate_client/pipeline/utils/tools.py#L64>`_ object loaded from "fate_test_config.yaml"\n - param: job parameter setting, dictionary loaded from "conf" file specified in benchmark testsuite\n - namespace: namespace suffix, user-given *namespace* or generated timestamp string when using *namespace-mangling*\n\n- non-FATE script: ``Main`` always has one input:\n\n - param: job parameter setting, dictionary loaded from "conf" file specified in benchmark testsuite\n\n\ndata\n----\n\n`Data` sub-command is used for upload or delete dataset in suite\'s.\n\ncommand options\n~~~~~~~~~~~~~~~\n\n.. code-block:: bash\n\n fate_test data --help\n\n1. include:\n\n .. code-block:: bash\n\n fate_test data [upload|delete] -i \n\n will upload/delete dataset in testsuites in *path1*\n\n2. exclude:\n\n .. code-block:: bash\n\n fate_test data [upload|delete] -i -e -e ...\n\n will upload/delete dataset in testsuites in *path1* but not in *path2* and *path3*\n\n3. glob:\n\n .. code-block:: bash\n\n fate_test data [upload|delete] -i -g "hetero*"\n\n will upload/delete dataset in testsuites in sub directory start with *hetero* of *path1*\n\n\nfull command options\n---------------------\n\n.. 
click:: fate_test.scripts.cli:cli\n :prog: fate_test\n :show-nested:\n', "author": "FederatedAI", From c1069276c344e6f2e98a0a722641f6f3e45433f9 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 26 Jun 2024 14:57:20 +0800 Subject: [PATCH 30/30] edit doc Signed-off-by: Yu Wu --- RELEASE.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index edbd1b7..8567d14 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,8 @@ +## Release 2.1.1 +### Major Features and Improvements +> Fate-Test: FATE Automated Testing Tool +* Add new subcommand `llmsuite` for FATE-LLM training and evaluation + ## Release 2.1.0 ### Major Features and Improvements > Fate-Test: FATE Automated Testing Tool