From ad684d9118b0cd2502e42e5ce0f6a28abd573c64 Mon Sep 17 00:00:00 2001 From: tejasathalye Date: Fri, 27 Sep 2024 18:51:30 +0530 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Run=20pre-commit=20checks=20and=20f?= =?UTF-8?q?ix=20issues=20in=20GenAI=20code=20style?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/causal_lm_cpp.yml | 47 +- .github/workflows/llm_bench-python.yml | 7 +- llm_bench/python/benchmark.py | 1009 +++++++---- llm_bench/python/convert.py | 407 +++-- .../python/llm_bench_utils/config_class.py | 189 +- .../better_transformer_patch.py | 37 +- .../conversion_utils/export_configs.py | 39 +- .../conversion_utils/helpers.py | 107 +- .../llm_bench_utils/hook_beam_search.py | 780 +++++---- .../python/llm_bench_utils/hook_common.py | 11 +- .../python/llm_bench_utils/hook_forward.py | 27 +- .../llm_bench_utils/hook_greedy_search.py | 558 +++--- .../python/llm_bench_utils/hook_sample.py | 364 ++-- .../python/llm_bench_utils/memory_profile.py | 22 +- .../python/llm_bench_utils/metrics_print.py | 210 ++- .../python/llm_bench_utils/model_utils.py | 255 +-- .../python/llm_bench_utils/nncf_utils.py | 13 +- .../python/llm_bench_utils/output_csv.py | 218 ++- .../python/llm_bench_utils/output_file.py | 101 +- .../python/llm_bench_utils/output_json.py | 110 +- .../llm_bench_utils/ov_model_classes.py | 169 +- llm_bench/python/llm_bench_utils/ov_utils.py | 140 +- llm_bench/python/llm_bench_utils/pt_utils.py | 219 ++- .../examples/openvino_batched_eval.py | 73 +- .../who_what_benchmark/tests/test_cli.py | 150 +- .../whowhatbench/__init__.py | 1 + .../whowhatbench/evaluator.py | 31 +- .../whowhatbench/whowhat_metrics.py | 5 +- .../who_what_benchmark/whowhatbench/wwb.py | 39 +- pyproject.toml | 34 + .../beam_search_causal_lm.py | 8 +- .../python/benchmark_genai/benchmark_genai.py | 62 +- samples/python/chat_sample/chat_sample.py | 12 +- .../greedy_causal_lm/greedy_causal_lm.py | 8 +- .../multinomial_causal_lm.py | 63 +- tests/python_tests/common.py | 215 ++- tests/python_tests/conftest.py | 19 +- tests/python_tests/ov_genai_test_utils.py | 117 +- .../python_tests/test_cache_optimizations.py | 158 +- tests/python_tests/test_chat_generate_api.py | 203 ++- tests/python_tests/test_generate_api.py | 635 ++++--- tests/python_tests/test_preemption.py | 282 ++- tests/python_tests/test_sampling.py | 360 ++-- tests/python_tests/tokenizer_configs.py | 1539 ++++++++--------- thirdparty/openvino_tokenizers | 1 - tools/cacheviz/__init__.py | 1 - tools/cacheviz/cacheviz.py | 269 ++- 47 files changed, 5729 insertions(+), 3595 deletions(-) delete mode 160000 thirdparty/openvino_tokenizers diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 8029eda1dc..75963030a9 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -16,9 +16,35 @@ env: l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/m_openvino_toolkit_macos_12_6_2024.5.0.dev20240830_x86_64.tgz w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip + jobs: + lint-checks: + name: Run Pre-Commit Lint Checks + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: 
actions/checkout@v3 + - name: Set up Python 3.x + uses: actions/setup-python@v4 + with: + python-version: '3.x' + - name: Install dependencies + run: | + pip install black flake8 pre-commit + - name: Run Black + run: black --check . + - name: Check Git Diff + run: git --no-pager diff --check $(git hash-object -t tree /dev/null) + - name: Prohibit non-ASCII characters in file names + run: | + test $(git diff --name-only --diff-filter=A -z $(git hash-object -t tree /dev/null) | LC_ALL=C tr -d '[ -~]\0' | wc -c) == 0 + - name: Check for non-ASCII characters in files + run: "! git grep -n '[^ -~]' -- ':(exclude)SECURITY.md' ':(exclude)LICENSE' ':(exclude)third-party-programs.txt' ':(exclude)tests/python_tests/README.md' ':(exclude)llm_bench/python/README.md' ':(exclude)samples/cpp/beam_search_casual_lm/README.md' ':(exclude)samples/cpp/benchmark_genai/README.md' ':(exclude)samples/cpp/chat_sample/README.md' ':(exclude)samples/cpp/greedy_casual_lm/README.md' ':(exclude)samples/cpp/multinomial_causal_lm/README.md' ':(exclude)samples/cpp/prompt_lookup_decoding_lm/README.md' ':(exclude)samples/cpp/speculative_decoding_lm/README.md' ':(exclude)samples/cpp/stable_diffusion/README.md' ':(exclude)samples/cpp/whisper_speech_recognition/README.md' ':(exclude)samples/python/beam_search_casual_lm/README.md' ':(exclude)samples/python/benchmark_genai/README.md' ':(exclude)samples/python/chat_sample/README.md' ':(exclude)samples/python/greedy_casual_lm/README.md' ':(exclude)samples/python/multinomial_causal_lm/README.md' ':(exclude)samples/python/whisper_speech_recognition/README.md' ':(exclude)src/README.md' ':(exclude)src/docs/BUILD.md' ':(exclude)src/docs/DOCKER.md' ':(exclude)src/docs/SUPPORTED_MODELS.md'" + cpp-multinomial-greedy_causal_lm-ubuntu: + name: Build and Test C++ Multinomial Greedy Causal LM runs-on: ubuntu-20.04-8-cores + needs: lint-checks defaults: run: shell: bash @@ -45,19 +71,20 @@ jobs: python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 - - run: > + - name: Run C++ Multinomial Causal LM + run: | . ./ov/setupvars.sh - && PYTHONPATH=./build/:$PYTHONPATH timeout 25s - ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a - - run: > + PYTHONPATH=./build/:$PYTHONPATH timeout 25s ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a + - name: Run Python Multinomial Causal LM + run: | . ./ov/setupvars.sh - && PYTHONPATH=./build/:$PYTHONPATH timeout 25s - ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b - - run: > + PYTHONPATH=./build/:$PYTHONPATH timeout 25s ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b + - name: Compare Outputs + run: | . 
./ov/setupvars.sh - && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" - | diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") - + export PYTHONPATH=./build/:$PYTHONPATH + timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" | \ + diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") - cpp-beam_search_causal_lm-ubuntu: strategy: diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 45e6dc2941..6b72388c1c 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -40,8 +40,7 @@ jobs: python -m pip install --upgrade pip python -m pip install flake8 pytest black GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt - python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url -https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }} @@ -75,6 +74,7 @@ https://storage.openvinotoolkit.org/simple/wheels/nightly - name: WWB Tests run: | python -m pytest llm_bench/python/who_what_benchmark/tests + stateful: runs-on: ubuntu-20.04 steps: @@ -86,8 +86,7 @@ https://storage.openvinotoolkit.org/simple/wheels/nightly run: | GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r llm_bench/python/requirements.txt python -m pip uninstall --yes openvino - python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url -https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . 
--stateful grep beam_idx pytorch/dldt/FP32/openvino_model.xml - name: WWB Tests diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 7fb6d1757b..94901881c0 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -25,7 +25,7 @@ import llm_bench_utils.output_json import llm_bench_utils.output_file -FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils} +FW_UTILS = {"pt": llm_bench_utils.pt_utils, "ov": llm_bench_utils.ov_utils} DEFAULT_INFERENCE_STEPS = 20 LCM_DEFAULT_INFERENCE_STEPS = 4 @@ -41,92 +41,120 @@ def gen_iterate_data( - iter_idx='', - in_size='', - infer_count='', - out_size='', - gen_time='', - latency='', - res_md5='', - max_rss_mem='', - max_shared_mem='', - max_uss_mem='', - prompt_idx='', + iter_idx="", + in_size="", + infer_count="", + out_size="", + gen_time="", + latency="", + res_md5="", + max_rss_mem="", + max_shared_mem="", + max_uss_mem="", + prompt_idx="", tokenization_time=[], ): iter_data = {} - iter_data['iteration'] = iter_idx - iter_data['input_size'] = in_size - iter_data['infer_count'] = infer_count - iter_data['output_size'] = out_size - iter_data['generation_time'] = gen_time - iter_data['latency'] = latency - iter_data['result_md5'] = res_md5 - iter_data['first_token_latency'] = '' - iter_data['other_tokens_avg_latency'] = '' - iter_data['first_token_infer_latency'] = '' - iter_data['other_tokens_infer_avg_latency'] = '' - iter_data['max_rss_mem_consumption'] = max_rss_mem - iter_data['max_shared_mem_consumption'] = max_shared_mem - iter_data['max_uss_mem_consumption'] = max_uss_mem - iter_data['prompt_idx'] = prompt_idx - iter_data['tokenization_time'] = tokenization_time[0] if len(tokenization_time) > 0 else '' - iter_data['detokenization_time'] = tokenization_time[1] if len(tokenization_time) > 1 else '' + iter_data["iteration"] = iter_idx + iter_data["input_size"] = in_size + iter_data["infer_count"] = infer_count + iter_data["output_size"] = out_size + iter_data["generation_time"] = gen_time + iter_data["latency"] = latency + iter_data["result_md5"] = res_md5 + iter_data["first_token_latency"] = "" + iter_data["other_tokens_avg_latency"] = "" + iter_data["first_token_infer_latency"] = "" + iter_data["other_tokens_infer_avg_latency"] = "" + iter_data["max_rss_mem_consumption"] = max_rss_mem + iter_data["max_shared_mem_consumption"] = max_shared_mem + iter_data["max_uss_mem_consumption"] = max_uss_mem + iter_data["prompt_idx"] = prompt_idx + iter_data["tokenization_time"] = ( + tokenization_time[0] if len(tokenization_time) > 0 else "" + ) + iter_data["detokenization_time"] = ( + tokenization_time[1] if len(tokenization_time) > 1 else "" + ) return iter_data -def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, bench_hook, model_precision, proc_id): - set_seed(args['seed']) - input_text_list = [input_text] * args['batch_size'] +def run_text_generation( + input_text, + num, + model, + tokenizer, + args, + iter_data_list, + md5_list, + prompt_index, + bench_hook, + model_precision, + proc_id, +): + set_seed(args["seed"]) + input_text_list = [input_text] * args["batch_size"] if args["output_dir"] is not None and num == 0: for bs_index, in_text in enumerate(input_text_list): - llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + llm_bench_utils.output_file.output_input_text( + in_text, args, model_precision, prompt_index, bs_index, proc_id + ) tok_encode_start = 
time.perf_counter() - input_data = tokenizer(input_text_list, return_tensors='pt') + input_data = tokenizer(input_text_list, return_tensors="pt") tok_encode_end = time.perf_counter() tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 - input_data.pop('token_type_ids', None) + input_data.pop("token_type_ids", None) # Remove `token_type_ids` from inputs - input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data + input_tokens = input_data["input_ids"] if "input_ids" in input_data else input_data input_token_size = input_tokens[0].numel() - if args['batch_size'] > 1: - out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) - out_str += " Batch_size={}, ".format(args['batch_size']) - out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) - if args['infer_count'] is not None: - out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) + if args["batch_size"] > 1: + out_str = "[warm-up]" if num == 0 else "[{}]".format(num) + out_str += " Batch_size={}, ".format(args["batch_size"]) + out_str += "all input token size after padding: {} * {}, ".format( + input_token_size, args["batch_size"] + ) + if args["infer_count"] is not None: + out_str += "all max_output_token_size: {} * {}".format( + args["infer_count"], args["batch_size"] + ) log.info(out_str) - max_rss_mem_consumption = '' - max_uss_mem_consumption = '' - max_shared_mem_consumption = '' - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + max_rss_mem_consumption = "" + max_uss_mem_consumption = "" + max_shared_mem_consumption = "" + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: mem_consumption.start_collect_memory_consumption() - max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] + max_gen_tokens = ( + DEFAULT_OUTPUT_TOKEN_SIZE + if args["infer_count"] is None + else args["infer_count"] + ) start = time.perf_counter() - if args['infer_count'] is not None and args['end_token_stopping'] is False: + if args["infer_count"] is not None and args["end_token_stopping"] is False: model.generation_config.eos_token_id = None model.config.eos_token_id = None result = model.generate( **input_data, max_new_tokens=int(max_gen_tokens), - num_beams=args['num_beams'], + num_beams=args["num_beams"], use_cache=True, eos_token_id=None, - do_sample=False + do_sample=False, ) else: result = model.generate( **input_data, max_new_tokens=int(max_gen_tokens), - num_beams=args['num_beams'], + num_beams=args["num_beams"], use_cache=True, - do_sample=False + do_sample=False, ) end = time.perf_counter() - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = ( + mem_consumption.get_max_memory_consumption() + ) mem_consumption.clear_max_memory_consumption() generation_time = end - start @@ -137,41 +165,55 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, # Only text_gen need to minus length of input_data, because generated_text may include input_text num_tokens = 0 result_md5_list = [] - for bs_idx in range(args['batch_size']): - if 'sum' not in 
args['model_name'] and result[bs_idx][:input_token_size].equal(input_tokens[bs_idx]): + for bs_idx in range(args["batch_size"]): + if "sum" not in args["model_name"] and result[bs_idx][:input_token_size].equal( + input_tokens[bs_idx] + ): generated_token_size = len(result[bs_idx]) - input_tokens[bs_idx].numel() else: generated_token_size = len(result[bs_idx]) # Encoder-decoder models expect the `decoder_input_ids` to start with a special token # When counting the output length, subtract 1. The last token does not participate in inference. - if model.config.is_encoder_decoder and result[bs_idx][0] == model.config.decoder_start_token_id: + if ( + model.config.is_encoder_decoder + and result[bs_idx][0] == model.config.decoder_start_token_id + ): generated_token_size = generated_token_size - 1 num_tokens += generated_token_size if generated_token_size > max_gen_tokens: - log.error('Output token size is over max output token size!') + log.error("Output token size is over max output token size!") result_text = generated_text[bs_idx] if args["output_dir"] is not None: - llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) - result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) + llm_bench_utils.output_file.output_gen_text( + result_text, args, model_precision, prompt_index, num, bs_idx, proc_id + ) + result_md5_list.append( + hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest() + ) if len(md5_list[num]) == 0: - md5_list[num] = {prompt_index : result_md5_list} + md5_list[num] = {prompt_index: result_md5_list} else: md5_list[num][prompt_index] = result_md5_list - per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + per_token_time = generation_time * 1000 / (num_tokens / args["batch_size"]) tm_list = [] tm_infer_list = [] if bench_hook is not None: tm_list = bench_hook.get_time_list() - log.debug('latency of all tokens:') - [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + log.debug("latency of all tokens:") + [log.debug("[{}]{:.4f}".format(idx, tm)) for idx, tm in enumerate(tm_list)] tm_infer_list = bench_hook.get_time_infer_list() - log.debug('latency of all infers:') - [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_infer_list)] - if args['num_beams'] == 1 and generated_token_size != len(tm_infer_list): - log.warning(f'Output token size({generated_token_size}) is not equal to infer count({len(tm_infer_list)})') + log.debug("latency of all infers:") + [ + log.debug("[{}]{:.4f}".format(idx, tm)) + for idx, tm in enumerate(tm_infer_list) + ] + if args["num_beams"] == 1 and generated_token_size != len(tm_infer_list): + log.warning( + f"Output token size({generated_token_size}) is not equal to infer count({len(tm_infer_list)})" + ) iter_data = gen_iterate_data( num, - input_token_size * args['batch_size'], + input_token_size * args["batch_size"], len(tm_infer_list), num_tokens, generation_time, @@ -181,7 +223,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, prompt_idx=prompt_index, - tokenization_time=(tok_encode_time, tok_decode_time) + tokenization_time=(tok_encode_time, tok_decode_time), ) iter_data_list.append(iter_data) llm_bench_utils.metrics_print.print_metrics( @@ -194,59 +236,91 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, 
max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, tokenization_time=(tok_encode_time, tok_decode_time), - batch_size=args['batch_size'] + batch_size=args["batch_size"], ) if num > 0: prev_md5 = md5_list[num - 1][prompt_index] if result_md5_list != prev_md5: - log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " - f"is different from md5 of the {num - 1} iteration {prev_md5}") - llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + log.warning( + f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}" + ) + llm_bench_utils.metrics_print.print_generated( + num, warm_up=(num == 0), generated=generated_text[0] + ) if num == 1: # if the device is CPU, throw exception - if args['devices'].lower().startswith('cpu') is True: - assert (result_md5_list == prev_md5) + if args["devices"].lower().startswith("cpu") is True: + assert result_md5_list == prev_md5 else: # throw exception - assert (result_md5_list == prev_md5) + assert result_md5_list == prev_md5 else: - llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + llm_bench_utils.metrics_print.print_generated( + num, warm_up=(num == 0), generated=generated_text[0] + ) if bench_hook is not None: bench_hook.clear_time_list() bench_hook.clear_time_infer_list() -def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id): - set_seed(args['seed']) - input_text_list = [input_text] * args['batch_size'] +def run_text_generation_genai( + input_text, + num, + model, + tokenizer, + args, + iter_data_list, + md5_list, + prompt_index, + streamer, + model_precision, + proc_id, +): + set_seed(args["seed"]) + input_text_list = [input_text] * args["batch_size"] if args["output_dir"] is not None and num == 0: for bs_index, in_text in enumerate(input_text_list): - llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + llm_bench_utils.output_file.output_input_text( + in_text, args, model_precision, prompt_index, bs_index, proc_id + ) pt_inputs = tokenizer(input_text_list, return_tensors="pt") input_token_size = pt_inputs.input_ids.shape[1] - if args['batch_size'] > 1: - out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) - out_str += " Batch_size={}, ".format(args['batch_size']) - out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) - if args['infer_count'] is not None: - out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) + if args["batch_size"] > 1: + out_str = "[warm-up]" if num == 0 else "[{}]".format(num) + out_str += " Batch_size={}, ".format(args["batch_size"]) + out_str += "all input token size after padding: {} * {}, ".format( + input_token_size, args["batch_size"] + ) + if args["infer_count"] is not None: + out_str += "all max_output_token_size: {} * {}".format( + args["infer_count"], args["batch_size"] + ) log.info(out_str) - max_rss_mem_consumption = '' - max_uss_mem_consumption = '' - max_shared_mem_consumption = '' - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + max_rss_mem_consumption = "" + max_uss_mem_consumption = "" + max_shared_mem_consumption = "" + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: 
mem_consumption.start_collect_memory_consumption() - max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] + max_gen_tokens = ( + DEFAULT_OUTPUT_TOKEN_SIZE + if args["infer_count"] is None + else args["infer_count"] + ) start = time.perf_counter() - generation_result = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"]) + generation_result = model.generate( + input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"] + ) end = time.perf_counter() generated_text = generation_result.texts perf_metrics = generation_result.perf_metrics - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = ( + mem_consumption.get_max_memory_consumption() + ) mem_consumption.clear_max_memory_consumption() generation_time = end - start @@ -254,30 +328,34 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data # Only text_gen need to minus length of input_data, because generated_text may include input_text num_tokens = 0 result_md5_list = [] - for bs_idx in range(args['batch_size']): + for bs_idx in range(args["batch_size"]): generated_text_len = len(generated_tokens[bs_idx]) num_tokens += generated_text_len if generated_text_len > max_gen_tokens: - log.error('Output token size is over max output token size!') + log.error("Output token size is over max output token size!") result_text = generated_text[bs_idx] if args["output_dir"] is not None: - llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) - result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) + llm_bench_utils.output_file.output_gen_text( + result_text, args, model_precision, prompt_index, num, bs_idx, proc_id + ) + result_md5_list.append( + hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest() + ) if len(md5_list[num]) == 0: - md5_list[num] = {prompt_index : result_md5_list} + md5_list[num] = {prompt_index: result_md5_list} else: md5_list[num][prompt_index] = result_md5_list - per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + per_token_time = generation_time * 1000 / (num_tokens / args["batch_size"]) tm_list = np.array(perf_metrics.raw_metrics.m_durations) / 1000 / 1000 - log.debug('latency of all tokens:') - [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + log.debug("latency of all tokens:") + [log.debug("[{}]{:.4f}".format(idx, tm)) for idx, tm in enumerate(tm_list)] tokenization_time = ( np.mean(perf_metrics.raw_metrics.tokenization_durations) / 1000, - np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000 + np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000, ) iter_data = gen_iterate_data( num, - input_token_size * args['batch_size'], + input_token_size * args["batch_size"], len(tm_list), num_tokens, generation_time, @@ -287,7 +365,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, prompt_idx=prompt_index, - 
tokenization_time=tokenization_time + tokenization_time=tokenization_time, ) iter_data_list.append(iter_data) llm_bench_utils.metrics_print.print_metrics( @@ -300,31 +378,51 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, tokenization_time=tokenization_time, - batch_size=args['batch_size'] + batch_size=args["batch_size"], ) if num > 0: prev_md5 = md5_list[num - 1][prompt_index] if result_md5_list != prev_md5: - log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " - f"is different from md5 of the {num - 1} iteration {prev_md5}") - llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + log.warning( + f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}" + ) + llm_bench_utils.metrics_print.print_generated( + num, warm_up=(num == 0), generated=generated_text[0] + ) if num == 1: # if the device is CPU, throw exception - if args['devices'].lower().startswith('cpu') is True: - assert (result_md5_list == prev_md5) + if args["devices"].lower().startswith("cpu") is True: + assert result_md5_list == prev_md5 else: # throw exception - assert (result_md5_list == prev_md5) + assert result_md5_list == prev_md5 else: - llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + llm_bench_utils.metrics_print.print_generated( + num, warm_up=(num == 0), generated=generated_text[0] + ) -def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id): - set_seed(args['seed']) - input_text_list = [input_text] * args['batch_size'] +def run_text_generation_genai_with_stream( + input_text, + num, + model, + tokenizer, + args, + iter_data_list, + md5_list, + prompt_index, + streamer, + model_precision, + proc_id, +): + set_seed(args["seed"]) + input_text_list = [input_text] * args["batch_size"] if args["output_dir"] is not None and num == 0: for bs_index, in_text in enumerate(input_text_list): - llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + llm_bench_utils.output_file.output_input_text( + in_text, args, model_precision, prompt_index, bs_index, proc_id + ) pt_inputs = tokenizer(input_text_list, return_tensors="pt") input_token_size = pt_inputs.input_ids.shape[1] pipe_tokenizer = model.get_tokenizer() @@ -332,26 +430,41 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg input_data = pipe_tokenizer.encode(input_text_list) tok_encode_end = time.perf_counter() tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 - if args['batch_size'] > 1: - out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) - out_str += " Batch_size={}, ".format(args['batch_size']) - out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) - if args['infer_count'] is not None: - out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) + if args["batch_size"] > 1: + out_str = "[warm-up]" if num == 0 else "[{}]".format(num) + out_str += " Batch_size={}, ".format(args["batch_size"]) + out_str += "all input token size after padding: {} * {}, ".format( + input_token_size, args["batch_size"] + ) + if args["infer_count"] is not None: + out_str += "all max_output_token_size: {} * 
{}".format( + args["infer_count"], args["batch_size"] + ) log.info(out_str) - max_rss_mem_consumption = '' - max_uss_mem_consumption = '' - max_shared_mem_consumption = '' - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + max_rss_mem_consumption = "" + max_uss_mem_consumption = "" + max_shared_mem_consumption = "" + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: mem_consumption.start_collect_memory_consumption() - max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] + max_gen_tokens = ( + DEFAULT_OUTPUT_TOKEN_SIZE + if args["infer_count"] is None + else args["infer_count"] + ) streamer.reset() start = time.perf_counter() - generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).tokens + generated_tokens = model.generate( + input_data, + max_new_tokens=max_gen_tokens, + num_beams=args["num_beams"], + streamer=streamer, + ).tokens end = time.perf_counter() - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = ( + mem_consumption.get_max_memory_consumption() + ) mem_consumption.clear_max_memory_consumption() generation_time = end - start tok_decode_start = time.perf_counter() @@ -361,26 +474,30 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg # Only text_gen need to minus length of input_data, because generated_text may include input_text num_tokens = 0 result_md5_list = [] - for bs_idx in range(args['batch_size']): + for bs_idx in range(args["batch_size"]): generated_text_len = len(generated_tokens[bs_idx]) num_tokens += generated_text_len if generated_text_len > max_gen_tokens: - log.error('Output token size is over max output token size!') + log.error("Output token size is over max output token size!") result_text = generated_text[bs_idx] if args["output_dir"] is not None: - llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) - result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) + llm_bench_utils.output_file.output_gen_text( + result_text, args, model_precision, prompt_index, num, bs_idx, proc_id + ) + result_md5_list.append( + hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest() + ) if len(md5_list[num]) == 0: - md5_list[num] = {prompt_index : result_md5_list} + md5_list[num] = {prompt_index: result_md5_list} else: md5_list[num][prompt_index] = result_md5_list - per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + per_token_time = generation_time * 1000 / (num_tokens / args["batch_size"]) tm_list = streamer.get_time_list() - log.debug('latency of all tokens:') - [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + log.debug("latency of all tokens:") + [log.debug("[{}]{:.4f}".format(idx, tm)) for idx, tm in enumerate(tm_list)] iter_data = gen_iterate_data( num, - input_token_size * args['batch_size'], + input_token_size * args["batch_size"], len(tm_list), num_tokens, generation_time, @@ -390,7 +507,7 @@ def 
run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, prompt_idx=prompt_index, - tokenization_time=(tok_encode_time, tok_decode_time) + tokenization_time=(tok_encode_time, tok_decode_time), ) iter_data_list.append(iter_data) llm_bench_utils.metrics_print.print_metrics( @@ -403,46 +520,58 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, tokenization_time=(tok_encode_time, tok_decode_time), - batch_size=args['batch_size'] + batch_size=args["batch_size"], ) if num > 0: prev_md5 = md5_list[num - 1][prompt_index] if result_md5_list != prev_md5: - log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " - f"is different from md5 of the {num - 1} iteration {prev_md5}") - llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + log.warning( + f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}" + ) + llm_bench_utils.metrics_print.print_generated( + num, warm_up=(num == 0), generated=generated_text[0] + ) if num == 1: # if the device is CPU, throw exception - if args['devices'].lower().startswith('cpu') is True: - assert (result_md5_list == prev_md5) + if args["devices"].lower().startswith("cpu") is True: + assert result_md5_list == prev_md5 else: # throw exception - assert (result_md5_list == prev_md5) + assert result_md5_list == prev_md5 else: - llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + llm_bench_utils.metrics_print.print_generated( + num, warm_up=(num == 0), generated=generated_text[0] + ) streamer.reset() def run_text_generation_benchmark(model_path, framework, device, args, num_iters): - model, tokenizer, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_text_gen_model(model_path, device, **args) + model, tokenizer, pretrain_time, bench_hook, use_genai = FW_UTILS[ + framework + ].create_text_gen_model(model_path, device, **args) model_precision = llm_bench_utils.model_utils.get_model_precision(model_path.parts) iter_data_list = [] - md5_list = {num : {} for num in range(num_iters + 1)} + md5_list = {num: {} for num in range(num_iters + 1)} input_text_list = llm_bench_utils.model_utils.get_prompts(args) - if args['prompt_index'] is None: - prompt_idx_list = [prompt_idx for prompt_idx, input_text in enumerate(input_text_list)] + if args["prompt_index"] is None: + prompt_idx_list = [ + prompt_idx for prompt_idx, input_text in enumerate(input_text_list) + ] text_list = input_text_list else: prompt_idx_list = [] text_list = [] - for i in args['prompt_index']: + for i in args["prompt_index"]: if 0 <= i < len(input_text_list): text_list.append(input_text_list[i]) prompt_idx_list.append(i) if len(input_text_list) == 0: - raise RuntimeError('==Failure prompts is empty ==') - log.info(f"Numbeams: {args['num_beams']}, benchmarking iter nums(exclude warm-up): {num_iters}, " - f'prompt nums: {len(text_list)}, prompt idx: {prompt_idx_list}') + raise RuntimeError("==Failure prompts is empty ==") + log.info( + f"Numbeams: {args['num_beams']}, benchmarking iter nums(exclude warm-up): {num_iters}, " + f"prompt nums: {len(text_list)}, prompt idx: {prompt_idx_list}" + ) # if num_iters == 0, just output warm-up data if not use_genai: @@ -452,62 +581,113 @@ def 
run_text_generation_benchmark(model_path, framework, device, args, num_iters else: text_gen_fn = run_text_generation_genai proc_id = os.getpid() - if args['subsequent'] is False: + if args["subsequent"] is False: for num in range(num_iters + 1): for idx, input_text in enumerate(text_list): if num == 0: - log.info(f'[warm-up] Input text: {input_text}') - text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_idx_list[idx], bench_hook, model_precision, proc_id) + log.info(f"[warm-up] Input text: {input_text}") + text_gen_fn( + input_text, + num, + model, + tokenizer, + args, + iter_data_list, + md5_list, + prompt_idx_list[idx], + bench_hook, + model_precision, + proc_id, + ) else: for idx, input_text in enumerate(text_list): for num in range(num_iters + 1): if num == 0: - log.info(f'[warm-up] Input text: {input_text}') - text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_idx_list[idx], bench_hook, model_precision, proc_id) + log.info(f"[warm-up] Input text: {input_text}") + text_gen_fn( + input_text, + num, + model, + tokenizer, + args, + iter_data_list, + md5_list, + prompt_idx_list[idx], + bench_hook, + model_precision, + proc_id, + ) - llm_bench_utils.metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], True) + llm_bench_utils.metrics_print.print_average( + iter_data_list, prompt_idx_list, args["batch_size"], True + ) return iter_data_list, pretrain_time -def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id): - set_seed(args['seed']) - input_text = image_param['prompt'] - image_width = image_param.get('width', DEFAULT_IMAGE_WIDTH) - image_height = image_param.get('height', DEFAULT_IMAGE_HEIGHT) - nsteps = image_param.get('steps', DEFAULT_INFERENCE_STEPS if 'lcm' not in args["model_name"] else LCM_DEFAULT_INFERENCE_STEPS) - guidance_scale = image_param.get('guidance_scale', None) +def run_image_generation( + image_param, num, image_id, pipe, args, iter_data_list, proc_id +): + set_seed(args["seed"]) + input_text = image_param["prompt"] + image_width = image_param.get("width", DEFAULT_IMAGE_WIDTH) + image_height = image_param.get("height", DEFAULT_IMAGE_HEIGHT) + nsteps = image_param.get( + "steps", + ( + DEFAULT_INFERENCE_STEPS + if "lcm" not in args["model_name"] + else LCM_DEFAULT_INFERENCE_STEPS + ), + ) + guidance_scale = image_param.get("guidance_scale", None) log.info( f"[{'warm-up' if num == 0 else num}] Input params: Batch_size={args['batch_size']}, " - f'steps={nsteps}, width={image_width}, height={image_height}, guidance_scale={guidance_scale}' + f"steps={nsteps}, width={image_width}, height={image_height}, guidance_scale={guidance_scale}" ) result_md5_list = [] - max_rss_mem_consumption = '' - max_uss_mem_consumption = '' - max_shared_mem_consumption = '' - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + max_rss_mem_consumption = "" + max_uss_mem_consumption = "" + max_shared_mem_consumption = "" + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: mem_consumption.start_collect_memory_consumption() additional_args = {} if guidance_scale is not None: additional_args["guidance_scale"] = guidance_scale else: - if 'lcm-sdxl' in args['model_type']: + if "lcm-sdxl" in args["model_type"]: additional_args["guidance_scale"] = 1.0 - if 'turbo' in args['model_name']: + if "turbo" in args["model_name"]: additional_args["guidance_scale"] = 0.0 - input_text_list = [input_text] * args['batch_size'] + 
input_text_list = [input_text] * args["batch_size"] if num == 0 and args["output_dir"] is not None: for bs_idx, in_text in enumerate(input_text_list): - llm_bench_utils.output_file.output_image_input_text(in_text, args, image_id, bs_idx, proc_id) + llm_bench_utils.output_file.output_image_input_text( + in_text, args, image_id, bs_idx, proc_id + ) start = time.perf_counter() - res = pipe(input_text_list, num_inference_steps=nsteps, height=image_height, width=image_width, **additional_args).images + res = pipe( + input_text_list, + num_inference_steps=nsteps, + height=image_height, + width=image_width, + **additional_args, + ).images end = time.perf_counter() - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = ( + mem_consumption.get_max_memory_consumption() + ) mem_consumption.clear_max_memory_consumption() - for bs_idx in range(args['batch_size']): - rslt_img_fn = llm_bench_utils.output_file.output_gen_image(res[bs_idx], args, image_id, num, bs_idx, proc_id, '.png') - result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes(), usedforsecurity=False).hexdigest()) + for bs_idx in range(args["batch_size"]): + rslt_img_fn = llm_bench_utils.output_file.output_gen_image( + res[bs_idx], args, image_id, num, bs_idx, proc_id, ".png" + ) + result_md5_list.append( + hashlib.md5( + Image.open(rslt_img_fn).tobytes(), usedforsecurity=False + ).hexdigest() + ) generation_time = end - start iter_data = gen_iterate_data( iter_idx=num, @@ -527,56 +707,90 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, - stable_diffusion=stable_diffusion_hook + stable_diffusion=stable_diffusion_hook, + ) + llm_bench_utils.metrics_print.print_generated( + num, warm_up=(num == 0), generated=rslt_img_fn ) - llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn) stable_diffusion_hook.clear_statistics() def run_image_generation_benchmark(model_path, framework, device, args, num_iters): - if args['genai']: - log.warning("GenAI pipeline is not supported for this task. Switched on default benchmarking") - pipe, pretrain_time = FW_UTILS[framework].create_image_gen_model(model_path, device, **args) + if args["genai"]: + log.warning( + "GenAI pipeline is not supported for this task. 
Switched on default benchmarking" + ) + pipe, pretrain_time = FW_UTILS[framework].create_image_gen_model( + model_path, device, **args + ) iter_data_list = [] - input_image_list = llm_bench_utils.model_utils.get_image_param_from_prompt_file(args) + input_image_list = llm_bench_utils.model_utils.get_image_param_from_prompt_file( + args + ) if framework == "ov": stable_diffusion_hook.new_text_encoder(pipe) stable_diffusion_hook.new_unet(pipe) stable_diffusion_hook.new_vae_decoder(pipe) - if args['prompt_index'] is None: - prompt_idx_list = [image_id for image_id, input_text in enumerate(input_image_list)] + if args["prompt_index"] is None: + prompt_idx_list = [ + image_id for image_id, input_text in enumerate(input_image_list) + ] image_list = input_image_list else: prompt_idx_list = [] image_list = [] - for i in args['prompt_index']: + for i in args["prompt_index"]: if 0 <= i < len(input_image_list): image_list.append(input_image_list[i]) prompt_idx_list.append(i) if len(image_list) == 0: - raise RuntimeError('==Failure prompts is empty ==') - log.info(f'Benchmarking iter nums(exclude warm-up): {num_iters}, prompt nums: {len(image_list)}, prompt idx: {prompt_idx_list}') + raise RuntimeError("==Failure prompts is empty ==") + log.info( + f"Benchmarking iter nums(exclude warm-up): {num_iters}, prompt nums: {len(image_list)}, prompt idx: {prompt_idx_list}" + ) # if num_iters == 0, just output warm-up data proc_id = os.getpid() - if args['subsequent'] is False: + if args["subsequent"] is False: for num in range(num_iters + 1): for image_id, image_param in enumerate(image_list): - run_image_generation(image_param, num, prompt_idx_list[image_id], pipe, args, iter_data_list, proc_id) + run_image_generation( + image_param, + num, + prompt_idx_list[image_id], + pipe, + args, + iter_data_list, + proc_id, + ) else: for image_id, image_param in enumerate(image_list): for num in range(num_iters + 1): - run_image_generation(image_param, num, prompt_idx_list[image_id], pipe, args, iter_data_list, proc_id) + run_image_generation( + image_param, + num, + prompt_idx_list[image_id], + pipe, + args, + iter_data_list, + proc_id, + ) - llm_bench_utils.metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], False) + llm_bench_utils.metrics_print.print_average( + iter_data_list, prompt_idx_list, args["batch_size"], False + ) return iter_data_list, pretrain_time def run_image_classification(model_path, framework, device, args, num_iters=10): - if args['genai']: - log.warning("GenAI pipeline is not supported for this task. Switched on default benchmarking") - model, input_size = FW_UTILS[framework].create_image_classification_model(model_path, device, **args) + if args["genai"]: + log.warning( + "GenAI pipeline is not supported for this task. 
Switched on default benchmarking" + ) + model, input_size = FW_UTILS[framework].create_image_classification_model( + model_path, device, **args + ) data = torch.rand(input_size) @@ -589,40 +803,55 @@ def run_image_classification(model_path, framework, device, args, num_iters=10): generation_time = end - start test_time.append(generation_time) - iter_data = gen_iterate_data(iter_idx=num, in_size=input_size, infer_count=num_iters, gen_time=generation_time) + iter_data = gen_iterate_data( + iter_idx=num, + in_size=input_size, + infer_count=num_iters, + gen_time=generation_time, + ) iter_data_list.append(iter_data) - log.info(f'Processed {num_iters} images in {np.sum(test_time)}s') - log.info(f'Average processing time {np.mean(test_time)} s') + log.info(f"Processed {num_iters} images in {np.sum(test_time)}s") + log.info(f"Average processing time {np.mean(test_time)} s") return iter_data_list -def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, image_id, tm_list, proc_id): - set_seed(args['seed']) - nsteps = img.get('steps', DEFAULT_SUPER_RESOLUTION_STEPS) - resize_image_width = img.get('width', DEFAULT_SUPER_RESOLUTION_WIDTH) - resize_image_height = img.get('height', DEFAULT_SUPER_RESOLUTION_HEIGHT) +def run_ldm_super_resolution( + img, num, pipe, args, framework, iter_data_list, image_id, tm_list, proc_id +): + set_seed(args["seed"]) + nsteps = img.get("steps", DEFAULT_SUPER_RESOLUTION_STEPS) + resize_image_width = img.get("width", DEFAULT_SUPER_RESOLUTION_WIDTH) + resize_image_height = img.get("height", DEFAULT_SUPER_RESOLUTION_HEIGHT) log.info( f"[{'warm-up' if num == 0 else num}] Input params: steps={nsteps}, " - f'resize_width={resize_image_width}, resize_height={resize_image_height}' + f"resize_width={resize_image_width}, resize_height={resize_image_height}" ) - low_res_img = PIL.Image.open(img['prompt']).convert('RGB') + low_res_img = PIL.Image.open(img["prompt"]).convert("RGB") low_res_img = low_res_img.resize((resize_image_width, resize_image_height)) - max_rss_mem_consumption = '' - max_uss_mem_consumption = '' - max_shared_mem_consumption = '' - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + max_rss_mem_consumption = "" + max_uss_mem_consumption = "" + max_shared_mem_consumption = "" + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: mem_consumption.start_collect_memory_consumption() start = time.perf_counter() res = pipe(low_res_img, num_inference_steps=nsteps, tm_list=tm_list) end = time.perf_counter() - if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + if (args["mem_consumption"] == 1 and num == 0) or args["mem_consumption"] == 2: mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = ( + mem_consumption.get_max_memory_consumption() + ) mem_consumption.clear_max_memory_consumption() result_md5_list = [] - if framework == 'ov': - rslt_img_fn = llm_bench_utils.output_file.output_gen_image(res[0], args, image_id, num, None, proc_id, '.png') - result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes(), usedforsecurity=False).hexdigest()) + if framework == "ov": + rslt_img_fn = llm_bench_utils.output_file.output_gen_image( + res[0], args, image_id, num, None, proc_id, ".png" + ) + result_md5_list.append( + hashlib.md5( + Image.open(rslt_img_fn).tobytes(), 
usedforsecurity=False + ).hexdigest() + ) generation_time = end - start iter_data = gen_iterate_data( @@ -642,52 +871,67 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im warm_up=(num == 0), max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, - max_uss_mem=max_uss_mem_consumption + max_uss_mem=max_uss_mem_consumption, + ) + llm_bench_utils.metrics_print.print_generated( + num, warm_up=(num == 0), generated=rslt_img_fn + ) + llm_bench_utils.metrics_print.print_ldm_unet_vqvae_infer_latency( + num, iter_data, tm_list, warm_up=(num == 0) ) - llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn) - llm_bench_utils.metrics_print.print_ldm_unet_vqvae_infer_latency(num, iter_data, tm_list, warm_up=(num == 0)) def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_iters): if args["genai"]: - log.warning("GenAI pipeline is not supported for this task. Switched on default benchmarking") - pipe, pretrain_time = FW_UTILS[framework].create_ldm_super_resolution_model(model_path, device, **args) + log.warning( + "GenAI pipeline is not supported for this task. Switched on default benchmarking" + ) + pipe, pretrain_time = FW_UTILS[framework].create_ldm_super_resolution_model( + model_path, device, **args + ) iter_data_list = [] tm_list = [] - input_image_list = llm_bench_utils.model_utils.get_image_param_from_prompt_file(args) + input_image_list = llm_bench_utils.model_utils.get_image_param_from_prompt_file( + args + ) if len(input_image_list) > 0: images = [] for image in input_image_list: - if args['prompt'] is None and args['prompt_file'] is None: - raise RuntimeError('==Failure image is empty ==') - elif args['prompt_file'] is not None and len(args['prompt_file']) > 0: - image['prompt'] = os.path.join(os.path.dirname(args['prompt_file'][0]), image['prompt'].replace('./', '')) - image['prompt'] = Path(image['prompt']) + if args["prompt"] is None and args["prompt_file"] is None: + raise RuntimeError("==Failure image is empty ==") + elif args["prompt_file"] is not None and len(args["prompt_file"]) > 0: + image["prompt"] = os.path.join( + os.path.dirname(args["prompt_file"][0]), + image["prompt"].replace("./", ""), + ) + image["prompt"] = Path(image["prompt"]) images.append(image) else: - if args['images'] is not None: - images = Path(args['images']) + if args["images"] is not None: + images = Path(args["images"]) if images.is_dir(): - images = list(images.glob('*')) + images = list(images.glob("*")) else: images = [images] else: - raise RuntimeError('==Failure image is empty ==') + raise RuntimeError("==Failure image is empty ==") prompt_idx_list = [image_id for image_id, image_param in enumerate(images)] - if args['prompt_index'] is None: + if args["prompt_index"] is None: prompt_idx_list = [image_id for image_id, input_text in enumerate(images)] image_list = images else: prompt_idx_list = [] image_list = [] - for i in args['prompt_index']: + for i in args["prompt_index"]: if 0 <= i < len(images): image_list.append(images[i]) prompt_idx_list.append(i) if len(image_list) == 0: - raise RuntimeError('==Failure prompts is empty ==') - log.info(f'Benchmarking iter nums(exclude warm-up): {num_iters}, prompt nums: {len(image_list)}, prompt idx: {prompt_idx_list}') + raise RuntimeError("==Failure prompts is empty ==") + log.info( + f"Benchmarking iter nums(exclude warm-up): {num_iters}, prompt nums: {len(image_list)}, prompt idx: {prompt_idx_list}" + ) # if num_iters == 0, just output 
warm-up data proc_id = os.getpid() @@ -695,11 +939,29 @@ def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_ for image_id, img in enumerate(image_list): if num == 0: if args["output_dir"] is not None: - llm_bench_utils.output_file.output_image_input_text(str(img['prompt']), args, prompt_idx_list[image_id], None, proc_id) + llm_bench_utils.output_file.output_image_input_text( + str(img["prompt"]), + args, + prompt_idx_list[image_id], + None, + proc_id, + ) log.info(f"[{'warm-up' if num == 0 else num}] Input image={img['prompt']}") - run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, prompt_idx_list[image_id], tm_list, proc_id) + run_ldm_super_resolution( + img, + num, + pipe, + args, + framework, + iter_data_list, + prompt_idx_list[image_id], + tm_list, + proc_id, + ) tm_list.clear() - llm_bench_utils.metrics_print.print_average(iter_data_list, prompt_idx_list, 1, False) + llm_bench_utils.metrics_print.print_average( + iter_data_list, prompt_idx_list, 1, False + ) return iter_data_list, pretrain_time @@ -707,161 +969,244 @@ def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_ def num_iters_type(x): x = int(x) if x < 0: - raise argparse.ArgumentTypeError('Minimum input value is 0') + raise argparse.ArgumentTypeError("Minimum input value is 0") return x def num_infer_count_type(x): x = int(x) if x < 1: - raise argparse.ArgumentTypeError('Minimum input value is 1') + raise argparse.ArgumentTypeError("Minimum input value is 1") return x def get_argprser(): - parser = argparse.ArgumentParser('LLM benchmarking tool', add_help=True, formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files', required=TabError) - parser.add_argument('-d', '--device', default='cpu', help='inference device') - parser.add_argument('-r', '--report', help='report csv') - parser.add_argument('-rj', '--report_json', help='report json') - parser.add_argument('-f', '--framework', default='ov', help='framework') - parser.add_argument('-p', '--prompt', default=None, help='one prompt') - parser.add_argument('-pf', '--prompt_file', nargs='+', default=None, - help='Prompt file(s) in jsonl format. Multiple prompt files should be separated with space(s).') - parser.add_argument('-pi', '--prompt_index', nargs='+', type=num_iters_type, default=None, - help='Run the specified prompt index. You can specify multiple prompt indexes, separated by spaces.') + parser = argparse.ArgumentParser( + "LLM benchmarking tool", + add_help=True, + formatter_class=argparse.RawTextHelpFormatter, + ) parser.add_argument( - '-ic', - '--infer_count', + "-m", + "--model", + help="model folder including IR files or Pytorch files", + required=TabError, + ) + parser.add_argument("-d", "--device", default="cpu", help="inference device") + parser.add_argument("-r", "--report", help="report csv") + parser.add_argument("-rj", "--report_json", help="report json") + parser.add_argument("-f", "--framework", default="ov", help="framework") + parser.add_argument("-p", "--prompt", default=None, help="one prompt") + parser.add_argument( + "-pf", + "--prompt_file", + nargs="+", + default=None, + help="Prompt file(s) in jsonl format. Multiple prompt files should be separated with space(s).", + ) + parser.add_argument( + "-pi", + "--prompt_index", + nargs="+", + type=num_iters_type, + default=None, + help="Run the specified prompt index. 
You can specify multiple prompt indexes, separated by spaces.", + ) + parser.add_argument( + "-ic", + "--infer_count", default=None, type=num_infer_count_type, - help='set the output token size, the value must be greater than 0.' + help="set the output token size, the value must be greater than 0.", ) parser.add_argument( - '-n', - '--num_iters', + "-n", + "--num_iters", default=0, type=num_iters_type, - help='number of benchmarking iterations, ' - 'if the value is greater than 0, the average numbers exclude the first(0th) iteration,\n' - 'if the value equals 0 (default), execute the warm-up iteration(0th iteration).', + help="number of benchmarking iterations, " + "if the value is greater than 0, the average numbers exclude the first(0th) iteration,\n" + "if the value equals 0 (default), execute the warm-up iteration(0th iteration).", ) - parser.add_argument('-i', '--images', default=None, help='test images for vision tasks. Can be directory or path to single image') - parser.add_argument('-s', '--seed', type=int, default=42, required=False, help='specific random seed to generate fix result. Default 42.') parser.add_argument( - '-lc', - '--load_config', + "-i", + "--images", default=None, + help="test images for vision tasks. Can be directory or path to single image", + ) + parser.add_argument( + "-s", + "--seed", + type=int, + default=42, required=False, - help='path to JSON file to load customized configurations.\n' - 'Example for OpenVINO: {\"INFERENCE_NUM_THREADS\":32,\"PERFORMANCE_HINT\":\"LATENCY\"}.\n' - 'Example for Pytorch: {\"PREC_BF16\":true}. Pytorch currently only supports bf16 settings.\n', + help="specific random seed to generate fix result. Default 42.", ) parser.add_argument( - '-mc', - '--memory_consumption', + "-lc", + "--load_config", + default=None, + required=False, + help="path to JSON file to load customized configurations.\n" + 'Example for OpenVINO: {"INFERENCE_NUM_THREADS":32,"PERFORMANCE_HINT":"LATENCY"}.\n' + 'Example for Pytorch: {"PREC_BF16":true}. Pytorch currently only supports bf16 settings.\n', + ) + parser.add_argument( + "-mc", + "--memory_consumption", default=0, required=False, type=int, - help='if the value is 1, output the maximum memory consumption in warm-up iterations. If the value is 2,' - ' output the maximum memory consumption in all iterations.', + help="if the value is 1, output the maximum memory consumption in warm-up iterations. If the value is 2," + " output the maximum memory consumption in all iterations.", ) - parser.add_argument('-bs', '--batch_size', type=int, default=1, required=False, help='Batch size value') parser.add_argument( - '--fuse_decoding_strategy', - action='store_true', - help='Add decoding postprocessing for next token selection to the model as an extra ops. Original hf_model.generate function will be patched.', + "-bs", + "--batch_size", + type=int, + default=1, + required=False, + help="Batch size value", ) parser.add_argument( - '--save_prepared_model', + "--fuse_decoding_strategy", + action="store_true", + help="Add decoding postprocessing for next token selection to the model as an extra ops. 
Original hf_model.generate function will be patched.", + ) + parser.add_argument( + "--save_prepared_model", default=None, - help='Path to .xml file to save IR used for inference with all pre-/post processing included', + help="Path to .xml file to save IR used for inference with all pre-/post processing included", + ) + parser.add_argument( + "--num_beams", + type=int, + default=1, + help="Number of beams in the decoding strategy, activates beam_search if greater than 1", ) - parser.add_argument('--num_beams', type=int, default=1, help='Number of beams in the decoding strategy, activates beam_search if greater than 1') parser.add_argument( - '--torch_compile_backend', - default='openvino', + "--torch_compile_backend", + default="openvino", required=False, - help='Enables running the torch.compile() with specified backend: pytorch or openvino (default)', + help="Enables running the torch.compile() with specified backend: pytorch or openvino (default)", ) parser.add_argument( - '--torch_compile_dynamic', - action='store_true', - help='Enables dynamic shape tracking for torch.compile()', + "--torch_compile_dynamic", + action="store_true", + help="Enables dynamic shape tracking for torch.compile()", ) parser.add_argument( - '--torch_compile_options', + "--torch_compile_options", default=None, required=False, - help='Options for torch.compile() in JSON format', + help="Options for torch.compile() in JSON format", ) parser.add_argument( - '--torch_compile_input_module', + "--torch_compile_input_module", default=None, required=False, - help='Specifies the module to decorate with torch.compile(). By default, parent module will be decorated.', + help="Specifies the module to decorate with torch.compile(). By default, parent module will be decorated.", + ) + parser.add_argument( + "--convert_tokenizer", + action="store_true", + help="Convert tokenizer to OpenVINO format", ) parser.add_argument( - '--convert_tokenizer', action='store_true', help='Convert tokenizer to OpenVINO format' + "--subsequent", + action="store_true", + help="if the value is True, input prompts are processed in subsequent manner" + "if the value is False (default), input prompts are processed in interleave manner", ) parser.add_argument( - '--subsequent', - action='store_true', - help='if the value is True, input prompts are processed in subsequent manner' - 'if the value is False (default), input prompts are processed in interleave manner' + "-od", + "--output_dir", + help="Save the input text and generated text, images to files", ) - parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files') llm_bench_utils.model_utils.add_stateful_model_arguments(parser) - parser.add_argument("--genai", action="store_true", help="Use OpenVINO GenAI optimized pipelines for benchmarking") - parser.add_argument("--use_cb", action="store_true", help="Use Continuous Batching inference mode") - parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings") parser.add_argument( - '--end_token_stopping', - action='store_true', - help='Stop the generation even output token size does not achieve infer_count or max token size ({DEFAULT_OUTPUT_TOKEN_SIZE}}).' 
+        "--genai",
+        action="store_true",
+        help="Use OpenVINO GenAI optimized pipelines for benchmarking",
+    )
+    parser.add_argument(
+        "--use_cb", action="store_true", help="Use Continuous Batching inference mode"
+    )
+    parser.add_argument(
+        "--cb_config",
+        required=False,
+        default=None,
+        help="Path to file with Continuous Batching Scheduler settings",
+    )
+    parser.add_argument(
+        "--end_token_stopping",
+        action="store_true",
+        help="Stop the generation even if the output token size does not reach infer_count or the max token size ({DEFAULT_OUTPUT_TOKEN_SIZE}).",
     )
     return parser.parse_args()


 CASE_TO_BENCH = {
-    'text_gen': run_text_generation_benchmark,
-    'image_gen': run_image_generation_benchmark,
-    'image_cls': run_image_classification,
-    'code_gen': run_text_generation_benchmark,
-    'ldm_super_resolution': run_ldm_super_resolution_benchmark,
+    "text_gen": run_text_generation_benchmark,
+    "image_gen": run_image_generation_benchmark,
+    "image_cls": run_image_classification,
+    "code_gen": run_text_generation_benchmark,
+    "ldm_super_resolution": run_ldm_super_resolution_benchmark,
 }


 def main():
     logging_kwargs = {"encoding": "utf-8"} if sys.version_info[1] > 8 else {}
-    log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout, **logging_kwargs)
+    log.basicConfig(
+        format="[ %(levelname)s ] %(message)s",
+        level=os.environ.get("LOGLEVEL", log.INFO),
+        stream=sys.stdout,
+        **logging_kwargs,
+    )
     args = get_argprser()
-    model_path, framework, model_args, model_name = llm_bench_utils.model_utils.analyze_args(args)
+    model_path, framework, model_args, model_name = (
+        llm_bench_utils.model_utils.analyze_args(args)
+    )
     # Set the device for running OpenVINO backend for torch.compile()
-    if model_args['torch_compile_backend']:
+    if model_args["torch_compile_backend"]:
         ov_torch_backend_device = str(args.device)
-        os.putenv('OPENVINO_TORCH_BACKEND_DEVICE', ov_torch_backend_device.upper())
-        os.system('echo [ INFO ] OPENVINO_TORCH_BACKEND_DEVICE=$OPENVINO_TORCH_BACKEND_DEVICE')
-
-    out_str = 'Model path={}'.format(model_path)
-    if framework == 'ov':
-        out_str += ', openvino runtime version: {}'.format(get_version())
-    if model_args['config'].get('PREC_BF16') and model_args['config']['PREC_BF16'] is True:
-        log.warning('[Warning] Param bf16/prec_bf16 only work for framework pt. It will be disabled.')
+        os.putenv("OPENVINO_TORCH_BACKEND_DEVICE", ov_torch_backend_device.upper())
+        os.system(
+            "echo [ INFO ] OPENVINO_TORCH_BACKEND_DEVICE=$OPENVINO_TORCH_BACKEND_DEVICE"
+        )
+
+    out_str = "Model path={}".format(model_path)
+    if framework == "ov":
+        out_str += ", openvino runtime version: {}".format(get_version())
+    if (
+        model_args["config"].get("PREC_BF16")
+        and model_args["config"]["PREC_BF16"] is True
+    ):
+        log.warning(
+            "[Warning] Param bf16/prec_bf16 only works for framework pt. It will be disabled."
+ ) log.info(out_str) if args.memory_consumption: mem_consumption.start_collect_mem_consumption_thread() try: - iter_data_list, pretrain_time = CASE_TO_BENCH[model_args['use_case']](model_path, framework, args.device, model_args, args.num_iters) + iter_data_list, pretrain_time = CASE_TO_BENCH[model_args["use_case"]]( + model_path, framework, args.device, model_args, args.num_iters + ) if args.report is not None or args.report_json is not None: - model_precision = '' - if framework == 'ov': - ir_conversion_frontend = llm_bench_utils.model_utils.get_ir_conversion_frontend(model_name, model_path.parts) - if ir_conversion_frontend != '': - framework = framework + '(' + ir_conversion_frontend + ')' - model_precision = llm_bench_utils.model_utils.get_model_precision(model_path.parts) + model_precision = "" + if framework == "ov": + ir_conversion_frontend = ( + llm_bench_utils.model_utils.get_ir_conversion_frontend( + model_name, model_path.parts + ) + ) + if ir_conversion_frontend != "": + framework = framework + "(" + ir_conversion_frontend + ")" + model_precision = llm_bench_utils.model_utils.get_model_precision( + model_path.parts + ) if args.report is not None: llm_bench_utils.output_csv.write_result( args.report, @@ -885,7 +1230,7 @@ def main(): model_precision, ) except Exception: - log.error('An exception occurred') + log.error("An exception occurred") log.info(traceback.format_exc()) exit(1) finally: @@ -893,5 +1238,5 @@ def main(): mem_consumption.end_collect_mem_consumption_thread() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/llm_bench/python/convert.py b/llm_bench/python/convert.py index ae676bc269..a1d61504ce 100644 --- a/llm_bench/python/convert.py +++ b/llm_bench/python/convert.py @@ -51,7 +51,9 @@ from llm_bench_utils.nncf_utils import get_compressed_path from llm_bench_utils.model_utils import add_stateful_model_arguments from optimum.exporters.openvino.utils import flattenize_inputs -from llm_bench_utils.conversion_utils.convert_patch import patch_model_for_optimum_export +from llm_bench_utils.conversion_utils.convert_patch import ( + patch_model_for_optimum_export, +) from llm_bench_utils.conversion_utils.better_transformer_patch import ( register_bettertransformer_config, ) @@ -108,9 +110,13 @@ def compress_torchmodels( if is_wrapped_model(submodel): dataset = None else: - dummy_inputs = sub_export_config.generate_dummy_inputs(framework="pt", **dummy_shapes) + dummy_inputs = sub_export_config.generate_dummy_inputs( + framework="pt", **dummy_shapes + ) dataset = nncf.Dataset([dummy_inputs]) - compressed_submodel = nncf.compress_weights(submodel, dataset=dataset, **compression_options) + compressed_submodel = nncf.compress_weights( + submodel, dataset=dataset, **compression_options + ) models_and_export_configs[model_name] = (compressed_submodel, sub_export_config) return models_and_export_configs @@ -129,7 +135,9 @@ def convert_optimum_causallm_base(model, args, model_config=None, compress_only= if not compress_only: model_config = model.config model = patch_model_for_optimum_export(model) - precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + precision = ( + precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + ) ov_out_dir = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / precision if gptq_applied and args.compress_weights: log.info("Weights compression will be skipped for GPTQ models") @@ -148,12 +156,17 @@ def convert_optimum_causallm_base(model, args, model_config=None, 
compress_only= preprocessors=None, _variant="default", monolith=False, - library_name="transformers" + library_name="transformers", ) if "decoder_with_past_model" in models_and_export_configs: - models_and_export_configs = {"model": models_and_export_configs["decoder_with_past_model"]} + models_and_export_configs = { + "model": models_and_export_configs["decoder_with_past_model"] + } model.config.save_pretrained(ov_out_dir) - files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] + files_subpaths = [ + "openvino_" + model_name + ".xml" + for model_name in models_and_export_configs.keys() + ] export_models( models_and_export_configs=models_and_export_configs, output_dir=ov_out_dir, @@ -175,7 +188,9 @@ def convert_optimum_causallm_base(model, args, model_config=None, compress_only= ) for compress_option in args.compress_weights: log.info(f"Compress model weights to {compress_option}") - optimized_dir = get_compressed_path(args.output_dir, args.precision, compress_option) + optimized_dir = get_compressed_path( + args.output_dir, args.precision, compress_option + ) model_config.save_pretrained(optimized_dir) fp_path = get_fp_path(args, "openvino_model.xml") ir_model = Core().read_model(fp_path) @@ -214,7 +229,7 @@ def convert_optimum_causallm_base(model, args, model_config=None, compress_only= preprocessors=None, _variant="default", monolith=False, - library_name="transformers" + library_name="transformers", ) compression_options = COMPRESSION_OPTIONS[compress_mode] @@ -229,7 +244,9 @@ def convert_optimum_causallm_base(model, args, model_config=None, compress_only= Path(args.output_dir) / PYTORCH_DIR / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=precision, compression=compress_mode) + / PYTORCH_COMPRESS_WEIGHTS_DIR.format( + precision=precision, compression=compress_mode + ) ) model.config.save_pretrained(pt_out_dir) export_models( @@ -275,7 +292,11 @@ def convert_causal_lm(args): def convert_seq2seq(args): config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) - tokenizer_id = args.model_id if "blenderbot-9B" not in args.model_id else "facebook/blenderbot-3B" + tokenizer_id = ( + args.model_id + if "blenderbot-9B" not in args.model_id + else "facebook/blenderbot-3B" + ) tok = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True) pt_compress_weights = is_torch_compression(args) if args.save_orig or pt_compress_weights: @@ -293,7 +314,9 @@ def convert_seq2seq(args): for cw in args.compress_weights: if is_int8_compression(cw): compression_modes.append(cw) - assert compression_modes, "Only INT8 compression supported for PyTorch backend" + assert ( + compression_modes + ), "Only INT8 compression supported for PyTorch backend" for idx, compress_mode in enumerate(compression_modes): if idx > 0: pt_model = AutoModelForSeq2SeqLM.from_pretrained( @@ -302,11 +325,17 @@ def convert_seq2seq(args): config=config, ) - export_config_constructor = TasksManager.get_exporter_config_constructor( - model=pt_model, exporter="openvino", task="text2text-generation" + export_config_constructor = ( + TasksManager.get_exporter_config_constructor( + model=pt_model, exporter="openvino", task="text2text-generation" + ) + ) + export_config = export_config_constructor( + pt_model.config, use_past=True + ) + models_and_export_configs = get_encoder_decoder_models_for_export( + pt_model, export_config ) - export_config = export_config_constructor(pt_model.config, use_past=True) - models_and_export_configs = 
get_encoder_decoder_models_for_export(pt_model, export_config) compression_options = COMPRESSION_OPTIONS[compress_mode] models_and_export_configs = compress_torchmodels( @@ -315,7 +344,9 @@ def convert_seq2seq(args): encoder_file_name = Path("encoder") / OV_ENCODER_NAME decoder_file_name = Path("decoder") / OV_DECODER_NAME - decoder_with_past_file_name = Path("decoder_with_past") / OV_DECODER_WITH_PAST_NAME + decoder_with_past_file_name = ( + Path("decoder_with_past") / OV_DECODER_WITH_PAST_NAME + ) output_names = [ encoder_file_name, @@ -326,7 +357,9 @@ def convert_seq2seq(args): Path(args.output_dir) / PYTORCH_DIR / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, compression=compress_mode) + / PYTORCH_COMPRESS_WEIGHTS_DIR.format( + precision=args.precision, compression=compress_mode + ) ) try: export_models( @@ -334,12 +367,16 @@ def convert_seq2seq(args): opset=export_config.DEFAULT_ONNX_OPSET, output_dir=save_dir_path, output_names=output_names, - ov_config=OVConfig(dtype="fp16") if args.precision == "FP16" else None, - stateful=False + ov_config=( + OVConfig(dtype="fp16") if args.precision == "FP16" else None + ), + stateful=False, ) save_tokenizer(tok, save_dir_path) except Exception as ex: - log.warning(f"PT weights compression failed with {ex}, please use OpenVINO backend instead") + log.warning( + f"PT weights compression failed with {ex}, please use OpenVINO backend instead" + ) del pt_model gc.collect() @@ -349,9 +386,15 @@ def convert_seq2seq(args): return ov_compression = is_ov_compression(args) - ov_encoder = is_ov_model_provided(args.model_id, args.output_dir, args.precision, "openvino_encoder_model.xml") - ov_decoder = is_ov_model_provided(args.model_id, args.output_dir, args.precision, "openvino_decoder_model.xml") - compress_only = ov_compression and not args.force_convert and ov_encoder and ov_decoder + ov_encoder = is_ov_model_provided( + args.model_id, args.output_dir, args.precision, "openvino_encoder_model.xml" + ) + ov_decoder = is_ov_model_provided( + args.model_id, args.output_dir, args.precision, "openvino_decoder_model.xml" + ) + compress_only = ( + ov_compression and not args.force_convert and ov_encoder and ov_decoder + ) if not compress_only: start = time.perf_counter() model = OVModelForSeq2SeqLM.from_pretrained( @@ -360,7 +403,7 @@ def convert_seq2seq(args): compile=False, trust_remote_code=True, config=AutoConfig.from_pretrained(args.model_id, trust_remote_code=True), - load_in_8bit=False + load_in_8bit=False, ) if is_fp16(args): model.half() @@ -384,7 +427,9 @@ def convert_seq2seq(args): ) for compress_option in args.compress_weights: log.info(f"Compress model weights to {compress_option}") - optimized_dir = get_compressed_path(args.output_dir, args.precision, compress_option) + optimized_dir = get_compressed_path( + args.output_dir, args.precision, compress_option + ) fp_enc_path = get_fp_path(args, "openvino_encoder_model.xml") enc_model = Core().read_model(fp_enc_path) compress_ov_model_weights_helper( @@ -448,15 +493,21 @@ def _get_submodels_for_export_stable_diffusion( pipeline.unet.config.text_encoder_projection_dim = projection_dim # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 - pipeline.unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + pipeline.unet.config.requires_aesthetics_score = 
getattr( + pipeline.config, "requires_aesthetics_score", False + ) models_for_export["unet"] = pipeline.unet # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 vae_encoder = copy.deepcopy(pipeline.vae) if isinstance(vae_encoder, AutoencoderTiny): - vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latents"]} + vae_encoder.forward = lambda sample: { + "latent_sample": vae_encoder.encode(x=sample)["latents"] + } else: - vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample()} + vae_encoder.forward = lambda sample: { + "latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample() + } models_for_export["vae_encoder"] = vae_encoder # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 @@ -508,7 +559,10 @@ def get_stable_diffusion_models_for_export( text_encoder_export_config = text_encoder_config_constructor( pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype ) - models_for_export["text_encoder"] = (models_for_export["text_encoder"], text_encoder_export_config) + models_for_export["text_encoder"] = ( + models_for_export["text_encoder"], + text_encoder_export_config, + ) # U-NET export_config_constructor = TasksManager.get_exporter_config_constructor( @@ -518,7 +572,9 @@ def get_stable_diffusion_models_for_export( model_type="unet", library_name="diffusers", ) - unet_export_config = export_config_constructor(pipeline.unet.config, int_dtype=int_dtype, float_dtype=float_dtype) + unet_export_config = export_config_constructor( + pipeline.unet.config, int_dtype=int_dtype, float_dtype=float_dtype + ) models_for_export["unet"] = (models_for_export["unet"], unet_export_config) # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 @@ -530,7 +586,9 @@ def get_stable_diffusion_models_for_export( model_type="vae-encoder", library_name="diffusers", ) - vae_export_config = vae_config_constructor(vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype) + vae_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) models_for_export["vae_encoder"] = (vae_encoder, vae_export_config) # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 @@ -542,7 +600,9 @@ def get_stable_diffusion_models_for_export( model_type="vae-decoder", library_name="diffusers", ) - vae_export_config = vae_config_constructor(vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype) + vae_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) models_for_export["vae_decoder"] = (vae_decoder, vae_export_config) if "text_encoder_2" in models_for_export: @@ -556,20 +616,30 @@ def get_stable_diffusion_models_for_export( export_config = export_config_constructor( pipeline.text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype ) - models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], export_config) + models_for_export["text_encoder_2"] = ( + models_for_export["text_encoder_2"], + export_config, + ) return models_for_export -def convert_sd_prepared_for_export_common(pipeline, models_and_export_configs, output_dir, args): +def convert_sd_prepared_for_export_common( + pipeline, models_and_export_configs, output_dir, args +): for model_name in models_and_export_configs: subcomponent = 
models_and_export_configs[model_name][0] if hasattr(subcomponent, "save_config"): subcomponent.save_config(output_dir / model_name) - elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): + elif hasattr(subcomponent, "config") and hasattr( + subcomponent.config, "save_pretrained" + ): subcomponent.config.save_pretrained(output_dir / model_name) - files_subpaths = [Path(name_dir) / OV_XML_FILE_NAME for name_dir in models_and_export_configs] + files_subpaths = [ + Path(name_dir) / OV_XML_FILE_NAME + for name_dir in models_and_export_configs + ] # Saving the additional components needed to perform inference. pipeline.scheduler.save_pretrained(output_dir.joinpath("scheduler")) @@ -593,13 +663,15 @@ def convert_sd_prepared_for_export_common(pipeline, models_and_export_configs, o output_dir=output_dir, output_names=files_subpaths, ov_config=OVConfig(dtype="fp16") if args.precision == "FP16" else None, - stateful=False + stateful=False, ) def convert_sd_common(pipeline, output_dir, args): models_and_export_configs = get_stable_diffusion_models_for_export(pipeline) - convert_sd_prepared_for_export_common(pipeline, models_and_export_configs, output_dir, args) + convert_sd_prepared_for_export_common( + pipeline, models_and_export_configs, output_dir, args + ) def convert_sd(args): @@ -610,7 +682,9 @@ def convert_sd(args): output_dir = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) - convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output_dir, args) + convert_sd_prepared_for_export_common( + pt_model, models_and_export_configs, output_dir, args + ) if pt_compress_weights: compression_modes = [] @@ -621,23 +695,33 @@ def convert_sd(args): for idx, compress_mode in enumerate(compression_modes): if idx > 0: pt_model = StableDiffusionPipeline.from_pretrained(args.model_id) - models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) + models_and_export_configs = get_stable_diffusion_models_for_export( + pt_model + ) target_models_and_export_configs = { - k: models_and_export_configs[k] for k in ("text_encoder", "unet", "vae_decoder") + k: models_and_export_configs[k] + for k in ("text_encoder", "unet", "vae_decoder") } compression_options = COMPRESSION_OPTIONS[compress_mode] models_and_export_configs.update( - compress_torchmodels(target_models_and_export_configs, compression_options=compression_options) + compress_torchmodels( + target_models_and_export_configs, + compression_options=compression_options, + ) ) output = ( Path(args.output_dir) / PYTORCH_DIR / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, compression=compress_mode) + / PYTORCH_COMPRESS_WEIGHTS_DIR.format( + precision=args.precision, compression=compress_mode + ) + ) + convert_sd_prepared_for_export_common( + pt_model, models_and_export_configs, output, args ) - convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output, args) del pt_model gc.collect() @@ -649,7 +733,9 @@ def convert_sd(args): ) continue model = OVStableDiffusionPipeline.from_pretrained(output_dir, compile=False) - ov_int8_dir = get_compressed_path(args.output_dir, args.precision, weigths_compression_option) + ov_int8_dir = get_compressed_path( + args.output_dir, args.precision, weigths_compression_option + ) model.text_encoder.model = nncf.compress_weights(model.text_encoder.model) model.unet.model = nncf.compress_weights(model.unet.model) 
model.vae_decoder.model = nncf.compress_weights(model.vae_decoder.model) @@ -667,7 +753,9 @@ def convert_lcm(args): output_dir = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) - convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output_dir, args) + convert_sd_prepared_for_export_common( + pt_model, models_and_export_configs, output_dir, args + ) if pt_compress_weights: compression_modes = [] @@ -678,23 +766,33 @@ def convert_lcm(args): for idx, compress_mode in enumerate(compression_modes): if idx > 0: pt_model = StableDiffusionPipeline.from_pretrained(args.model_id) - models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) + models_and_export_configs = get_stable_diffusion_models_for_export( + pt_model + ) target_models_and_export_configs = { - k: models_and_export_configs[k] for k in ("text_encoder", "unet", "vae_decoder") + k: models_and_export_configs[k] + for k in ("text_encoder", "unet", "vae_decoder") } compression_options = COMPRESSION_OPTIONS[compress_mode] models_and_export_configs.update( - compress_torchmodels(target_models_and_export_configs, compression_options=compression_options) + compress_torchmodels( + target_models_and_export_configs, + compression_options=compression_options, + ) ) output = ( Path(args.output_dir) / PYTORCH_DIR / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, compression=compress_mode) + / PYTORCH_COMPRESS_WEIGHTS_DIR.format( + precision=args.precision, compression=compress_mode + ) + ) + convert_sd_prepared_for_export_common( + pt_model, models_and_export_configs, output, args ) - convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output, args) del pt_model gc.collect() @@ -705,8 +803,12 @@ def convert_lcm(args): f"Weights compression {weigths_compression_option} is not supported for LCM, will be ignored" ) continue - model = OVLatentConsistencyModelPipeline.from_pretrained(output_dir, compile=False) - ov_int8_dir = get_compressed_path(args.output_dir, args.precision, weigths_compression_option) + model = OVLatentConsistencyModelPipeline.from_pretrained( + output_dir, compile=False + ) + ov_int8_dir = get_compressed_path( + args.output_dir, args.precision, weigths_compression_option + ) model.text_encoder.model = nncf.compress_weights(model.text_encoder.model) model.unet.model = nncf.compress_weights(model.unet.model) model.vae_decoder.model = nncf.compress_weights(model.vae_decoder.model) @@ -728,13 +830,17 @@ def build_pt_model(model_id): pt_model.load_lora_weights(additional_model) pt_model.fuse_lora() if "lcm" in additional_model: - pt_model.scheduler = LCMScheduler.from_config(pt_model.scheduler.config) + pt_model.scheduler = LCMScheduler.from_config( + pt_model.scheduler.config + ) continue if "lcm" in additional_model and "lora" not in additional_model: unet = UNet2DConditionModel.from_pretrained(additional_model) pt_model.unet = unet - pt_model.scheduler = LCMScheduler.from_config(pt_model.scheduler.config) + pt_model.scheduler = LCMScheduler.from_config( + pt_model.scheduler.config + ) continue if "tae" in additional_model: @@ -754,7 +860,9 @@ def build_pt_model(model_id): fp_out_dir = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) - convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, fp_out_dir, args) + convert_sd_prepared_for_export_common( + 
pt_model, models_and_export_configs, fp_out_dir, args + ) if pt_compress_weights: compression_modes = [] @@ -765,7 +873,9 @@ def build_pt_model(model_id): for idx, compress_mode in enumerate(compression_modes): if idx > 0: pt_model = build_pt_model(args.model_id) - models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) + models_and_export_configs = get_stable_diffusion_models_for_export( + pt_model + ) compression_options = COMPRESSION_OPTIONS[compress_mode] models_and_export_configs = compress_torchmodels( @@ -776,10 +886,14 @@ def build_pt_model(model_id): Path(args.output_dir) / PYTORCH_DIR / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, compression=compress_mode) + / PYTORCH_COMPRESS_WEIGHTS_DIR.format( + precision=args.precision, compression=compress_mode + ) ) - convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output, args) + convert_sd_prepared_for_export_common( + pt_model, models_and_export_configs, output, args + ) del pt_model gc.collect() @@ -791,16 +905,28 @@ def build_pt_model(model_id): f"Weights compression {weigths_compression_option} is not supported for SDXL, will be ignored" ) continue - ov_int8_dir = get_compressed_path(args.output_dir, args.precision, weigths_compression_option) + ov_int8_dir = get_compressed_path( + args.output_dir, args.precision, weigths_compression_option + ) compression_options = COMPRESSION_OPTIONS[weigths_compression_option] - model = OVStableDiffusionXLPipeline.from_pretrained(fp_out_dir, compile=False) - model.text_encoder.model = nncf.compress_weights(model.text_encoder.model, **compression_options) + model = OVStableDiffusionXLPipeline.from_pretrained( + fp_out_dir, compile=False + ) + model.text_encoder.model = nncf.compress_weights( + model.text_encoder.model, **compression_options + ) if getattr(model, "text_encoder_2", None) is not None: - model.text_encoder_2.model = nncf.compress_weights(model.text_encoder_2.model, **compression_options) + model.text_encoder_2.model = nncf.compress_weights( + model.text_encoder_2.model, **compression_options + ) model.unet.model = nncf.compress_weights(model.unet.model) - model.vae_decoder.model = nncf.compress_weights(model.vae_decoder.model, **compression_options) + model.vae_decoder.model = nncf.compress_weights( + model.vae_decoder.model, **compression_options + ) if getattr(model, "vae_encoder", None) is not None: - model.vae_encoder.model = nncf.compress_weights(model.vae_encoder.model, **compression_options) + model.vae_encoder.model = nncf.compress_weights( + model.vae_encoder.model, **compression_options + ) model.save_pretrained(ov_int8_dir) del model @@ -854,9 +980,13 @@ def forward(self, latents): compression_options = COMPRESSION_OPTIONS[compress_mode] compressed_unet = nncf.compress_weights( - pipeline.unet, dataset=nncf.Dataset([unet_example_input]), **compression_options + pipeline.unet, + dataset=nncf.Dataset([unet_example_input]), + **compression_options, + ) + ov_compressed_unet = convert_model( + compressed_unet, example_input=unet_example_input ) - ov_compressed_unet = convert_model(compressed_unet, example_input=unet_example_input) ov_compressed_unet.inputs[1].get_node().set_element_type(OVType.i32) ov_compressed_unet.inputs[1].get_node().set_partial_shape(PartialShape([])) ov_compressed_unet.validate_nodes_and_infer_types() @@ -864,7 +994,9 @@ def forward(self, latents): Path(args.output_dir) / PYTORCH_DIR / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, 
compression=compress_mode) + / PYTORCH_COMPRESS_WEIGHTS_DIR.format( + precision=args.precision, compression=compress_mode + ) ) save_model( ov_compressed_unet, @@ -874,10 +1006,18 @@ def forward(self, latents): pipeline.scheduler.save_config(pt_out_dir) decoder_example_input = torch.zeros(1, 3, 128, 128) compressed_decoder = nncf.compress_weights( - decoder, dataset=nncf.Dataset([decoder_example_input]), **compression_options + decoder, + dataset=nncf.Dataset([decoder_example_input]), + **compression_options, + ) + ov_compressed_decoder = convert_model( + compressed_decoder, example_input=decoder_example_input + ) + save_model( + ov_compressed_decoder, + pt_out_dir / "vqvae.xml", + compress_to_fp16=compress_to_fp16, ) - ov_compressed_decoder = convert_model(compressed_decoder, example_input=decoder_example_input) - save_model(ov_compressed_decoder, pt_out_dir / "vqvae.xml", compress_to_fp16=compress_to_fp16) if is_ov_compression(args): for weigths_compression_option in args.compress_weights: @@ -886,7 +1026,9 @@ def forward(self, latents): f"Weights compression {weigths_compression_option} is not supported for LDM, will be ignored" ) continue - ov_int8_dir = get_compressed_path(args.output_dir, args.precision, weigths_compression_option) + ov_int8_dir = get_compressed_path( + args.output_dir, args.precision, weigths_compression_option + ) ov_unet = Core().read_model(save_dir / "unet.xml") compressed_ov_unet = nncf.compress_weights(ov_unet) save_model( @@ -919,9 +1061,13 @@ def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): dynamic_shapes = {"input_ids": {1: "seq_len"}, "attention_mask": {1: "seq_len"}} for idx in range(len(outs.past_key_values)): - inputs.extend([f"past_key_values.{idx}.key", f"past_key_values.{idx}.value"]) + inputs.extend( + [f"past_key_values.{idx}.key", f"past_key_values.{idx}.value"] + ) dynamic_shapes[inputs[-1]] = {2: "past_sequence + sequence"} - dynamic_shapes[inputs[-2]] = {3 if not old else 2: "past_sequence + sequence"} + dynamic_shapes[inputs[-2]] = { + 3 if not old else 2: "past_sequence + sequence" + } outputs.extend([f"present.{idx}.key", f"present.{idx}.value"]) inputs.append("attention_mask") @@ -951,7 +1097,9 @@ def ts_patched_forward( ov_model = convert_model(pt_model, example_input=dummy_inputs) pt_model.forward = orig_forward - for inp_name, m_input, input_data in zip(inputs, ov_model.inputs, flattenize_inputs(dummy_inputs.values())): + for inp_name, m_input, input_data in zip( + inputs, ov_model.inputs, flattenize_inputs(dummy_inputs.values()) + ): input_node = m_input.get_node() if input_node.element_type == OVType.dynamic: m_input.get_node().set_element_type(OVType.f32) @@ -965,7 +1113,9 @@ def ts_patched_forward( for out, out_name in zip(ov_model.outputs, outputs): out.get_tensor().set_names({out_name}) - save_ov_model_helper(ov_model, out_path, fp16=compress_to_fp16, tok=tok, config=pt_model.config) + save_ov_model_helper( + ov_model, out_path, fp16=compress_to_fp16, tok=tok, config=pt_model.config + ) remote_code = False pt_model = None @@ -984,7 +1134,9 @@ def ts_patched_forward( and is_ov_model_provided(args.model_id, args.output_dir, args.precision) ) gptq_applied = is_gptq(config) - precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + precision = ( + precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + ) if post_init is not None: model_kwargs = {"torch_dtype": torch.float32} pt_model = None @@ -1004,7 +1156,9 @@ def create_model(model_id, config, model_kwargs): 
pt_model = create_model(args.model_id, config, model_kwargs) if not remote_code: - return convert_optimum_causallm_base(pt_model, args, config, compression_only) + return convert_optimum_causallm_base( + pt_model, args, config, compression_only + ) if args.save_orig: pt_out_dir = Path(args.output_dir) / PYTORCH_DIR @@ -1020,7 +1174,9 @@ def create_model(model_id, config, model_kwargs): for cw in args.compress_weights: if is_int8_compression(cw): compression_modes.append(cw) - assert compression_modes, "Only INT8 compression supported for PyTorch backend" + assert ( + compression_modes + ), "Only INT8 compression supported for PyTorch backend" dummy_inputs = { "input_ids": torch.ones((1, 10), dtype=torch.long), @@ -1033,20 +1189,26 @@ def create_model(model_id, config, model_kwargs): compression_options = COMPRESSION_OPTIONS[compress_mode] compressed_pt_model = nncf.compress_weights( - pt_model, dataset=nncf.Dataset([dummy_inputs]), **compression_options + pt_model, + dataset=nncf.Dataset([dummy_inputs]), + **compression_options, ) pt_path = ( Path(args.output_dir) / PYTORCH_DIR / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=precision, compression=compress_mode) + / PYTORCH_COMPRESS_WEIGHTS_DIR.format( + precision=precision, compression=compress_mode + ) ) convert_to_ov(compressed_pt_model, tok, pt_path, compress_to_fp16) if is_ov_compression(args): if not remote_code: - return convert_optimum_causallm_base(pt_model, args, config, compression_only) + return convert_optimum_causallm_base( + pt_model, args, config, compression_only + ) ov_path = get_fp_path(args, "openvino_model.xml") if compression_only: log.info( @@ -1056,7 +1218,9 @@ def create_model(model_id, config, model_kwargs): ov_model = Core().read_model(ov_path) for compress_option in args.compress_weights: log.info(f"Compress model weights to {compress_option}") - ov_compressed_path = get_compressed_path(args.output_dir, args.precision, compress_option) + ov_compressed_path = get_compressed_path( + args.output_dir, args.precision, compress_option + ) compress_ov_model_weights_helper( ov_model, tok, @@ -1090,8 +1254,12 @@ def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): for i in range(1, len(ov_model.outputs), 2): idx = (i - 1) // 2 ov_model.outputs[i].get_tensor().set_names({f"present.{int(idx)}.key"}) - ov_model.outputs[i + 1].get_tensor().set_names({f"present.{int(idx)}.value"}) - save_ov_model_helper(ov_model, out_path, fp16=compress_to_fp16, tok=tok, config=pt_model.config) + ov_model.outputs[i + 1].get_tensor().set_names( + {f"present.{int(idx)}.value"} + ) + save_ov_model_helper( + ov_model, out_path, fp16=compress_to_fp16, tok=tok, config=pt_model.config + ) config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) cuda, post_init = patch_gptq(config) @@ -1105,7 +1273,9 @@ def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): ) compress_to_fp16 = is_fp16(args) gptq_applied = is_gptq(config) - precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + precision = ( + precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + ) tokenizer_id = args.tokenizer_id or args.model_id tok = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True) ov_out_path = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / precision @@ -1114,7 +1284,9 @@ def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): if not compression_only: def create_model(model_id, config, model_kwargs): - pt_model = 
AutoModel.from_pretrained(model_id, trust_remote_code=True, config=config, **model_kwargs) + pt_model = AutoModel.from_pretrained( + model_id, trust_remote_code=True, config=config, **model_kwargs + ) pt_model.config.use_cache = True pt_model.to(torch.float32) pt_model.eval() @@ -1134,7 +1306,9 @@ def create_model(model_id, config, model_kwargs): for cw in args.compress_weights: if is_int8_compression(cw): compression_modes.append(cw) - assert compression_modes, "Only INT8 compression supported for PyTorch backend" + assert ( + compression_modes + ), "Only INT8 compression supported for PyTorch backend" dummy_input = make_dummy_input() for idx, compress_mode in enumerate(compression_modes): @@ -1150,7 +1324,9 @@ def create_model(model_id, config, model_kwargs): Path(args.output_dir) / PYTORCH_DIR / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=precision, compression=compress_mode) + / PYTORCH_COMPRESS_WEIGHTS_DIR.format( + precision=precision, compression=compress_mode + ) ) convert_to_ov(compressed_pt_model, tok, pt_out_path) @@ -1164,7 +1340,9 @@ def create_model(model_id, config, model_kwargs): ov_model = Core().read_model(ov_model_path) for compress_option in args.compress_weights: log.info(f"Compress model weights to {compress_option}") - ov_compressed_path = get_compressed_path(args.output_dir, args.precision, args.compress_weights) + ov_compressed_path = get_compressed_path( + args.output_dir, args.precision, args.compress_weights + ) compress_ov_model_weights_helper( ov_model, tok, @@ -1194,7 +1372,9 @@ def convert_falcon(args): model_kwargs = {"torch_dtype": torch.float32} pt_model = None gptq_applied = is_gptq(config) - precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + precision = ( + precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + ) if not compression_only: pt_model = AutoModelForCausalLM.from_pretrained( args.model_id, @@ -1231,7 +1411,9 @@ def convert_phi(args): model_kwargs["torch_dtype"] = torch.float32 pt_model = None gptq_applied = is_gptq(config) - precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + precision = ( + precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + ) if not compression_only: pt_model = AutoModelForCausalLM.from_pretrained( args.model_id, @@ -1261,7 +1443,9 @@ def convert_baichaun(args): model_kwargs = {"torch_dtype": torch.float32} model = None if not compression_only: - model = AutoModelForCausalLM.from_pretrained(args.model_id, trust_remote_code=True, **model_kwargs) + model = AutoModelForCausalLM.from_pretrained( + args.model_id, trust_remote_code=True, **model_kwargs + ) try: model.to(torch.float32) if post_init is None: @@ -1294,7 +1478,9 @@ def convert_qwen(args): } model = None if not compression_only: - model = AutoModelForCausalLM.from_pretrained(args.model_id, trust_remote_code=True, **model_kwargs) + model = AutoModelForCausalLM.from_pretrained( + args.model_id, trust_remote_code=True, **model_kwargs + ) try: model.to(torch.float32) except Exception: @@ -1384,16 +1570,24 @@ def get_convert_model_type(model_id): def main(): - log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout) + log.basicConfig( + format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout + ) parser = ArgumentParser() - parser.add_argument("-m", "--model_id", required=True, help="model_id or directory for loading") + parser.add_argument( + "-m", "--model_id", required=True, 
help="model_id or directory for loading" + ) parser.add_argument( "--tokenizer_id", required=False, help="tokenizer id or directory for loading. If not provided, model_id will be used by default", ) - parser.add_argument("-o", "--output_dir", required=True, help="output directory for saving model") - parser.add_argument("--save_orig", action="store_true", help="save pytorch model on disk") + parser.add_argument( + "-o", "--output_dir", required=True, help="output directory for saving model" + ) + parser.add_argument( + "--save_orig", action="store_true", help="save pytorch model on disk" + ) parser.add_argument( "-p", "--precision", @@ -1401,14 +1595,25 @@ def main(): default="FP32", help="base conversion precision", ) - parser.add_argument("--force_convert", action="store_true", help="Force model conversion") + parser.add_argument( + "--force_convert", action="store_true", help="Force model conversion" + ) compression_group = parser.add_argument_group("Weights compression parameters") compression_group.add_argument( "-c", "--compress_weights", type=str, - choices=["INT8", "INT8_ASYM", "INT8_SYM", "4BIT_DEFAULT", "4BIT_MAXIMUM", "INT4_SYM", "INT4_ASYM", "E2M1"], + choices=[ + "INT8", + "INT8_ASYM", + "INT8_SYM", + "4BIT_DEFAULT", + "4BIT_MAXIMUM", + "INT4_SYM", + "INT4_ASYM", + "E2M1", + ], nargs="+", help=( "The weight compression option, e.g. INT8 - INT8 weights (deprecated, please use INT8_ASYM instead), " diff --git a/llm_bench/python/llm_bench_utils/config_class.py b/llm_bench/python/llm_bench_utils/config_class.py index bba4d9a640..2635f1aea2 100644 --- a/llm_bench/python/llm_bench_utils/config_class.py +++ b/llm_bench/python/llm_bench_utils/config_class.py @@ -2,118 +2,135 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from transformers import AutoTokenizer -from transformers import AutoModelForCausalLM, T5ForConditionalGeneration, BlenderbotForConditionalGeneration, AutoModel +from transformers import ( + AutoModelForCausalLM, + T5ForConditionalGeneration, + BlenderbotForConditionalGeneration, + AutoModel, +) from diffusers.pipelines import DiffusionPipeline, LDMSuperResolutionPipeline from optimum.intel.openvino import ( OVModelForCausalLM, OVModelForSeq2SeqLM, OVStableDiffusionPipeline, OVLatentConsistencyModelPipeline, - OVStableDiffusionXLPipeline + OVStableDiffusionXLPipeline, +) +from llm_bench_utils.ov_model_classes import ( + OVMPTModel, + OVLDMSuperResolutionPipeline, + OVChatGLMModel, ) -from llm_bench_utils.ov_model_classes import OVMPTModel, OVLDMSuperResolutionPipeline, OVChatGLMModel TOKENIZE_CLASSES_MAPPING = { - 'decoder': AutoTokenizer, - 'mpt': AutoTokenizer, - 't5': AutoTokenizer, - 'blenderbot': AutoTokenizer, - 'falcon': AutoTokenizer, + "decoder": AutoTokenizer, + "mpt": AutoTokenizer, + "t5": AutoTokenizer, + "blenderbot": AutoTokenizer, + "falcon": AutoTokenizer, } OV_MODEL_CLASSES_MAPPING = { - 'decoder': OVModelForCausalLM, - 't5': OVModelForSeq2SeqLM, - 'blenderbot': OVModelForSeq2SeqLM, - 'falcon': OVModelForCausalLM, - 'mpt': OVMPTModel, - 'stable-diffusion-xl': OVStableDiffusionXLPipeline, - 'sdxl': OVStableDiffusionXLPipeline, - 'lcm-sdxl': OVStableDiffusionXLPipeline, - 'ssd-': OVStableDiffusionXLPipeline, - 'lcm-ssd-': OVStableDiffusionXLPipeline, - 'stable_diffusion': OVStableDiffusionPipeline, - 'lcm': OVLatentConsistencyModelPipeline, - 'replit': OVMPTModel, - 'codet5': OVModelForSeq2SeqLM, - 'codegen2': OVModelForCausalLM, - 'ldm_super_resolution': OVLDMSuperResolutionPipeline, - 'chatglm2': 
OVModelForCausalLM, - 'chatglm3': OVModelForCausalLM, - 'chatglm': OVChatGLMModel, + "decoder": OVModelForCausalLM, + "t5": OVModelForSeq2SeqLM, + "blenderbot": OVModelForSeq2SeqLM, + "falcon": OVModelForCausalLM, + "mpt": OVMPTModel, + "stable-diffusion-xl": OVStableDiffusionXLPipeline, + "sdxl": OVStableDiffusionXLPipeline, + "lcm-sdxl": OVStableDiffusionXLPipeline, + "ssd-": OVStableDiffusionXLPipeline, + "lcm-ssd-": OVStableDiffusionXLPipeline, + "stable_diffusion": OVStableDiffusionPipeline, + "lcm": OVLatentConsistencyModelPipeline, + "replit": OVMPTModel, + "codet5": OVModelForSeq2SeqLM, + "codegen2": OVModelForCausalLM, + "ldm_super_resolution": OVLDMSuperResolutionPipeline, + "chatglm2": OVModelForCausalLM, + "chatglm3": OVModelForCausalLM, + "chatglm": OVChatGLMModel, } PT_MODEL_CLASSES_MAPPING = { - 'decoder': AutoModelForCausalLM, - 't5': T5ForConditionalGeneration, - 'blenderbot': BlenderbotForConditionalGeneration, - 'mpt': AutoModelForCausalLM, - 'falcon': AutoModelForCausalLM, - 'stable_diffusion': DiffusionPipeline, - 'ldm_super_resolution': LDMSuperResolutionPipeline, - 'chatglm': AutoModel, + "decoder": AutoModelForCausalLM, + "t5": T5ForConditionalGeneration, + "blenderbot": BlenderbotForConditionalGeneration, + "mpt": AutoModelForCausalLM, + "falcon": AutoModelForCausalLM, + "stable_diffusion": DiffusionPipeline, + "ldm_super_resolution": LDMSuperResolutionPipeline, + "chatglm": AutoModel, } USE_CASES = { - 'image_gen': ['stable-diffusion-', 'ssd-', 'deepfloyd-if', 'tiny-sd', 'small-sd', 'lcm-', 'sdxl'], - 'text2speech': ['whisper'], - 'image_cls': ['vit'], - 'code_gen': ['replit', 'codegen2', 'codegen', 'codet5', "stable-code"], - 'text_gen': [ - 'decoder', - 't5', - 'falcon', + "image_gen": [ + "stable-diffusion-", + "ssd-", + "deepfloyd-if", + "tiny-sd", + "small-sd", + "lcm-", + "sdxl", + ], + "text2speech": ["whisper"], + "image_cls": ["vit"], + "code_gen": ["replit", "codegen2", "codegen", "codet5", "stable-code"], + "text_gen": [ + "decoder", + "t5", + "falcon", "glm", - 'gpt-', - 'gpt2', - 'aquila', - 'mpt', - 'open-llama', - 'openchat', - 'neural-chat', - 'llama', - 'tiny-llama', - 'tinyllama', - 'opt-', - 'pythia-', - 'stablelm-', - 'stable-zephyr-', - 'rocket-', - 'blenderbot', - 'vicuna', - 'dolly', - 'bloom', - 'red-pajama', - 'chatglm', - 'xgen', - 'longchat', - 'jais', - 'orca-mini', - 'baichuan', - 'qwen', - 'zephyr', - 'mistral', - 'mixtral', - 'yi-', - 'phi-', - 'phi2-', - 'minicpm', - 'gemma', + "gpt-", + "gpt2", + "aquila", + "mpt", + "open-llama", + "openchat", + "neural-chat", + "llama", + "tiny-llama", + "tinyllama", + "opt-", + "pythia-", + "stablelm-", + "stable-zephyr-", + "rocket-", + "blenderbot", + "vicuna", + "dolly", + "bloom", + "red-pajama", + "chatglm", + "xgen", + "longchat", + "jais", + "orca-mini", + "baichuan", + "qwen", + "zephyr", + "mistral", + "mixtral", + "yi-", + "phi-", + "phi2-", + "minicpm", + "gemma", "deci", "internlm", "olmo", "phi3", "starcoder", - "instruct-gpt" + "instruct-gpt", ], - 'ldm_super_resolution': ['ldm-super-resolution'], + "ldm_super_resolution": ["ldm-super-resolution"], } DEFAULT_MODEL_CLASSES = { - 'text_gen': 'decoder', - 'image_gen': 'stable_diffusion', - 'image_cls': 'vit', - 'speech2text': 'whisper', - 'code_gen': 'decoder', - 'ldm_super_resolution': 'ldm_super_resolution', + "text_gen": "decoder", + "image_gen": "stable_diffusion", + "image_cls": "vit", + "speech2text": "whisper", + "code_gen": "decoder", + "ldm_super_resolution": "ldm_super_resolution", } diff --git 
a/llm_bench/python/llm_bench_utils/conversion_utils/better_transformer_patch.py b/llm_bench/python/llm_bench_utils/conversion_utils/better_transformer_patch.py index 4def9cfa0a..fde9060b0d 100644 --- a/llm_bench/python/llm_bench_utils/conversion_utils/better_transformer_patch.py +++ b/llm_bench/python/llm_bench_utils/conversion_utils/better_transformer_patch.py @@ -865,7 +865,9 @@ def forward( return outputs -def gptj_apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor: +def gptj_apply_rotary_pos_emb( + tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor +) -> torch.Tensor: sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3) cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3) return (tensor * cos) + (rotate_every_two(tensor) * sin) @@ -930,7 +932,9 @@ def gptj_forward( # compute self-attention: V x Softmax(QK^T) attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) + attn_output = self._merge_heads( + attn_output, self.num_attention_heads, self.head_dim + ) attn_output = self.out_proj(attn_output) attn_output = self.resid_dropout(attn_output) @@ -969,8 +973,14 @@ def gptj_wrapped_scaled_dot_product( query = query.to(value.dtype) key = key.to(value.dtype) - if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1: - raise ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.") + if ( + batch_size == 1 + and attention_mask is not None + and attention_mask[0, 0, -1, -1] < -1 + ): + raise ValueError( + "BetterTransformer does not support padding='max_length' with a batch size of 1." + ) dropout_p = self.dropout_prob_attn if self.training else 0.0 if batch_size == 1 or self.training: @@ -988,7 +998,9 @@ def gptj_wrapped_scaled_dot_product( # causal_mask is always [True, ..., True] otherwise, so executing this # is unnecessary if query_length > 1: - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + causal_mask = self.bias[ + :, :, key_length - query_length : key_length, :key_length + ].to(torch.bool) causal_mask = torch.where(causal_mask, 0, mask_value) @@ -1000,7 +1012,12 @@ def gptj_wrapped_scaled_dot_product( causal_mask = causal_mask.expand(batch_size, -1, -1, -1) sdpa_result = torch.nn.functional.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False + query, + key, + value, + attn_mask=attention_mask, + dropout_p=dropout_p, + is_causal=False, ) # in gpt-neo-x and gpt-j the query and keys are always in fp32 @@ -1091,7 +1108,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): def forward(self, *args, **kwargs): return bt_aquila_forward(self, *args, **kwargs) - class GPTJAttentionLayerBetterTransformer(BetterTransformerBaseLayer, GPTJAttention, nn.Module): + class GPTJAttentionLayerBetterTransformer( + BetterTransformerBaseLayer, GPTJAttention, nn.Module + ): _attn = gptj_wrapped_scaled_dot_product def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): @@ -1118,7 +1137,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): setattr(self, attr, getattr(layer, attr)) self.module_mapping = None - self.original_layers_mapping = {submodule: submodule for submodule in submodules} + self.original_layers_mapping = { + submodule: submodule for submodule in submodules + } self.downcast_qk = True 
self.dropout_prob_attn = config.attn_pdrop diff --git a/llm_bench/python/llm_bench_utils/conversion_utils/export_configs.py b/llm_bench/python/llm_bench_utils/conversion_utils/export_configs.py index cf465b8f53..003eb3c672 100644 --- a/llm_bench/python/llm_bench_utils/conversion_utils/export_configs.py +++ b/llm_bench/python/llm_bench_utils/conversion_utils/export_configs.py @@ -1,7 +1,10 @@ # -*- coding: utf-8 -*- # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig +from optimum.exporters.onnx.config import ( + TextDecoderOnnxConfig, + TextDecoderWithPositionIdsOnnxConfig, +) from optimum.exporters.tasks import TasksManager from optimum.utils import ( NormalizedTextConfig, @@ -19,14 +22,20 @@ class YIDummyTextInputGenerator(DummyTextInputGenerator): "position_ids", } - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + def generate( + self, + input_name: str, + framework: str = "pt", + int_dtype: str = "int64", + float_dtype: str = "fp32", + ): input = super().generate(input_name, framework, int_dtype, float_dtype) if input_name == "position_ids": input = input[:, -1:] return input -@register_in_tasks_manager('yi', *["text-generation", "text-generation-with-past"]) +@register_in_tasks_manager("yi", *["text-generation", "text-generation-with-past"]) class YIOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): # The ONNX export of this architecture needs the Trilu operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -42,11 +51,23 @@ class YIOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("jais", *["text-generation", "text-generation-with-past"]) class JaisOpenVINOConfig(TextDecoderOnnxConfig): DEFAULT_ONNX_OPSET = 13 - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_layers='n_layer', num_attention_heads='n_head', hidden_size='n_embd') + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_layers="n_layer", num_attention_heads="n_head", hidden_size="n_embd" + ) -TasksManager._SUPPORTED_MODEL_TYPE['stablelm_epoch'] = TasksManager._SUPPORTED_MODEL_TYPE['stablelm'] -TasksManager._SUPPORTED_MODEL_TYPE['stablelm-epoch'] = TasksManager._SUPPORTED_MODEL_TYPE['stablelm'] -TasksManager._SUPPORTED_MODEL_TYPE['stablelm2'] = TasksManager._SUPPORTED_MODEL_TYPE['stablelm'] -TasksManager._SUPPORTED_MODEL_TYPE["aquila"] = TasksManager._SUPPORTED_MODEL_TYPE["stablelm"] -TasksManager._SUPPORTED_MODEL_TYPE["codegen2"] = TasksManager._SUPPORTED_MODEL_TYPE["codegen"] +TasksManager._SUPPORTED_MODEL_TYPE["stablelm_epoch"] = ( + TasksManager._SUPPORTED_MODEL_TYPE["stablelm"] +) +TasksManager._SUPPORTED_MODEL_TYPE["stablelm-epoch"] = ( + TasksManager._SUPPORTED_MODEL_TYPE["stablelm"] +) +TasksManager._SUPPORTED_MODEL_TYPE["stablelm2"] = TasksManager._SUPPORTED_MODEL_TYPE[ + "stablelm" +] +TasksManager._SUPPORTED_MODEL_TYPE["aquila"] = TasksManager._SUPPORTED_MODEL_TYPE[ + "stablelm" +] +TasksManager._SUPPORTED_MODEL_TYPE["codegen2"] = TasksManager._SUPPORTED_MODEL_TYPE[ + "codegen" +] diff --git a/llm_bench/python/llm_bench_utils/conversion_utils/helpers.py b/llm_bench/python/llm_bench_utils/conversion_utils/helpers.py index 5c6e05588e..b098fc6e78 100644 --- a/llm_bench/python/llm_bench_utils/conversion_utils/helpers.py +++ b/llm_bench/python/llm_bench_utils/conversion_utils/helpers.py @@ -16,27 +16,37 @@ import nncf from ..nncf_utils import 
COMPRESSION_OPTIONS from optimum.gptq.data import get_dataset, prepare_dataset -from optimum.intel.openvino.configuration import _check_default_4bit_configs, OVQuantizationMethod, _DEFAULT_4BIT_CONFIG +from optimum.intel.openvino.configuration import ( + _check_default_4bit_configs, + OVQuantizationMethod, + _DEFAULT_4BIT_CONFIG, +) import warnings class BackendType(Enum): - PYTORCH = 'pytorch' - OPENVINO = 'openvino' + PYTORCH = "pytorch" + OPENVINO = "openvino" -PYTORCH_DIR = 'pytorch' -PYTORCH_COMPRESS_WEIGHTS_DIR = 'compressed_weights/PT_{precision}-{compression}' -OV_DIR = 'dldt' +PYTORCH_DIR = "pytorch" +PYTORCH_COMPRESS_WEIGHTS_DIR = "compressed_weights/PT_{precision}-{compression}" +OV_DIR = "dldt" GPTQ_DIR = "GPTQ_INT4-{precision}" def is_torch_compression(args): - return args.compress_weights and BackendType.PYTORCH.value in args.compress_weights_backends + return ( + args.compress_weights + and BackendType.PYTORCH.value in args.compress_weights_backends + ) def is_ov_compression(args): - return args.compress_weights and BackendType.OPENVINO.value in args.compress_weights_backends + return ( + args.compress_weights + and BackendType.OPENVINO.value in args.compress_weights_backends + ) def is_fp16(args): @@ -47,7 +57,9 @@ def is_int8_compression(compress_weights_mode): return compress_weights_mode in ["INT8", "INT8_ASYM", "INT8_SYM"] -def is_ov_model_provided(model_id, model_dir, precision, model_name="openvino_model.xml"): +def is_ov_model_provided( + model_id, model_dir, precision, model_name="openvino_model.xml" +): model_dirs = [] if Path(model_id).is_dir(): model_dirs.append(Path(model_id)) @@ -61,7 +73,7 @@ def is_ov_model_provided(model_id, model_dir, precision, model_name="openvino_mo model_dirs.append(model_dir / PYTORCH_DIR / OV_DIR / precision) for md in model_dirs: found = True - for suffix in ['.xml', '.bin']: + for suffix in [".xml", ".bin"]: model_file = (md / model_name).with_suffix(suffix) if not model_file.exists(): found = False @@ -75,9 +87,14 @@ def get_fp_path(args, model_subpath): model_dirs = [] if Path(args.model_id).is_dir(): base_model_dir = Path(args.model_id) - model_dirs.extend([ - base_model_dir, base_model_dir / args.precision, base_model_dir / OV_DIR / args.precision, base_model_dir / PYTORCH_DIR / OV_DIR / args.precision - ]) + model_dirs.extend( + [ + base_model_dir, + base_model_dir / args.precision, + base_model_dir / OV_DIR / args.precision, + base_model_dir / PYTORCH_DIR / OV_DIR / args.precision, + ] + ) model_dir = Path(args.output_dir) model_dirs.append(model_dir) model_dirs.append(Path(model_dir) / args.precision) @@ -93,7 +110,7 @@ def save_tokenizer(tokenizer, out_dir): try: tokenizer.save_pretrained(out_dir) except Exception as e: - log.error(f'tokenizer loading failed with {e}') + log.error(f"tokenizer loading failed with {e}") def transform_fn( @@ -102,7 +119,7 @@ def transform_fn( input_ids: torch.LongTensor, attention_mask: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, - **kwargs + **kwargs, ): inputs = {"input_ids": np.array(input_ids)} @@ -159,13 +176,27 @@ def get_nncf_dataset(ov_model, tokenizer, config, dataset_name, subset_size): dataset = get_dataset(dataset_name, tokenizer, seqlen=32, nsamples=subset_size) dataset = prepare_dataset(dataset) input_shapes = get_ov_input_shapes(ov_model) - nncf_dataset = Dataset(dataset, lambda x: transform_fn(config=config, input_shapes=input_shapes, **x)) + nncf_dataset = Dataset( + dataset, lambda x: transform_fn(config=config, 
input_shapes=input_shapes, **x) + ) return nncf_dataset -def compress_ov_model_weights_helper(ov_model, tok, config, out_path, compress_weights_format="INT8", fp16=False, args={}, model_name="openvino_model"): +def compress_ov_model_weights_helper( + ov_model, + tok, + config, + out_path, + compress_weights_format="INT8", + fp16=False, + args={}, + model_name="openvino_model", +): if "INT8" in compress_weights_format and "INT8_ASYM" in COMPRESSION_OPTIONS: - warnings.warn("Usage INT8 mode is deprecated and will be removed soon. Please use INT8_ASYM instead", DeprecationWarning) + warnings.warn( + "Usage INT8 mode is deprecated and will be removed soon. Please use INT8_ASYM instead", + DeprecationWarning, + ) if "4BIT_DEFAULT" in compress_weights_format: compression_args = _check_default_4bit_configs(config.name_or_path) if compression_args is None: @@ -182,7 +213,11 @@ def compress_ov_model_weights_helper(ov_model, tok, config, out_path, compress_w compression_args.pop("bits") sym = compression_args.pop("sym", False) - compression_args["mode"] = nncf.CompressWeightsMode.INT4_SYM if sym else nncf.CompressWeightsMode.INT4_ASYM + compression_args["mode"] = ( + nncf.CompressWeightsMode.INT4_SYM + if sym + else nncf.CompressWeightsMode.INT4_ASYM + ) if compression_args.pop("quant_method", None) == OVQuantizationMethod.AWQ: compression_args["awq"] = True if "num_samples" in compression_args: @@ -191,7 +226,14 @@ def compress_ov_model_weights_helper(ov_model, tok, config, out_path, compress_w compression_args.pop("all_layers", None) else: compression_args = copy.deepcopy(COMPRESSION_OPTIONS[compress_weights_format]) - for arg_name in ["ratio", "group_size", "all_layers", "dataset", "awq", "scale_estimation"]: + for arg_name in [ + "ratio", + "group_size", + "all_layers", + "dataset", + "awq", + "scale_estimation", + ]: arg_value = getattr(args, arg_name, None) if arg_value: compression_args[arg_name] = arg_value @@ -201,16 +243,26 @@ def compress_ov_model_weights_helper(ov_model, tok, config, out_path, compress_w dataset_name = compression_args.pop("dataset", None) if dataset_name is not None and tok is not None: - nncf_dataset = get_nncf_dataset(ov_model, tok, config, dataset_name, compression_args.get("subset_size", None)) + nncf_dataset = get_nncf_dataset( + ov_model, + tok, + config, + dataset_name, + compression_args.get("subset_size", None), + ) compression_args["dataset"] = nncf_dataset compressed_ov_model = compress_weights(ov_model, **compression_args) - save_ov_model_helper(compressed_ov_model, out_path, model_name, fp16=fp16, tok=tok, config=config) + save_ov_model_helper( + compressed_ov_model, out_path, model_name, fp16=fp16, tok=tok, config=config + ) -def save_ov_model_helper(ov_model, out_path, model_name='openvino_model', fp16=False, tok=None, config=None): +def save_ov_model_helper( + ov_model, out_path, model_name="openvino_model", fp16=False, tok=None, config=None +): model_name = model_name or "openvino_model" - save_model(ov_model, Path(out_path) / f'{model_name}.xml', compress_to_fp16=fp16) + save_model(ov_model, Path(out_path) / f"{model_name}.xml", compress_to_fp16=fp16) if tok is not None: save_tokenizer(tok, out_path) if config is not None: @@ -256,7 +308,11 @@ class StoreAttr(object): model.quantize_config = StoreAttr() model.quantize_config.desc_act = self.desc_act - if self.desc_act and not self.disable_exllama and self.max_input_length is not None: + if ( + self.desc_act + and not self.disable_exllama + and self.max_input_length is not None + ): model = 
exllama_set_max_input_length(model, self.max_input_length) return model @@ -266,5 +322,6 @@ class StoreAttr(object): def unpatch_gptq(orig_cuda_check, orig_post_init_model): from optimum.gptq import GPTQQuantizer + torch.cuda.is_available = orig_cuda_check GPTQQuantizer.post_init_model = orig_post_init_model diff --git a/llm_bench/python/llm_bench_utils/hook_beam_search.py b/llm_bench/python/llm_bench_utils/hook_beam_search.py index 99b0a9e5c3..f2dbab15b5 100644 --- a/llm_bench/python/llm_bench_utils/hook_beam_search.py +++ b/llm_bench/python/llm_bench_utils/hook_beam_search.py @@ -50,7 +50,9 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput): past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None -GenerateBeamOutput = Union[GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput] +GenerateBeamOutput = Union[ + GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput +] tm_list = [] tm_infer_list = [] @@ -60,400 +62,454 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput): # Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2911 # Add the function of collecting latency def new_beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - sequential: Optional[bool] = None, - **model_kwargs, - ) -> Union[GenerateBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search decoding** and - can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin._beam_search`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. 
- eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_logits (`bool`, *optional*, defaults to `False`): - Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for - more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - sequential (`bool`, defaults to `False`): - By default, beam search has `batch_size * num_beams` as effective batch size (see `beam_search()` for - more details). This flag will avoid parallelizing the beam search and will instead run beam search - sequentially. - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... num_beams=num_beams, - ... device=model.device, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... 
) - - >>> outputs = model._beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - sequential = sequential if sequential is not None else self.generation_config.low_memory - if max_length is not None: - warnings.warn( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - if len(stopping_criteria) == 0: - warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - if eos_token_id is not None: - logger.warning_once( - "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" - " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." - " Otherwise make sure to set `model.generation_config.eos_token_id`", - FutureWarning, - ) - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - else: - # TODO remove when the method is totally private and beam scorer refactored - # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever - eos_token_id = [ - criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id") - ] - eos_token_id = eos_token_id[0] if eos_token_id else None - if eos_token_id is None and self.generation_config.eos_token_id is not None: - eos_token_id = self.generation_config.eos_token_id - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else self.generation_config.output_logits - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions + self, + input_ids: torch.LongTensor, + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + sequential: Optional[bool] = None, + **model_kwargs, +) -> Union[GenerateBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **beam search decoding** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin._beam_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). 
+ + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (`BeamScorer`): + An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and + sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + sequential (`bool`, defaults to `False`): + By default, beam search has `batch_size * num_beams` as effective batch size (see `beam_search()` for + more details). This flag will avoid parallelizing the beam search and will instead run beam search + sequentially. + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + + + Examples: + + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... BeamSearchScorer, + ... 
) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True + ... ) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... num_beams=num_beams, + ... device=model.device, + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + + >>> outputs = model._beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Wie alt bist du?'] + ```""" + # init values + logits_processor = ( + logits_processor if logits_processor is not None else LogitsProcessorList() + ) + stopping_criteria = ( + stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + ) + sequential = ( + sequential if sequential is not None else self.generation_config.low_memory + ) + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + warnings.warn( + "You don't have defined any stopping_criteria, this will likely loop forever", + UserWarning, ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate + pad_token_id = ( + pad_token_id + if pad_token_id is not None + else self.generation_config.pad_token_id + ) + if eos_token_id is not None: + logger.warning_once( + "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" + " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." 
+ " Otherwise make sure to set `model.generation_config.eos_token_id`", + FutureWarning, ) + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + else: + # TODO remove when the method is totally private and beam scorer refactored + # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever + eos_token_id = [ + criteria.eos_token_id.tolist() + for criteria in stopping_criteria + if hasattr(criteria, "eos_token_id") + ] + eos_token_id = eos_token_id[0] if eos_token_id else None + if eos_token_id is None and self.generation_config.eos_token_id is not None: + eos_token_id = self.generation_config.eos_token_id + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - if "inputs_embeds" in model_kwargs: - cur_len = model_kwargs["inputs_embeds"].shape[1] - model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." - ) + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + output_scores = ( + output_scores + if output_scores is not None + else self.generation_config.output_scores + ) + output_logits = ( + output_logits + if output_logits is not None + else self.generation_config.output_logits + ) + output_attentions = ( + output_attentions + if output_attentions is not None + else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + if "inputs_embeds" in model_kwargs: + cur_len = model_kwargs["inputs_embeds"].shape[1] + model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) + + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." 
+ ) - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - raw_logits = () if (return_dict_in_generate and output_logits) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None + beam_indices = ( + tuple(() for _ in range(batch_beam_size)) + if (return_dict_in_generate and output_scores) + else None + ) + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = ( + () if (return_dict_in_generate and output_hidden_states) else None + ) + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = ( + model_kwargs["encoder_outputs"].get("attentions") + if output_attentions + else None + ) + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") + if output_hidden_states + else None ) - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens - # of the first beam are considered to avoid sampling the exact same tokens across all beams. - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False - - decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder - - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - tic = time.perf_counter() - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # if sequential is True, split the input to batches of batch_size and run sequentially - tic_infer = time.perf_counter() - if sequential: - if any( - model_name in self.__class__.__name__.lower() - for model_name in [ - "fsmt", - "reformer", - "bloom", - "ctrl", - "gpt_bigcode", - "transo_xl", - "xlnet", - "cpm", - "jamba", - ] - ): - raise RuntimeError( - f"Currently generation for {self.__class__.__name__} is not supported " - f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature." 
- ) - - inputs_per_sub_batches = _split_model_inputs( - model_inputs, split_size=batch_size, full_batch_size=batch_beam_size - ) - outputs_per_sub_batch = [ - self( - **inputs_per_sub_batch, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - for inputs_per_sub_batch in inputs_per_sub_batches + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + beam_scores = torch.zeros( + (batch_size, num_beams), dtype=torch.float, device=input_ids.device + ) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + + this_peer_finished = False + + decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder + + while self._has_unfinished_sequences( + this_peer_finished, synced_gpus, device=input_ids.device + ): + tic = time.perf_counter() + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # if sequential is True, split the input to batches of batch_size and run sequentially + tic_infer = time.perf_counter() + if sequential: + if any( + model_name in self.__class__.__name__.lower() + for model_name in [ + "fsmt", + "reformer", + "bloom", + "ctrl", + "gpt_bigcode", + "transo_xl", + "xlnet", + "cpm", + "jamba", ] + ): + raise RuntimeError( + f"Currently generation for {self.__class__.__name__} is not supported " + f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature." + ) - outputs = stack_model_outputs(outputs_per_sub_batch) - - else: # Unchanged original behavior - outputs = self( - **model_inputs, + inputs_per_sub_batches = _split_model_inputs( + model_inputs, split_size=batch_size, full_batch_size=batch_beam_size + ) + outputs_per_sub_batch = [ + self( + **inputs_per_sub_batch, return_dict=True, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) - tm_infer_list.append(time.perf_counter() - tic_infer) - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - next_token_logits = outputs.logits[:, -1, :] - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * num_beams, vocab_size) - - next_token_scores_processed = logits_processor(input_ids, next_token_scores) - next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as( - next_token_scores_processed - ) + for inputs_per_sub_batch in inputs_per_sub_batches + ] - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores_processed,) - if output_logits: - raw_logits += (next_token_logits,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) - - # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
- n_eos_tokens = len(eos_token_id) if eos_token_id else 0 - next_token_scores, next_tokens = torch.topk( - next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True - ) + outputs = stack_model_outputs(outputs_per_sub_batch) - next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = beam_scorer.process( - input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=beam_indices, - decoder_prompt_len=decoder_prompt_len, + else: # Unchanged original behavior + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, ) + tm_infer_list.append(time.perf_counter() - tic_infer) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] + next_token_logits = outputs.logits[:, -1, :] + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) - input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + next_token_scores_processed = logits_processor(input_ids, next_token_scores) + next_token_scores = next_token_scores_processed + beam_scores[ + :, None + ].expand_as(next_token_scores_processed) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - if model_kwargs.get("past_key_values", None) is not None: - model_kwargs["past_key_values"] = self._temporary_reorder_cache( - model_kwargs["past_key_values"], beam_idx + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores_processed,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) + if self.config.is_encoder_decoder + else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) ) - if return_dict_in_generate and output_scores: - beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + + # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
+ n_eos_tokens = len(eos_token_id) if eos_token_id else 0 + next_token_scores, next_tokens = torch.topk( + next_token_scores, + max(2, 1 + n_eos_tokens) * num_beams, + dim=1, + largest=True, + sorted=True, + ) - # increase cur_len - cur_len = cur_len + 1 - tm_list.append(time.perf_counter() - tic) - if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)): - this_peer_finished = True + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") + next_tokens = next_tokens % vocab_size - sequence_outputs = beam_scorer.finalize( + # stateless + beam_outputs = beam_scorer.process( input_ids, - beam_scores, + next_token_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, beam_indices=beam_indices, decoder_prompt_len=decoder_prompt_len, ) - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - - if self.config.is_encoder_decoder: - return GenerateBeamEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateBeamDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + input_ids = torch.cat( + [input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1 + ) + + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) + if model_kwargs.get("past_key_values", None) is not None: + model_kwargs["past_key_values"] = self._temporary_reorder_cache( + model_kwargs["past_key_values"], beam_idx + ) + + if return_dict_in_generate and output_scores: + beam_indices = tuple( + ( + beam_indices[beam_idx[i]] + (beam_idx[i],) + for i in range(len(beam_indices)) ) + ) + + # increase cur_len + cur_len = cur_len + 1 + tm_list.append(time.perf_counter() - tic) + if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)): + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, + ) + + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + + if self.config.is_encoder_decoder: + return GenerateBeamEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + logits=raw_logits, + beam_indices=sequence_outputs["beam_indices"], + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + 
cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) else: - return sequence_outputs["sequences"] + return GenerateBeamDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + logits=raw_logits, + beam_indices=sequence_outputs["beam_indices"], + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return sequence_outputs["sequences"] class BeamSearchHook: @@ -485,4 +541,4 @@ def get_time_infer_list(self): def new_forward(self, model): """Define a new beam search function.""" - model._beam_search = new_beam_search.__get__(model, model.__class__) \ No newline at end of file + model._beam_search = new_beam_search.__get__(model, model.__class__) diff --git a/llm_bench/python/llm_bench_utils/hook_common.py b/llm_bench/python/llm_bench_utils/hook_common.py index 4751ed7d4d..299641c24e 100644 --- a/llm_bench/python/llm_bench_utils/hook_common.py +++ b/llm_bench/python/llm_bench_utils/hook_common.py @@ -6,22 +6,25 @@ import transformers from packaging import version -TRANS_MIN_VERSION = '4.40.0' +TRANS_MIN_VERSION = "4.40.0" def get_bench_hook(num_beams, ov_model): min_version = version.parse(TRANS_MIN_VERSION) trans_version = version.parse(transformers.__version__) - search_type = 'beam search' if num_beams > 1 else 'greedy search' + search_type = "beam search" if num_beams > 1 else "greedy search" if trans_version >= min_version: import llm_bench_utils.hook_greedy_search import llm_bench_utils.hook_beam_search + if num_beams > 1: bench_hook = llm_bench_utils.hook_beam_search.BeamSearchHook() else: bench_hook = llm_bench_utils.hook_greedy_search.GreedySearchHook() bench_hook.new_forward(ov_model) else: - log.warning(f'The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}') + log.warning( + f"The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}" + ) bench_hook = None - return bench_hook \ No newline at end of file + return bench_hook diff --git a/llm_bench/python/llm_bench_utils/hook_forward.py b/llm_bench/python/llm_bench_utils/hook_forward.py index 702bd947e1..522aee1530 100644 --- a/llm_bench/python/llm_bench_utils/hook_forward.py +++ b/llm_bench/python/llm_bench_utils/hook_forward.py @@ -11,19 +11,35 @@ def __init__(self): self.vae_decoder_step_count = 0 def get_text_encoder_latency(self): - return (self.text_encoder_time / self.text_encoder_step_count) * 1000 if self.text_encoder_step_count > 0 else 0 + return ( + (self.text_encoder_time / self.text_encoder_step_count) * 1000 + if self.text_encoder_step_count > 0 + else 0 + ) def get_1st_unet_latency(self): return self.unet_time_list[0] * 1000 if len(self.unet_time_list) > 0 else 0 def get_2nd_unet_latency(self): - return sum(self.unet_time_list[1:]) / (len(self.unet_time_list) - 1) * 1000 if len(self.unet_time_list) > 1 else 0 + return ( + sum(self.unet_time_list[1:]) / (len(self.unet_time_list) - 1) * 1000 + if len(self.unet_time_list) > 1 + else 0 + ) def get_unet_latency(self): - return (sum(self.unet_time_list) / len(self.unet_time_list)) * 1000 if len(self.unet_time_list) > 0 else 0 + return ( + (sum(self.unet_time_list) / len(self.unet_time_list)) * 1000 + if len(self.unet_time_list) > 0 + else 0 + ) def get_vae_decoder_latency(self): - return (self.vae_decoder_time / 
self.vae_decoder_step_count) * 1000 if self.vae_decoder_step_count > 0 else 0 + return ( + (self.vae_decoder_time / self.vae_decoder_step_count) * 1000 + if self.vae_decoder_step_count > 0 + else 0 + ) def get_text_encoder_step_count(self): return self.text_encoder_step_count @@ -53,6 +69,7 @@ def my_text_encoder(inputs, share_inputs=True, **kwargs): self.text_encoder_time += text_encoder_time self.text_encoder_step_count += 1 return r + pipe.text_encoder.request = my_text_encoder def new_unet(self, pipe): @@ -66,6 +83,7 @@ def my_unet(inputs, share_inputs=True, **kwargs): self.unet_time_list.append(unet_time) self.unet_step_count += 1 return r + pipe.unet.request = my_unet def new_vae_decoder(self, pipe): @@ -79,4 +97,5 @@ def my_vae_decoder(inputs, share_inputs=True, **kwargs): self.vae_decoder_time += vae_decoder_time self.vae_decoder_step_count += 1 return r + pipe.vae_decoder.request = my_vae_decoder diff --git a/llm_bench/python/llm_bench_utils/hook_greedy_search.py b/llm_bench/python/llm_bench_utils/hook_greedy_search.py index a4de32625a..83e05b7a3d 100644 --- a/llm_bench/python/llm_bench_utils/hook_greedy_search.py +++ b/llm_bench/python/llm_bench_utils/hook_greedy_search.py @@ -47,281 +47,320 @@ class GenerateEncoderDecoderOutput(ModelOutput): tm_list = [] tm_infer_list = [] + # Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99 # Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2310 # Add the function of collecting latency def new_greedy_search( - self, - input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - streamer: Optional["BaseStreamer"] = None, - **model_kwargs, - ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be - used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin._greedy_search`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. 
- pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - output_logits (`bool`, *optional*, defaults to `False`): - Whether or not to return the raw prediction logit scores. See `logits` under returned tensors - for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... ) - - >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") - - >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token - >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id - - >>> input_prompt = "It might be possible to" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), - ... ] - ... ) - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - - >>> outputs = model._greedy_search( - ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria - ... 
) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ["It might be possible to get a better understanding of the nature of the problem, but it's not"] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - if eos_token_id is not None: - logger.warning_once( - "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" - " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." - " Otherwise make sure to set `model.generation_config.eos_token_id`", - FutureWarning, - ) - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - else: - # TODO remove when the method is totally private - # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever - eos_token_id = [ - criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id") - ] - eos_token_id = eos_token_id[0] if eos_token_id else None - if eos_token_id is None and self.generation_config.eos_token_id is not None: - eos_token_id = self.generation_config.eos_token_id - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + streamer: Optional["BaseStreamer"] = None, + **model_kwargs, +) -> Union[GenerateNonBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be + used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin._greedy_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. 
+ stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors + for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + model_kwargs: + Additional model specific keyword arguments will be forwarded to the `forward` function of the model. + If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + + Examples: + + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... StoppingCriteriaList, + ... MaxLengthCriteria, + ... ) + + >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") + + >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token + >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id + + >>> input_prompt = "It might be possible to" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), + ... ] + ... ) + >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) + + >>> outputs = model._greedy_search( + ... 
input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria + ... ) + + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ["It might be possible to get a better understanding of the nature of the problem, but it's not"] + ```""" + # init values + logits_processor = ( + logits_processor if logits_processor is not None else LogitsProcessorList() + ) + stopping_criteria = ( + stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + ) + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = ( + pad_token_id + if pad_token_id is not None + else self.generation_config.pad_token_id + ) + if eos_token_id is not None: + logger.warning_once( + "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" + " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." + " Otherwise make sure to set `model.generation_config.eos_token_id`", + FutureWarning, ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + else: + # TODO remove when the method is totally private + # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever + eos_token_id = [ + criteria.eos_token_id.tolist() + for criteria in stopping_criteria + if hasattr(criteria, "eos_token_id") + ] + eos_token_id = eos_token_id[0] if eos_token_id else None + if eos_token_id is None and self.generation_config.eos_token_id is not None: + eos_token_id = self.generation_config.eos_token_id + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + output_scores = ( + output_scores + if output_scores is not None + else self.generation_config.output_scores + ) + output_attentions = ( + output_attentions + if output_attentions is not None + else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + raw_logits = () if (return_dict_in_generate and output_logits) else None + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = ( + () if (return_dict_in_generate and output_hidden_states) else None + ) + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = ( + model_kwargs["encoder_outputs"].get("attentions") + if output_attentions + else None + ) + encoder_hidden_states = ( + 
model_kwargs["encoder_outputs"].get("hidden_states") + if output_hidden_states + else None ) - # init attention / hidden states / scores tuples - raw_logits = () if (return_dict_in_generate and output_logits) else None - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) + # keep track of which sequences are already finished + batch_size, cur_len = input_ids.shape + if "inputs_embeds" in model_kwargs: + cur_len = model_kwargs["inputs_embeds"].shape[1] + this_peer_finished = False + unfinished_sequences = torch.ones( + batch_size, dtype=torch.long, device=input_ids.device + ) + model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) + + while self._has_unfinished_sequences( + this_peer_finished, synced_gpus, device=input_ids.device + ): + tic = time.perf_counter() + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + tic_infer = time.perf_counter() + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + tm_infer_list.append(time.perf_counter() - tic_infer) - # keep track of which sequences are already finished - batch_size, cur_len = input_ids.shape - if "inputs_embeds" in model_kwargs: - cur_len = model_kwargs["inputs_embeds"].shape[1] - this_peer_finished = False - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) - model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) - - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - tic = time.perf_counter() - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # forward pass to get next token - tic_infer = time.perf_counter() - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - tm_infer_list.append(time.perf_counter() - tic_infer) - - if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_tokens_scores = logits_processor(input_ids, next_token_logits) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_tokens_scores,) - if output_logits: - raw_logits += (next_token_logits,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - 
else (outputs.hidden_states,) - ) - - # argmax - next_tokens = torch.argmax(next_tokens_scores, dim=-1) - - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if streamer is not None: - streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need - unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) - this_peer_finished = unfinished_sequences.max() == 0 - tm_list.append(time.perf_counter() - tic) + next_token_logits = outputs.logits[:, -1, :] - if streamer is not None: - streamer.end() + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits) + # Store scores, attentions and hidden_states when required if return_dict_in_generate: - if self.config.is_encoder_decoder: - return GenerateEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), + if output_scores: + scores += (next_tokens_scores,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) + if self.config.is_encoder_decoder + else (outputs.attentions,) ) - else: - return GenerateDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) ) + + # argmax + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError( + "If `eos_token_id` is defined, make sure that `pad_token_id` is defined." 
+ ) + next_tokens = next_tokens * unfinished_sequences + pad_token_id * ( + 1 - unfinished_sequences + ) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) + + unfinished_sequences = unfinished_sequences & ~stopping_criteria( + input_ids, scores + ) + this_peer_finished = unfinished_sequences.max() == 0 + tm_list.append(time.perf_counter() - tic) + + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GenerateEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) else: - return input_ids + return GenerateDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return input_ids class GreedySearchHook: @@ -355,4 +394,3 @@ def new_forward(self, model): """Define a new greedy search function.""" model._greedy_search = new_greedy_search.__get__(model, model.__class__) model._sample = hook_sample.new_sample.__get__(model, model.__class__) - diff --git a/llm_bench/python/llm_bench_utils/hook_sample.py b/llm_bench/python/llm_bench_utils/hook_sample.py index 22111c1a3f..f8c4b2733b 100644 --- a/llm_bench/python/llm_bench_utils/hook_sample.py +++ b/llm_bench/python/llm_bench_utils/hook_sample.py @@ -49,181 +49,201 @@ class GenerateEncoderDecoderOutput(ModelOutput): # Copied from https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/generation/utils.py#L2310 # Add the function of collecting latency def new_sample( - self, - input_ids: torch.LongTensor, - logits_processor: LogitsProcessorList, - stopping_criteria: StoppingCriteriaList, - generation_config: GenerationConfig, - synced_gpus: bool, - streamer: Optional["BaseStreamer"], - logits_warper: Optional[LogitsProcessorList] = None, - **model_kwargs, - ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and - can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - generation_config ([`~generation.GenerationConfig`]): - The generation configuration to be used as parametrization of the decoding method. 
- synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in - `generation_config`) - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: - A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - """ - # init values - pad_token_id = generation_config.pad_token_id - output_attentions = generation_config.output_attentions - output_hidden_states = generation_config.output_hidden_states - output_scores = generation_config.output_scores - output_logits = generation_config.output_logits - return_dict_in_generate = generation_config.return_dict_in_generate - has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) - do_sample = generation_config.do_sample - if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): - raise ValueError( - "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " - f"{logits_warper})." - ) + self, + input_ids: torch.LongTensor, + logits_processor: LogitsProcessorList, + stopping_criteria: StoppingCriteriaList, + generation_config: GenerationConfig, + synced_gpus: bool, + streamer: Optional["BaseStreamer"], + logits_warper: Optional[LogitsProcessorList] = None, + **model_kwargs, +) -> Union[GenerateNonBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + generation_config ([`~generation.GenerationConfig`]): + The generation configuration to be used as parametrization of the decoding method. 
+ synced_gpus (`bool`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + logits_warper (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used + to warp the prediction score distribution of the language modeling head applied before multinomial + sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in + `generation_config`) + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: + A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + """ + # init values + pad_token_id = generation_config.pad_token_id + output_attentions = generation_config.output_attentions + output_hidden_states = generation_config.output_hidden_states + output_scores = generation_config.output_scores + output_logits = generation_config.output_logits + return_dict_in_generate = generation_config.return_dict_in_generate + has_eos_stopping_criteria = any( + hasattr(criteria, "eos_token_id") for criteria in stopping_criteria + ) + do_sample = generation_config.do_sample + if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): + raise ValueError( + "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " + f"{logits_warper})." 
+ ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = ( + () if (return_dict_in_generate and output_hidden_states) else None + ) + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = ( + model_kwargs["encoder_outputs"].get("attentions") + if output_attentions + else None + ) + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") + if output_hidden_states + else None + ) + + # keep track of which sequences are already finished + batch_size = input_ids.shape[0] + this_peer_finished = False + unfinished_sequences = torch.ones( + batch_size, dtype=torch.long, device=input_ids.device + ) + model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) + + while self._has_unfinished_sequences( + this_peer_finished, synced_gpus, device=input_ids.device + ): + tic = time.perf_counter() + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + tic_infer = time.perf_counter() + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + hook_greedy.tm_infer_list.append(time.perf_counter() - tic_infer) + + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + if do_sample: + next_token_scores = logits_warper(input_ids, next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) + if self.config.is_encoder_decoder + else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - raw_logits = () if (return_dict_in_generate and output_logits) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) + # token selection + if do_sample: + probs = 
nn.functional.softmax(next_token_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(next_token_scores, dim=-1) - # keep track of which sequences are already finished - batch_size = input_ids.shape[0] - this_peer_finished = False - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) - model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) - - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - tic = time.perf_counter() - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # forward pass to get next token - tic_infer = time.perf_counter() - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - hook_greedy.tm_infer_list.append(time.perf_counter() - tic_infer) - - if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - if do_sample: - next_token_scores = logits_warper(input_ids, next_token_scores) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores,) - if output_logits: - raw_logits += (next_token_logits,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # token selection - if do_sample: - probs = nn.functional.softmax(next_token_scores, dim=-1) - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - next_tokens = torch.argmax(next_token_scores, dim=-1) - - # finished sentences should have their next token be a padding token - if has_eos_stopping_criteria: - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if streamer is not None: - streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, + # finished sentences should have their next token be a padding token + if has_eos_stopping_criteria: + next_tokens = next_tokens * unfinished_sequences + pad_token_id * ( + 1 - unfinished_sequences ) - unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) - this_peer_finished = unfinished_sequences.max() == 0 - hook_greedy.tm_list.append(time.perf_counter() - tic) - + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) if streamer is not None: - streamer.end() - - if return_dict_in_generate: - if self.config.is_encoder_decoder: - return GenerateEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - 
decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) + + unfinished_sequences = unfinished_sequences & ~stopping_criteria( + input_ids, scores + ) + this_peer_finished = unfinished_sequences.max() == 0 + hook_greedy.tm_list.append(time.perf_counter() - tic) + + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GenerateEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) else: - return input_ids \ No newline at end of file + return GenerateDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return input_ids diff --git a/llm_bench/python/llm_bench_utils/memory_profile.py b/llm_bench/python/llm_bench_utils/memory_profile.py index 25bf33c938..def06300fe 100644 --- a/llm_bench/python/llm_bench_utils/memory_profile.py +++ b/llm_bench/python/llm_bench_utils/memory_profile.py @@ -28,10 +28,10 @@ def collect_memory_consumption(self): try: memory_full_info = process.memory_full_info() rss_mem_data = memory_full_info.rss - if sys.platform.startswith('linux'): + if sys.platform.startswith("linux"): shared_mem_data = memory_full_info.shared uss_mem_data = rss_mem_data - shared_mem_data - elif sys.platform.startswith('win'): + elif sys.platform.startswith("win"): uss_mem_data = memory_full_info.uss shared_mem_data = rss_mem_data - uss_mem_data else: @@ -70,9 +70,21 @@ def get_max_memory_consumption(self): """Return the data.""" self.g_data_event.wait() self.g_data_event.clear() - max_rss_mem = self.g_max_rss_mem_consumption / float(2**20) if self.g_max_rss_mem_consumption > -1 else -1 - max_shared_mem = self.g_max_shared_mem_consumption / float(2**20) if self.g_max_shared_mem_consumption > -1 else -1 - max_uss_mem = self.g_max_uss_mem_consumption / float(2**20) if self.g_max_uss_mem_consumption > -1 else -1 + max_rss_mem = ( + self.g_max_rss_mem_consumption / float(2**20) + if self.g_max_rss_mem_consumption > -1 + else -1 + ) + max_shared_mem = ( + self.g_max_shared_mem_consumption / float(2**20) + if self.g_max_shared_mem_consumption > -1 + else -1 + ) + max_uss_mem = ( + self.g_max_uss_mem_consumption / float(2**20) + if self.g_max_uss_mem_consumption > -1 + else -1 + ) return max_rss_mem, max_shared_mem, max_uss_mem def clear_max_memory_consumption(self): diff --git a/llm_bench/python/llm_bench_utils/metrics_print.py b/llm_bench/python/llm_bench_utils/metrics_print.py index c172060d8b..8fcec3e72b 100644 --- a/llm_bench/python/llm_bench_utils/metrics_print.py +++ b/llm_bench/python/llm_bench_utils/metrics_print.py @@ -5,123 +5,140 @@ def 
print_metrics( - iter_num, iter_data, tms=None, tms_infer=None, warm_up=False, max_rss_mem=-1, max_shared_mem=-1, - max_uss_mem=-1, stable_diffusion=None, tokenization_time=None, batch_size=1 + iter_num, + iter_data, + tms=None, + tms_infer=None, + warm_up=False, + max_rss_mem=-1, + max_shared_mem=-1, + max_uss_mem=-1, + stable_diffusion=None, + tokenization_time=None, + batch_size=1, ): iter_str = str(iter_num) if warm_up: - iter_str = 'warm-up' - output_str = '' - latency_unit = 'token' + iter_str = "warm-up" + output_str = "" + latency_unit = "token" if batch_size > 1: - latency_unit = '{}tokens'.format(batch_size) - if iter_data['input_size'] != '': - output_str += 'Input token size: {}, '.format(iter_data['input_size']) - if iter_data['output_size'] != '': - output_str += 'Output size: {}, '.format(iter_data['output_size']) - if iter_data['infer_count'] != '': - output_str += 'Infer count: {}, '.format(iter_data['infer_count']) + latency_unit = "{}tokens".format(batch_size) + if iter_data["input_size"] != "": + output_str += "Input token size: {}, ".format(iter_data["input_size"]) + if iter_data["output_size"] != "": + output_str += "Output size: {}, ".format(iter_data["output_size"]) + if iter_data["infer_count"] != "": + output_str += "Infer count: {}, ".format(iter_data["infer_count"]) if tokenization_time: - output_str += 'Tokenization Time: {:.2f}ms, '.format(tokenization_time[0]) + output_str += "Tokenization Time: {:.2f}ms, ".format(tokenization_time[0]) if len(tokenization_time) > 1: - output_str += 'Detokenization Time: {:.2f}ms, '.format(tokenization_time[1]) - if iter_data['generation_time'] != '': - output_str += 'Generation Time: {:.2f}s, '.format(iter_data['generation_time']) - if iter_data['latency'] != '': - output_str += 'Latency: {:.2f} ms/{}'.format(iter_data['latency'], latency_unit) - if output_str != '': - output_str = ' '.join(['[{}]'.format(iter_str), output_str]) + output_str += "Detokenization Time: {:.2f}ms, ".format(tokenization_time[1]) + if iter_data["generation_time"] != "": + output_str += "Generation Time: {:.2f}s, ".format(iter_data["generation_time"]) + if iter_data["latency"] != "": + output_str += "Latency: {:.2f} ms/{}".format(iter_data["latency"], latency_unit) + if output_str != "": + output_str = " ".join(["[{}]".format(iter_str), output_str]) log.info(output_str) - if tms is not None: + if len(tms) > 0: iter_data['first_token_latency'] = tms[0] * 1000 if len(tms) > 0 else -1 iter_data['other_tokens_avg_latency'] = sum(tms[1:]) / (len(tms) - 1) * 1000 if len(tms) > 1 else -1 - first_token_latency = 'NA' if iter_data['first_token_latency'] == -1 else f"{iter_data['first_token_latency']:.2f} ms/{latency_unit}" - other_token_latency = 'NA' if iter_data['other_tokens_avg_latency'] == -1 else f"{iter_data['other_tokens_avg_latency']:.2f} ms/{latency_unit}" log.info( f"[{iter_str}] First token latency: {first_token_latency}, " f"other tokens latency: {other_token_latency}, len of tokens: {len(tms)} * {batch_size}", ) - if len(tms) == 0: + else: + if tokenization_time: log.warning(f'[{iter_str}] No hook data output for first token latency and other tokens latency') - if tms_infer is not None: + if len(tms_infer) > 0: iter_data['first_token_infer_latency'] = tms_infer[0] * 1000 if len(tms_infer) > 0 else -1 iter_data['other_tokens_infer_avg_latency'] = sum(tms_infer[1:]) / (len(tms_infer) - 1) * 1000 if len(tms_infer) > 1 else -1 - first_infer_latency = 'NA' if iter_data['first_token_infer_latency'] == -1 else 
f"{iter_data['first_token_infer_latency']:.2f} ms/infer" - other_infer_latency = 'NA' if iter_data['other_tokens_infer_avg_latency'] == -1 else f"{iter_data['other_tokens_infer_avg_latency']:.2f} ms/infer" log.info( f"[{iter_str}] First infer latency: {first_infer_latency}, " f"other infers latency: {other_infer_latency}, inference count: {len(tms_infer)}", ) - if len(tms_infer) == 0: + else: + if tokenization_time: log.warning(f'[{iter_str}] No hook data output for first infer latency and other infers latency') if stable_diffusion is not None: print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion) - output_str = '' - if max_rss_mem != '' and max_rss_mem > -1: - output_str += 'Max rss memory cost: {:.2f}MBytes, '.format(max_rss_mem) - if max_uss_mem != '' and max_uss_mem > -1: - output_str += 'max uss memory cost: {:.2f}MBytes, '.format(max_uss_mem) - if max_shared_mem != '' and max_shared_mem > -1: - output_str += 'max shared memory cost: {:.2f}MBytes'.format(max_shared_mem) - if output_str != '': - output_str = ' '.join(['[{}]'.format(iter_str), output_str]) + output_str = "" + if max_rss_mem != "" and max_rss_mem > -1: + output_str += "Max rss memory cost: {:.2f}MBytes, ".format(max_rss_mem) + if max_uss_mem != "" and max_uss_mem > -1: + output_str += "max uss memory cost: {:.2f}MBytes, ".format(max_uss_mem) + if max_shared_mem != "" and max_shared_mem > -1: + output_str += "max shared memory cost: {:.2f}MBytes".format(max_shared_mem) + if output_str != "": + output_str = " ".join(["[{}]".format(iter_str), output_str]) log.info(output_str) - if iter_data['result_md5'] != '': + if iter_data["result_md5"] != "": log.info(f"[{iter_str}] Result MD5:{iter_data['result_md5']}") def print_generated(iter_num, warm_up=False, generated=None): iter_str = str(iter_num) if warm_up: - iter_str = 'warm-up' + iter_str = "warm-up" if generated is not None: try: - log.info(f'[{iter_str}] Generated: {generated}') + log.info(f"[{iter_str}] Generated: {generated}") except UnicodeError: try: - utf8_generated = generated.encode(encoding="utf-8", errors="replace").decode() - log.info(f'[{iter_str}] Generated: {utf8_generated}') + utf8_generated = generated.encode( + encoding="utf-8", errors="replace" + ).decode() + log.info(f"[{iter_str}] Generated: {utf8_generated}") except Exception: log.warning(f"[{iter_str}] Unable print generated") def print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion): - iter_data['first_token_latency'] = stable_diffusion.get_1st_unet_latency() - iter_data['other_tokens_avg_latency'] = stable_diffusion.get_2nd_unet_latency() - iter_data['first_token_infer_latency'] = iter_data['first_token_latency'] - iter_data['other_tokens_infer_avg_latency'] = iter_data['other_tokens_avg_latency'] - log.info(f"[{iter_str}] First step of unet latency: {iter_data['first_token_latency']:.2f} ms/step, " - f"other steps of unet latency: {iter_data['other_tokens_avg_latency']:.2f} ms/step",) - log.info(f"[{iter_str}] Text encoder latency: {stable_diffusion.get_text_encoder_latency():.2f} ms/step, " - f"unet latency: {stable_diffusion.get_unet_latency():.2f} ms/step, " - f"vae decoder latency: {stable_diffusion.get_vae_decoder_latency():.2f} ms/step, " - f"text encoder step count: {stable_diffusion.get_text_encoder_step_count()}, " - f"unet step count: {stable_diffusion.get_unet_step_count()}, " - f"vae decoder step count: {stable_diffusion.get_vae_decoder_step_count()}",) + iter_data["first_token_latency"] = stable_diffusion.get_1st_unet_latency() + 
iter_data["other_tokens_avg_latency"] = stable_diffusion.get_2nd_unet_latency() + iter_data["first_token_infer_latency"] = iter_data["first_token_latency"] + iter_data["other_tokens_infer_avg_latency"] = iter_data["other_tokens_avg_latency"] + log.info( + f"[{iter_str}] First step of unet latency: {iter_data['first_token_latency']:.2f} ms/step, " + f"other steps of unet latency: {iter_data['other_tokens_avg_latency']:.2f} ms/step", + ) + log.info( + f"[{iter_str}] Text encoder latency: {stable_diffusion.get_text_encoder_latency():.2f} ms/step, " + f"unet latency: {stable_diffusion.get_unet_latency():.2f} ms/step, " + f"vae decoder latency: {stable_diffusion.get_vae_decoder_latency():.2f} ms/step, " + f"text encoder step count: {stable_diffusion.get_text_encoder_step_count()}, " + f"unet step count: {stable_diffusion.get_unet_step_count()}, " + f"vae decoder step count: {stable_diffusion.get_vae_decoder_step_count()}", + ) def print_ldm_unet_vqvae_infer_latency(iter_num, iter_data, tms=None, warm_up=False): iter_str = str(iter_num) if warm_up: - iter_str = 'warm-up' + iter_str = "warm-up" len_tms = len(tms) - iter_data['first_token_latency'] = tms[0] * 1000 if len_tms > 0 else -1 - iter_data['other_tokens_avg_latency'] = sum(tms[1:(len_tms - 1)]) / (len_tms - 2) * 1000 if len_tms > 2 else 0 - iter_data['first_token_infer_latency'] = iter_data['first_token_latency'] - iter_data['other_tokens_infer_avg_latency'] = iter_data['other_tokens_avg_latency'] + iter_data["first_token_latency"] = tms[0] * 1000 if len_tms > 0 else -1 + iter_data["other_tokens_avg_latency"] = ( + sum(tms[1 : (len_tms - 1)]) / (len_tms - 2) * 1000 if len_tms > 2 else 0 + ) + iter_data["first_token_infer_latency"] = iter_data["first_token_latency"] + iter_data["other_tokens_infer_avg_latency"] = iter_data["other_tokens_avg_latency"] - first_token_latency = 'NA' if iter_data['first_token_latency'] == -1 else f"{iter_data['first_token_latency']:.2f} ms/step" - other_token_latency = 'NA' if iter_data['other_tokens_avg_latency'] == -1 else f"{iter_data['other_tokens_avg_latency']:.2f} ms/step" - log.info(f"[{iter_str}] First step of unet latency: {first_token_latency}, " - f"other steps of unet latency: {other_token_latency}",) + log.info(f"[{iter_str}] First step of unet latency: {iter_data['first_token_latency']:.2f} ms/step, " + f"other steps of unet latency: {iter_data['other_tokens_avg_latency']:.2f} ms/step",) if len_tms > 1: - log.info(f"[{iter_str}] Unet latency: {(sum(tms[0:(len_tms - 1)]) / (len_tms - 1)) * 1000:.2f} ms/step, " - f"vqvae decoder latency: {tms[len_tms - 1] * 1000:.2f} ms/step, " - f"unet step count: {len_tms - 1}, " - f"vqvae decoder step count: 1",) + log.info( + f"[{iter_str}] Unet latency: {(sum(tms[0:(len_tms - 1)]) / (len_tms - 1)) * 1000:.2f} ms/step, " + f"vqvae decoder latency: {tms[len_tms - 1] * 1000:.2f} ms/step, " + f"unet step count: {len_tms - 1}, " + f"vqvae decoder step count: 1", + ) -def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen): +def output_avg_statis_tokens( + prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen +): for p_idx in prompt_idx_list: avg_1st_token_latency = 0 avg_2nd_tokens_latency = 0 @@ -130,12 +147,22 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch index_num = 0 for iter_data in iter_data_list: # Exclude the warm-up iteration - if iter_data['iteration'] == 0: + if iter_data["iteration"] == 0: continue - if iter_data['prompt_idx'] == p_idx: - 
avg_1st_token_latency += iter_data['first_token_latency'] if iter_data['first_token_latency'] != '' else 0 - avg_2nd_tokens_latency += iter_data['other_tokens_avg_latency'] if iter_data['other_tokens_avg_latency'] != '' else 0 - avg_input_size += iter_data['input_size'] if iter_data['input_size'] != '' else 0 + if iter_data["prompt_idx"] == p_idx: + avg_1st_token_latency += ( + iter_data["first_token_latency"] + if iter_data["first_token_latency"] != "" + else 0 + ) + avg_2nd_tokens_latency += ( + iter_data["other_tokens_avg_latency"] + if iter_data["other_tokens_avg_latency"] != "" + else 0 + ) + avg_input_size += ( + iter_data["input_size"] if iter_data["input_size"] != "" else 0 + ) index_num = index_num + 1 if index_num > 0: avg_1st_token_latency = avg_1st_token_latency / index_num @@ -143,23 +170,20 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch avg_input_size = int(avg_input_size / index_num) if avg_2nd_tokens_latency > 0: avg_2nd_token_tput = (1 / avg_2nd_tokens_latency) * batch_size * 1000 - latency_unit = 'token' if is_text_gen is True else 'step' + latency_unit = "token" if is_text_gen is True else "step" if batch_size > 1: if is_text_gen is True: - latency_unit = '{}tokens'.format(batch_size) + latency_unit = "{}tokens".format(batch_size) else: latency_unit = '{}steps'.format(batch_size) - avg_1st_token_latency = 'NA' if avg_1st_token_latency < 0 else f'{avg_1st_token_latency:.2f} ms/{latency_unit}' - avg_2nd_tokens_latency = 'NA' if avg_2nd_tokens_latency < 0 else f'{avg_2nd_tokens_latency:.2f} ms/{latency_unit}' - avg_2nd_token_tput = 'NA' if avg_2nd_tokens_latency == 'NA' else f'{avg_2nd_token_tput:.2f} {latency_unit}s/s' if is_text_gen is True: - prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] Input token size: {}, 1st token lantency: {}, ' \ - '2nd token lantency: {}, 2nd tokens throughput: {}' \ - .format(p_idx, avg_input_size, avg_1st_token_latency, avg_2nd_tokens_latency, avg_2nd_token_tput) + prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] Input token size: {}, 1st token lantency: {:.2f} ms/{}, ' \ + '2nd tokens latency: {:.2f} ms/{}, 2nd tokens throughput: {:.2f} tokens/s' \ + .format(p_idx, avg_input_size, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput) else: - prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] 1st step of unet latency: {}, ' \ - '2nd steps of unet latency: {}, 2nd steps throughput: {}' \ - .format(p_idx, avg_1st_token_latency, avg_2nd_tokens_latency, avg_2nd_token_tput) + prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] 1st step of unet latency {:.2f} ms/{}, ' \ + '2nd steps of unet latency: {:.2f} ms/{}, 2nd steps throughput: {:.2f} steps/s' \ + .format(p_idx, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput) def print_average(iter_data_list, prompt_idx_list, batch_size, is_text_gen=False): @@ -170,22 +194,24 @@ def print_average(iter_data_list, prompt_idx_list, batch_size, is_text_gen=False total_num_tokens = 0 warm_up_iters = 0 for iter_data in iter_data_list: - if iter_data['iteration'] == 0: + if iter_data["iteration"] == 0: # Exclude the warm-up iteration warm_up_iters = warm_up_iters + 1 continue - if iter_data['generation_time'] != '': - total_generation_time += iter_data['generation_time'] - if iter_data['output_size'] != '': - total_num_tokens += iter_data['output_size'] + if iter_data["generation_time"] != "": + total_generation_time += iter_data["generation_time"] + if 
iter_data["output_size"] != "": + total_num_tokens += iter_data["output_size"] total_iters = len(iter_data_list) - warm_up_iters if total_iters > 0: prompt_dict = {} - output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen) - log.info('<<< Warm-up iteration is excluded. >>>') - out_str = '[Total] Iterations: {}'.format(total_iters) + output_avg_statis_tokens( + prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen + ) + log.info("<<< Warm-up iteration is excluded. >>>") + out_str = "[Total] Iterations: {}".format(total_iters) for prompt_key in prompt_dict: out_str += prompt_dict[prompt_key] log.info(out_str) diff --git a/llm_bench/python/llm_bench_utils/model_utils.py b/llm_bench/python/llm_bench_utils/model_utils.py index b35d7be47b..fd1946a3d9 100644 --- a/llm_bench/python/llm_bench_utils/model_utils.py +++ b/llm_bench/python/llm_bench_utils/model_utils.py @@ -6,112 +6,137 @@ import json import logging as log from pathlib import Path -from llm_bench_utils.config_class import DEFAULT_MODEL_CLASSES, USE_CASES, OV_MODEL_CLASSES_MAPPING, PT_MODEL_CLASSES_MAPPING +from llm_bench_utils.config_class import ( + DEFAULT_MODEL_CLASSES, + USE_CASES, + OV_MODEL_CLASSES_MAPPING, + PT_MODEL_CLASSES_MAPPING, +) def get_prompts(args): prompts_list = [] - if args['prompt'] is None and args['prompt_file'] is None: - if args['use_case'] == 'text_gen': - prompts_list.append('What is OpenVINO?') - elif args['use_case'] == 'code_gen': - prompts_list.append('def print_hello_world():') - elif args['prompt'] is not None and args['prompt_file'] is not None: - raise RuntimeError('== prompt and prompt file should not exist together ==') + if args["prompt"] is None and args["prompt_file"] is None: + if args["use_case"] == "text_gen": + prompts_list.append("What is OpenVINO?") + elif args["use_case"] == "code_gen": + prompts_list.append("def print_hello_world():") + elif args["prompt"] is not None and args["prompt_file"] is not None: + raise RuntimeError("== prompt and prompt file should not exist together ==") else: - if args['prompt'] is not None: - if args['prompt'] != '': - prompts_list.append(args['prompt']) + if args["prompt"] is not None: + if args["prompt"] != "": + prompts_list.append(args["prompt"]) else: - raise RuntimeError('== prompt should not be empty string ==') + raise RuntimeError("== prompt should not be empty string ==") else: - input_prompt_list = args['prompt_file'] + input_prompt_list = args["prompt_file"] for input_prompt in input_prompt_list: - if input_prompt.endswith('.jsonl'): + if input_prompt.endswith(".jsonl"): if os.path.exists(input_prompt): - log.info(f'Read prompts from {input_prompt}') - with open(input_prompt, 'r', encoding='utf-8') as f: + log.info(f"Read prompts from {input_prompt}") + with open(input_prompt, "r", encoding="utf-8") as f: for line in f: data = json.loads(line) - if 'prompt' in data: - if data['prompt'] != '': - prompts_list.append(data['prompt']) + if "prompt" in data: + if data["prompt"] != "": + prompts_list.append(data["prompt"]) else: - raise RuntimeError(f'== prompt in prompt file:{input_prompt} should not be empty string ==') + raise RuntimeError( + f"== prompt in prompt file:{input_prompt} should not be empty string ==" + ) else: - raise RuntimeError(f'== key word "prompt" does not exist in prompt file:{input_prompt} ==') + raise RuntimeError( + f'== key word "prompt" does not exist in prompt file:{input_prompt} ==' + ) else: - raise RuntimeError(f'== The prompt file:{input_prompt} does not exist ==') 
+ raise RuntimeError( + f"== The prompt file:{input_prompt} does not exist ==" + ) else: - raise RuntimeError(f'== The prompt file:{input_prompt} should be ended with .jsonl ==') + raise RuntimeError( + f"== The prompt file:{input_prompt} should be ended with .jsonl ==" + ) return prompts_list def get_image_param_from_prompt_file(args): image_param_list = [] - if args['prompt'] is None and args['prompt_file'] is None: - image_param_list.append({'prompt' : 'sailing ship in storm by Leonardo da Vinci'}) - elif args['prompt'] is not None and args['prompt_file'] is not None: - raise RuntimeError('== prompt and prompt file should not exist together ==') + if args["prompt"] is None and args["prompt_file"] is None: + image_param_list.append( + {"prompt": "sailing ship in storm by Leonardo da Vinci"} + ) + elif args["prompt"] is not None and args["prompt_file"] is not None: + raise RuntimeError("== prompt and prompt file should not exist together ==") else: - if args['prompt'] is not None: - if args['prompt'] != '': - image_param_list.append({'prompt' : args['prompt']}) + if args["prompt"] is not None: + if args["prompt"] != "": + image_param_list.append({"prompt": args["prompt"]}) else: - raise RuntimeError('== prompt should not be empty string ==') + raise RuntimeError("== prompt should not be empty string ==") else: - input_prompt_list = args['prompt_file'] + input_prompt_list = args["prompt_file"] for input_prompt in input_prompt_list: - if input_prompt.endswith('.jsonl'): + if input_prompt.endswith(".jsonl"): if os.path.exists(input_prompt): - log.info(f'Read prompts from {input_prompt}') - with open(input_prompt, 'r', encoding='utf-8') as f: + log.info(f"Read prompts from {input_prompt}") + with open(input_prompt, "r", encoding="utf-8") as f: for line in f: image_param = {} data = json.loads(line) - if 'prompt' in data: - if data['prompt'] != '': - image_param['prompt'] = data['prompt'] + if "prompt" in data: + if data["prompt"] != "": + image_param["prompt"] = data["prompt"] else: - raise RuntimeError('== prompt in prompt file:{input_prompt} should not be empty string ==') + raise RuntimeError( + "== prompt in prompt file:{input_prompt} should not be empty string ==" + ) else: - raise RuntimeError(f'== key word "prompt" does not exist in prompt file:{input_prompt} ==') - if 'width' in data: - image_param['width'] = int(data['width']) - if 'height' in data: - image_param['height'] = int(data['height']) - if 'steps' in data: - image_param['steps'] = int(data['steps']) - if 'guidance_scale' in data: - image_param['guidance_scale'] = float(data['guidance_scale']) + raise RuntimeError( + f'== key word "prompt" does not exist in prompt file:{input_prompt} ==' + ) + if "width" in data: + image_param["width"] = int(data["width"]) + if "height" in data: + image_param["height"] = int(data["height"]) + if "steps" in data: + image_param["steps"] = int(data["steps"]) + if "guidance_scale" in data: + image_param["guidance_scale"] = float( + data["guidance_scale"] + ) image_param_list.append(image_param) else: - raise RuntimeError(f'== The prompt file:{input_prompt} does not exist ==') + raise RuntimeError( + f"== The prompt file:{input_prompt} does not exist ==" + ) else: - raise RuntimeError(f'== The prompt file:{input_prompt} should be ended with .jsonl ==') + raise RuntimeError( + f"== The prompt file:{input_prompt} should be ended with .jsonl ==" + ) return image_param_list def set_default_param_for_ov_config(ov_config): # With this PR https://github.com/huggingface/optimum-intel/pull/362, we are 
able to disable model cache - if 'CACHE_DIR' not in ov_config: - ov_config['CACHE_DIR'] = '' + if "CACHE_DIR" not in ov_config: + ov_config["CACHE_DIR"] = "" def add_stateful_model_arguments(parser: argparse.ArgumentParser): parser.add_argument( - '--stateful', - action='store_true', + "--stateful", + action="store_true", default=None, - help='Replace kv-cache inputs and outputs in the model by internal variables making a stateful model. ' - 'Additional operations are inserted into the model to handle cache state (Gathers, ShapeOf, etc.)', + help="Replace kv-cache inputs and outputs in the model by internal variables making a stateful model. " + "Additional operations are inserted into the model to handle cache state (Gathers, ShapeOf, etc.)", ) parser.add_argument( - '--disable-stateful', + "--disable-stateful", action="store_true", default=None, - help="Disable stateful transformation for model conversion" + help="Disable stateful transformation for model conversion", ) @@ -136,42 +161,42 @@ def analyze_args(args): model_args['subsequent'] = args.subsequent model_args['output_dir'] = args.output_dir model_args['genai'] = args.genai - model_args["use_cb"] = args.use_cb model_args['devices'] = args.device model_args['prompt_index'] = [] if args.prompt_index is not None else None if model_args['prompt_index'] is not None: # Deduplication - [model_args['prompt_index'].append(i) for i in args.prompt_index if i not in model_args['prompt_index']] - model_args['end_token_stopping'] = args.end_token_stopping + [ + model_args["prompt_index"].append(i) + for i in args.prompt_index + if i not in model_args["prompt_index"] + ] + model_args["end_token_stopping"] = args.end_token_stopping model_framework = args.framework model_path = Path(args.model) if not model_path.exists(): - raise RuntimeError(f'==Failure FOUND==: Incorrect model path:{model_path}') - if model_framework in ('ov', 'pt'): + raise RuntimeError(f"==Failure FOUND==: Incorrect model path:{model_path}") + if model_framework in ("ov", "pt"): use_case, model_name = get_use_case(args.model) - model_args['use_case'] = use_case - if use_case == 'code_gen' and not model_args['prompt'] and not model_args['prompt_file']: - model_args['prompt'] = 'def print_hello_world():' - model_args['config'] = {} + model_args["use_case"] = use_case + if ( + use_case == "code_gen" + and not model_args["prompt"] + and not model_args["prompt_file"] + ): + model_args["prompt"] = "def print_hello_world():" + model_args["config"] = {} if args.load_config is not None: config = get_config(args.load_config) if type(config) is dict and len(config) > 0: - model_args['config'] = config - if model_framework == 'ov': - set_default_param_for_ov_config(model_args['config']) + model_args["config"] = config + if model_framework == "ov": + set_default_param_for_ov_config(model_args["config"]) log.info(f"OV Config={model_args['config']}") - elif model_framework == 'pt': + elif model_framework == "pt": log.info(f"PT Config={model_args['config']}") model_args['model_type'] = get_model_type(model_name, use_case, model_framework) model_args['model_name'] = model_name - - if args.use_cb and not args.genai: - raise RuntimeError("Continious batching mode supported only via OpenVINO GenAI") - cb_config = None - if args.cb_config: - cb_config = get_config(args.cb_config) - model_args["cb_config"] = cb_config return model_path, model_framework, model_args, model_name @@ -183,7 +208,9 @@ def get_use_case(model_name_or_path): for case, model_ids in USE_CASES.items(): for model_id in 
model_ids: if model_name.lower().startswith(model_id): - log.info(f'==SUCCESS FOUND==: use_case: {case}, model_type: {model_name}') + log.info( + f"==SUCCESS FOUND==: use_case: {case}, model_type: {model_name}" + ) return case, model_name # 2. try to get use_case from model config @@ -196,29 +223,38 @@ def get_use_case(model_name_or_path): if config is not None: for case, model_ids in USE_CASES.items(): for idx, model_id in enumerate(normalize_model_ids(model_ids)): - if config.get("model_type").lower().replace('_', '-').startswith(model_id): - log.info(f'==SUCCESS FOUND==: use_case: {case}, model_type: {model_id}') + if ( + config.get("model_type") + .lower() + .replace("_", "-") + .startswith(model_id) + ): + log.info( + f"==SUCCESS FOUND==: use_case: {case}, model_type: {model_id}" + ) return case, model_ids[idx] - raise RuntimeError('==Failure FOUND==: no use_case found') + raise RuntimeError("==Failure FOUND==: no use_case found") def get_config(config): - with open(config, 'r') as f: + with open(config, "r") as f: try: ov_config = json.load(f) except Exception: - raise RuntimeError(f'==Parse file:{config} failiure, json format is incorrect ==') + raise RuntimeError( + f"==Parse file:{config} failiure, json format is incorrect ==" + ) return ov_config def get_model_type(model_name, use_case, model_framework): default_model_type = DEFAULT_MODEL_CLASSES.get(use_case) - if model_framework == 'ov': + if model_framework == "ov": for cls in OV_MODEL_CLASSES_MAPPING: if cls in model_name.lower(): return cls - elif model_framework == 'pt': + elif model_framework == "pt": for cls in PT_MODEL_CLASSES_MAPPING: if cls in model_name.lower(): return cls @@ -226,11 +262,11 @@ def get_model_type(model_name, use_case, model_framework): def normalize_model_ids(model_ids_list): - return [m_id[:-1] if m_id.endswith('_') else m_id for m_id in model_ids_list] + return [m_id[:-1] if m_id.endswith("_") else m_id for m_id in model_ids_list] def get_ir_conversion_frontend(cur_model_name, model_name_list): - ir_conversion_frontend = '' + ir_conversion_frontend = "" idx = 0 for model_name in model_name_list: # idx+1 < len(model_name_list) to avoid out of bounds index of model_name_list @@ -243,21 +279,44 @@ def get_ir_conversion_frontend(cur_model_name, model_name_list): def get_model_precision(model_name_list): precision_list = [ - 'FP32', 'FP16', - 'FP16-INT8', 'INT8', 'INT8_compressed_weights', 'INT8_quantized', 'PT_compressed_weights', - 'OV_FP32-INT8', 'OV_FP16-INT8', - 'OV_FP32-INT8_ASYM', 'OV_FP32-INT8_SYM', 'OV_FP16-INT8_ASYM', 'OV_FP16-INT8_SYM', - 'PT_FP32-INT8', 'PT_FP16-INT8', 'PT_FP32-INT8_ASYM', 'PT_FP32-INT8_SYM', 'PT_FP16-INT8_ASYM', 'PT_FP16-INT8_SYM', - 'GPTQ_INT4-FP32', 'GPTQ_INT4-FP16', 'INT4', - 'OV_FP16-INT4_SYM', 'OV_FP16-INT4_ASYM', 'OV_FP32-INT4_SYM', 'OV_FP32-INT4_ASYM', - 'OV_FP32-4BIT_DEFAULT', 'OV_FP16-4BIT_DEFAULT', 'OV_FP32-4BIT_MAXIMUM', 'OV_FP16-4BIT_MAXIMUM'] - model_precision = 'unknown' + "FP32", + "FP16", + "FP16-INT8", + "INT8", + "INT8_compressed_weights", + "INT8_quantized", + "PT_compressed_weights", + "OV_FP32-INT8", + "OV_FP16-INT8", + "OV_FP32-INT8_ASYM", + "OV_FP32-INT8_SYM", + "OV_FP16-INT8_ASYM", + "OV_FP16-INT8_SYM", + "PT_FP32-INT8", + "PT_FP16-INT8", + "PT_FP32-INT8_ASYM", + "PT_FP32-INT8_SYM", + "PT_FP16-INT8_ASYM", + "PT_FP16-INT8_SYM", + "GPTQ_INT4-FP32", + "GPTQ_INT4-FP16", + "INT4", + "OV_FP16-INT4_SYM", + "OV_FP16-INT4_ASYM", + "OV_FP32-INT4_SYM", + "OV_FP32-INT4_ASYM", + "OV_FP32-4BIT_DEFAULT", + "OV_FP16-4BIT_DEFAULT", + "OV_FP32-4BIT_MAXIMUM", + 
"OV_FP16-4BIT_MAXIMUM", + ] + model_precision = "unknown" # Search from right to left of model path for i in range(len(model_name_list) - 1, -1, -1): for precision in precision_list: if model_name_list[i] == precision: model_precision = precision break - if model_precision != 'unknown': + if model_precision != "unknown": break return model_precision diff --git a/llm_bench/python/llm_bench_utils/nncf_utils.py b/llm_bench/python/llm_bench_utils/nncf_utils.py index b65e90a3a9..b656583ca4 100644 --- a/llm_bench/python/llm_bench_utils/nncf_utils.py +++ b/llm_bench/python/llm_bench_utils/nncf_utils.py @@ -4,7 +4,12 @@ COMPRESSION_OPTIONS = { "INT8": { - "mode": nncf.CompressWeightsMode.INT8 if "INT8_ASYM" not in nncf.CompressWeightsMode.__members__ else nncf.CompressWeightsMode.INT8_ASYM}, + "mode": ( + nncf.CompressWeightsMode.INT8 + if "INT8_ASYM" not in nncf.CompressWeightsMode.__members__ + else nncf.CompressWeightsMode.INT8_ASYM + ) + }, "INT4_SYM": { "mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, @@ -34,4 +39,8 @@ def get_compressed_path(output_dir: str, base_precision, option: str): - return Path(output_dir) / "pytorch/dldt/compressed_weights" / f"OV_{base_precision}-{option}" + return ( + Path(output_dir) + / "pytorch/dldt/compressed_weights" + / f"OV_{base_precision}-{option}" + ) diff --git a/llm_bench/python/llm_bench_utils/output_csv.py b/llm_bench/python/llm_bench_utils/output_csv.py index e01628f098..4efc68abf0 100644 --- a/llm_bench/python/llm_bench_utils/output_csv.py +++ b/llm_bench/python/llm_bench_utils/output_csv.py @@ -9,75 +9,108 @@ def output_comments(result, use_case, writer): for key in result.keys(): - result[key] = '' + result[key] = "" writer.writerow(result) comment_list = [] - if use_case == 'text_gen' or use_case == 'code_gen': - comment_list.append('input_size: Input token size') - comment_list.append('output_size: Text/Code generation models: generated text token size') - comment_list.append("infer_count: Limit the Text/Code generation models' output token size") - comment_list.append('latency: Text/Code generation models: ms/token. 
Output token size / generation time') - comment_list.append('1st_latency: Text/Code generation models: Fisrt token latency') - comment_list.append('2nd_avg_latency: Text/Code generation models: Other tokens (exclude first token) latency') - comment_list.append('1st_infer_latency: Text/Code generation models: Fisrt inference latency') - comment_list.append('2nd_infer_avg_latency: Text/Code generation models: Other inferences (exclude first inference) latency') - comment_list.append('result_md5: MD5 of generated text') - comment_list.append('prompt_idx: Index of prompts') - elif use_case == 'image_gen': - comment_list.append("infer_count: Tex2Image models' Inference(or Sampling) step size") - comment_list.append('1st_latency: First step lantency of unet') - comment_list.append('2nd_avg_latency: Other steps latency of unet(exclude first step)') - comment_list.append('1st_infer_latency: Same as 1st_latency') - comment_list.append('2nd_infer_avg_latency: Same as 2nd_avg_latency') - comment_list.append('prompt_idx: Index of prompts') - elif use_case == 'ldm_super_resolution': - comment_list.append("infer_count: Tex2Image models' Inference(or Sampling) step size") - comment_list.append('1st_latency: First step lantency of unet') - comment_list.append('2nd_avg_latency: Other steps lantency of unet(exclude first step)') - comment_list.append('1st_infer_latency: Same as 1st_latency') - comment_list.append('2nd_infer_avg_latency: Same as 2nd_avg_latency') - comment_list.append('prompt_idx: Image Index') - comment_list.append('tokenization_time: Tokenizer encode time') - comment_list.append('detokenization_time: Tokenizer decode time') - comment_list.append('pretrain_time: Total time of load model and compile model') - comment_list.append('generation_time: Time for one interaction. (e.g. The duration of answering one question or generating one picture)') - comment_list.append('iteration=0: warm-up; iteration=avg: average (exclude warm-up);iteration=mini: minimum value (exclude warm-up);' - 'iteration=median: median value (exclude warm-up);') + if use_case == "text_gen" or use_case == "code_gen": + comment_list.append("input_size: Input token size") + comment_list.append( + "output_size: Text/Code generation models: generated text token size" + ) + comment_list.append( + "infer_count: Limit the Text/Code generation models' output token size" + ) + comment_list.append( + "latency: Text/Code generation models: ms/token. 
Output token size / generation time" + ) + comment_list.append( + "1st_latency: Text/Code generation models: Fisrt token latency" + ) + comment_list.append( + "2nd_avg_latency: Text/Code generation models: Other tokens (exclude first token) latency" + ) + comment_list.append( + "1st_infer_latency: Text/Code generation models: Fisrt inference latency" + ) + comment_list.append( + "2nd_infer_avg_latency: Text/Code generation models: Other inferences (exclude first inference) latency" + ) + comment_list.append("result_md5: MD5 of generated text") + comment_list.append("prompt_idx: Index of prompts") + elif use_case == "image_gen": + comment_list.append( + "infer_count: Tex2Image models' Inference(or Sampling) step size" + ) + comment_list.append("1st_latency: First step lantency of unet") + comment_list.append( + "2nd_avg_latency: Other steps latency of unet(exclude first step)" + ) + comment_list.append("1st_infer_latency: Same as 1st_latency") + comment_list.append("2nd_infer_avg_latency: Same as 2nd_avg_latency") + comment_list.append("prompt_idx: Index of prompts") + elif use_case == "ldm_super_resolution": + comment_list.append( + "infer_count: Tex2Image models' Inference(or Sampling) step size" + ) + comment_list.append("1st_latency: First step lantency of unet") + comment_list.append( + "2nd_avg_latency: Other steps lantency of unet(exclude first step)" + ) + comment_list.append("1st_infer_latency: Same as 1st_latency") + comment_list.append("2nd_infer_avg_latency: Same as 2nd_avg_latency") + comment_list.append("prompt_idx: Image Index") + comment_list.append("tokenization_time: Tokenizer encode time") + comment_list.append("detokenization_time: Tokenizer decode time") + comment_list.append("pretrain_time: Total time of load model and compile model") comment_list.append( - 'max_rss_mem: max rss memory consumption;' + "generation_time: Time for one interaction. (e.g. 
The duration of answering one question or generating one picture)" ) comment_list.append( - 'max_shared_mem: max shared memory consumption;' + "iteration=0: warm-up; iteration=avg: average (exclude warm-up);iteration=mini: minimum value (exclude warm-up);" + "iteration=median: median value (exclude warm-up);" ) + comment_list.append("max_rss_mem: max rss memory consumption;") + comment_list.append("max_shared_mem: max shared memory consumption;") for comments in comment_list: - result['iteration'] = comments + result["iteration"] = comments writer.writerow(result) def output_avg_min_median(iter_data_list): prompt_idxs = [] for iter_data in iter_data_list: - prompt_idxs.append(iter_data['prompt_idx']) + prompt_idxs.append(iter_data["prompt_idx"]) prompt_idxs = list(set(prompt_idxs)) result = {} for prompt_idx in prompt_idxs: same_prompt_datas = [] for iter_data in iter_data_list: - if iter_data['prompt_idx'] == prompt_idx and iter_data['iteration'] > 0: + if iter_data["prompt_idx"] == prompt_idx and iter_data["iteration"] > 0: same_prompt_datas.append(iter_data) - key_word = ['input_size', 'infer_count', 'generation_time', 'output_size', 'latency', 'first_token_latency', 'other_tokens_avg_latency', - 'first_token_infer_latency', 'other_tokens_infer_avg_latency', 'tokenization_time', 'detokenization_time'] + key_word = [ + "input_size", + "infer_count", + "generation_time", + "output_size", + "latency", + "first_token_latency", + "other_tokens_avg_latency", + "first_token_infer_latency", + "other_tokens_infer_avg_latency", + "tokenization_time", + "detokenization_time", + ] if len(same_prompt_datas) > 0: - iters_idx = ['avg', 'mini', 'median'] + iters_idx = ["avg", "mini", "median"] result[prompt_idx] = [copy.deepcopy(same_prompt_datas[0]) for i in range(3)] for i in range(len(iters_idx)): - result[prompt_idx][i]['iteration'] = iters_idx[i] + result[prompt_idx][i]["iteration"] = iters_idx[i] for key in key_word: values = [] for prompt in same_prompt_datas: - if prompt[key] != '': + if prompt[key] != "": values.append(prompt[key]) if len(values) > 0: result[prompt_idx][0][key] = np.mean(values) @@ -106,22 +139,10 @@ def gen_data_to_csv(result, iter_data, pretrain_time): result['output_size'] = iter_data['output_size'] result['latency(ms)'] = round(latency, 5) if latency != '' else latency result['result_md5'] = iter_data['result_md5'] - if first_latency < 0: - result['1st_latency(ms)'] = 'NA' - else: - result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency - if other_latency < 0: - result['2nd_avg_latency(ms)'] = 'NA' - else: - result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency - if first_token_infer_latency < 0: - result['1st_infer_latency(ms)'] = 'NA' - else: - result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency - if other_token_infer_latency < 0: - result['2nd_infer_avg_latency(ms)'] = 'NA' - else: - result['2nd_infer_avg_latency(ms)'] = round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency + result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency + result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency + result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency + result['2nd_infer_avg_latency(ms)'] = 
round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency result['max_rss_mem(MB)'] = round(rss_mem, 5) if rss_mem != '' else rss_mem result['max_uss_mem(MB)'] = round(uss_mem, 5) if uss_mem != '' else uss_mem result['max_shared_mem(MB)'] = round(shared_mem, 5) if shared_mem != '' else shared_mem @@ -130,50 +151,59 @@ def gen_data_to_csv(result, iter_data, pretrain_time): result['detokenization_time'] = round(detoken_time, 5) if detoken_time != '' else detoken_time -def write_result(report_file, model, framework, device, model_args, iter_data_list, pretrain_time, model_precision): +def write_result( + report_file, + model, + framework, + device, + model_args, + iter_data_list, + pretrain_time, + model_precision, +): header = [ - 'iteration', - 'model', - 'framework', - 'device', - 'pretrain_time(s)', - 'input_size', - 'infer_count', - 'generation_time(s)', - 'output_size', - 'latency(ms)', - '1st_latency(ms)', - '2nd_avg_latency(ms)', - 'precision', - 'max_rss_mem(MB)', - 'max_uss_mem(MB)', - 'max_shared_mem(MB)', - 'prompt_idx', - '1st_infer_latency(ms)', - '2nd_infer_avg_latency(ms)', - 'num_beams', - 'batch_size', - 'tokenization_time', - 'detokenization_time', - 'result_md5', + "iteration", + "model", + "framework", + "device", + "pretrain_time(s)", + "input_size", + "infer_count", + "generation_time(s)", + "output_size", + "latency(ms)", + "1st_latency(ms)", + "2nd_avg_latency(ms)", + "precision", + "max_rss_mem(MB)", + "max_uss_mem(MB)", + "max_shared_mem(MB)", + "prompt_idx", + "1st_infer_latency(ms)", + "2nd_infer_avg_latency(ms)", + "num_beams", + "batch_size", + "tokenization_time", + "detokenization_time", + "result_md5", ] out_file = Path(report_file) if len(iter_data_list) > 0: - with open(out_file, 'w+', newline='') as f: + with open(out_file, "w+", newline="") as f: writer = csv.DictWriter(f, header) writer.writeheader() result = {} - result['model'] = model - result['framework'] = framework - result['device'] = device - result['pretrain_time(s)'] = round(pretrain_time, 5) - result['precision'] = model_precision - result['num_beams'] = model_args['num_beams'] - result['batch_size'] = model_args['batch_size'] + result["model"] = model + result["framework"] = framework + result["device"] = device + result["pretrain_time(s)"] = round(pretrain_time, 5) + result["precision"] = model_precision + result["num_beams"] = model_args["num_beams"] + result["batch_size"] = model_args["batch_size"] for i in range(len(iter_data_list)): iter_data = iter_data_list[i] - pre_time = '' if i > 0 else result['pretrain_time(s)'] + pre_time = "" if i > 0 else result["pretrain_time(s)"] gen_data_to_csv(result, iter_data, pre_time) writer.writerow(result) @@ -181,6 +211,6 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li for key in res_data.keys(): for data in res_data[key]: - gen_data_to_csv(result, data, '') + gen_data_to_csv(result, data, "") writer.writerow(result) - output_comments(result, model_args['use_case'], writer) + output_comments(result, model_args["use_case"], writer) diff --git a/llm_bench/python/llm_bench_utils/output_file.py b/llm_bench/python/llm_bench_utils/output_file.py index 8efbb430a7..dbfd93deea 100644 --- a/llm_bench/python/llm_bench_utils/output_file.py +++ b/llm_bench/python/llm_bench_utils/output_file.py @@ -5,62 +5,105 @@ def save_text_to_file(input_text, text_file_name, args): - if args['output_dir'] is not None: - if os.path.exists(args['output_dir']) is False: - os.mkdir(args['output_dir']) - 
out_path = args['output_dir'] + if args["output_dir"] is not None: + if os.path.exists(args["output_dir"]) is False: + os.mkdir(args["output_dir"]) + out_path = args["output_dir"] else: - out_path = '.' + out_path = "." save_path = out_path + os.sep + text_file_name - input_text_file = open(save_path, 'w') + input_text_file = open(save_path, "w") input_text_file.write(input_text) input_text_file.close() def save_image_file(img, img_file_name, args): - if args['output_dir'] is not None: - if os.path.exists(args['output_dir']) is False: - os.mkdir(args['output_dir']) - out_path = args['output_dir'] + if args["output_dir"] is not None: + if os.path.exists(args["output_dir"]) is False: + os.mkdir(args["output_dir"]) + out_path = args["output_dir"] else: - out_path = '.' + out_path = "." save_path = out_path + os.sep + img_file_name img.save(save_path) return save_path -def output_input_text(input_text, args, model_precision, prompt_idx, batchsize_idx, proc_id): - if args['batch_size'] > 1: - text_file_name = args['model_name'] + '_' + model_precision + '_p' + str(prompt_idx) + '_bs' + str(batchsize_idx) +def output_input_text( + input_text, args, model_precision, prompt_idx, batchsize_idx, proc_id +): + if args["batch_size"] > 1: + text_file_name = ( + args["model_name"] + + "_" + + model_precision + + "_p" + + str(prompt_idx) + + "_bs" + + str(batchsize_idx) + ) else: - text_file_name = args['model_name'] + '_' + model_precision + '_p' + str(prompt_idx) - text_file_name = text_file_name + '_pid' + str(proc_id) + '_input.txt' + text_file_name = ( + args["model_name"] + "_" + model_precision + "_p" + str(prompt_idx) + ) + text_file_name = text_file_name + "_pid" + str(proc_id) + "_input.txt" save_text_to_file(input_text, text_file_name, args) def output_image_input_text(input_text, args, prompt_idx, batchsize_idx, proc_id): - if args['batch_size'] > 1 and batchsize_idx is not None: - text_file_name = args['model_name'] + '_p' + str(prompt_idx) + '_bs' + str(batchsize_idx) + if args["batch_size"] > 1 and batchsize_idx is not None: + text_file_name = ( + args["model_name"] + "_p" + str(prompt_idx) + "_bs" + str(batchsize_idx) + ) else: - text_file_name = args['model_name'] + '_p' + str(prompt_idx) - text_file_name = text_file_name + '_pid' + str(proc_id) + '_input.txt' + text_file_name = args["model_name"] + "_p" + str(prompt_idx) + text_file_name = text_file_name + "_pid" + str(proc_id) + "_input.txt" save_text_to_file(input_text, text_file_name, args) -def output_gen_text(generated_text, args, model_precision, prompt_idx, iteration, batchsize_idx, proc_id): - if args['batch_size'] > 1: - text_file_name = args['model_name'] + '_' + model_precision + '_p' + str(prompt_idx) + '_bs' + str(batchsize_idx) +def output_gen_text( + generated_text, args, model_precision, prompt_idx, iteration, batchsize_idx, proc_id +): + if args["batch_size"] > 1: + text_file_name = ( + args["model_name"] + + "_" + + model_precision + + "_p" + + str(prompt_idx) + + "_bs" + + str(batchsize_idx) + ) else: - text_file_name = args['model_name'] + '_' + model_precision + '_p' + str(prompt_idx) - text_file_name = text_file_name + '_iter' + str(iteration) + '_pid' + str(proc_id) + '_output.txt' + text_file_name = ( + args["model_name"] + "_" + model_precision + "_p" + str(prompt_idx) + ) + text_file_name = ( + text_file_name + + "_iter" + + str(iteration) + + "_pid" + + str(proc_id) + + "_output.txt" + ) save_text_to_file(generated_text, text_file_name, args) def output_gen_image(img, args, prompt_idx, iteration, 
batchsize_idx, proc_id, suffix): - if args['batch_size'] > 1 and batchsize_idx is not None: - img_save_name = args['model_name'] + '_p' + str(prompt_idx) + '_bs' + str(batchsize_idx) + if args["batch_size"] > 1 and batchsize_idx is not None: + img_save_name = ( + args["model_name"] + "_p" + str(prompt_idx) + "_bs" + str(batchsize_idx) + ) else: - img_save_name = args['model_name'] + '_p' + str(prompt_idx) - img_save_name = img_save_name + '_iter' + str(iteration) + '_pid' + str(proc_id) + '_output' + suffix + img_save_name = args["model_name"] + "_p" + str(prompt_idx) + img_save_name = ( + img_save_name + + "_iter" + + str(iteration) + + "_pid" + + str(proc_id) + + "_output" + + suffix + ) img_save_path = save_image_file(img, img_save_name, args) return img_save_path diff --git a/llm_bench/python/llm_bench_utils/output_json.py b/llm_bench/python/llm_bench_utils/output_json.py index b50a17f974..911bde6343 100644 --- a/llm_bench/python/llm_bench_utils/output_json.py +++ b/llm_bench/python/llm_bench_utils/output_json.py @@ -1,52 +1,92 @@ import json -def write_result(report_file, model, framework, device, model_args, iter_data_list, pretrain_time, model_precision): - metadata = {'model': model, 'framework': framework, 'device': device, 'precision': model_precision, - 'num_beams': model_args['num_beams'], 'batch_size': model_args['batch_size']} +def write_result( + report_file, + model, + framework, + device, + model_args, + iter_data_list, + pretrain_time, + model_precision, +): + metadata = { + "model": model, + "framework": framework, + "device": device, + "precision": model_precision, + "num_beams": model_args["num_beams"], + "batch_size": model_args["batch_size"], + } result = [] total_iters = len(iter_data_list) for i in range(total_iters): iter_data = iter_data_list[i] - generation_time = iter_data['generation_time'] - latency = iter_data['latency'] - first_latency = iter_data['first_token_latency'] - other_latency = iter_data['other_tokens_avg_latency'] - first_token_infer_latency = iter_data['first_token_infer_latency'] - other_token_infer_latency = iter_data['other_tokens_infer_avg_latency'] - rss_mem = iter_data['max_rss_mem_consumption'] - uss_mem = iter_data['max_uss_mem_consumption'] - shared_mem = iter_data['max_shared_mem_consumption'] - tokenization_time = iter_data['tokenization_time'] - detokenization_time = iter_data['detokenization_time'] + generation_time = iter_data["generation_time"] + latency = iter_data["latency"] + first_latency = iter_data["first_token_latency"] + other_latency = iter_data["other_tokens_avg_latency"] + first_token_infer_latency = iter_data["first_token_infer_latency"] + other_token_infer_latency = iter_data["other_tokens_infer_avg_latency"] + rss_mem = iter_data["max_rss_mem_consumption"] + uss_mem = iter_data["max_uss_mem_consumption"] + shared_mem = iter_data["max_shared_mem_consumption"] + tokenization_time = iter_data["tokenization_time"] + detokenization_time = iter_data["detokenization_time"] result_md5 = [] - for idx_md5 in range(len(iter_data['result_md5'])): - result_md5.append(iter_data['result_md5'][idx_md5]) + for idx_md5 in range(len(iter_data["result_md5"])): + result_md5.append(iter_data["result_md5"][idx_md5]) res_data = { - 'iteration': iter_data['iteration'], - 'input_size': iter_data['input_size'], - 'infer_count': iter_data['infer_count'], - 'generation_time': round(generation_time, 5) if generation_time != '' else generation_time, - 'output_size': iter_data['output_size'], - 'latency': round(latency, 5) if latency != '' else 
latency, - 'result_md5': result_md5, - 'first_latency': round(first_latency, 5) if first_latency != '' else first_latency, - 'second_avg_latency': round(other_latency, 5) if other_latency != '' else other_latency, - 'first_infer_latency': round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency, - 'second_infer_avg_latency': round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency, - 'max_rss_mem': round(rss_mem, 5) if rss_mem != '' else -1, - 'max_uss_mem': round(uss_mem, 5) if uss_mem != '' else -1, - 'max_shared_mem': round(shared_mem, 5) if shared_mem != '' else -1, - 'prompt_idx': iter_data['prompt_idx'], - 'tokenization_time': round(tokenization_time, 5) if tokenization_time != '' else tokenization_time, - 'detokenization_time': round(detokenization_time, 5) if detokenization_time != '' else detokenization_time, + "iteration": iter_data["iteration"], + "input_size": iter_data["input_size"], + "infer_count": iter_data["infer_count"], + "generation_time": ( + round(generation_time, 5) if generation_time != "" else generation_time + ), + "output_size": iter_data["output_size"], + "latency": round(latency, 5) if latency != "" else latency, + "result_md5": result_md5, + "first_latency": ( + round(first_latency, 5) if first_latency != "" else first_latency + ), + "second_avg_latency": ( + round(other_latency, 5) if other_latency != "" else other_latency + ), + "first_infer_latency": ( + round(first_token_infer_latency, 5) + if first_token_infer_latency != "" + else first_token_infer_latency + ), + "second_infer_avg_latency": ( + round(other_token_infer_latency, 5) + if other_token_infer_latency != "" + else other_token_infer_latency + ), + "max_rss_mem": round(rss_mem, 5) if rss_mem != "" else -1, + "max_uss_mem": round(uss_mem, 5) if uss_mem != "" else -1, + "max_shared_mem": round(shared_mem, 5) if shared_mem != "" else -1, + "prompt_idx": iter_data["prompt_idx"], + "tokenization_time": ( + round(tokenization_time, 5) + if tokenization_time != "" + else tokenization_time + ), + "detokenization_time": ( + round(detokenization_time, 5) + if detokenization_time != "" + else detokenization_time + ), } result.append(res_data) - output_result = {'metadata': metadata, "perfdata": {'compile_time': pretrain_time, 'results': result}} + output_result = { + "metadata": metadata, + "perfdata": {"compile_time": pretrain_time, "results": result}, + } - with open(report_file, 'w') as outfile: + with open(report_file, "w") as outfile: json.dump(output_result, outfile) diff --git a/llm_bench/python/llm_bench_utils/ov_model_classes.py b/llm_bench/python/llm_bench_utils/ov_model_classes.py index 0ade0f1299..a3a3bab929 100644 --- a/llm_bench/python/llm_bench_utils/ov_model_classes.py +++ b/llm_bench/python/llm_bench_utils/ov_model_classes.py @@ -38,14 +38,13 @@ def _reshape( if shapes[inputs].rank.get_length() in [2, 3]: shapes[inputs][1] = -1 else: - if '.key' in inputs.get_any_name(): + if ".key" in inputs.get_any_name(): shapes[inputs][3] = -1 elif inputs.get_any_name() != "beam_idx": shapes[inputs][2] = -1 model.reshape(shapes) return model - def forward( self, input_ids: torch.LongTensor, @@ -76,9 +75,10 @@ def forward( else: # Flatten the past_key_values past_key_values = tuple( - past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer + past_key_value + for pkv_per_layer in past_key_values + for past_key_value in pkv_per_layer ) - # Add the past_key_values to the decoder inputs 
inputs = dict(zip(self.key_value_input_names, past_key_values)) @@ -88,7 +88,7 @@ def forward( for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) shape = model_inputs.get_partial_shape() - if self.config.model_type == 'chatglm': + if self.config.model_type == "chatglm": shape[0] = 0 shape[1] = batch_size else: @@ -99,7 +99,9 @@ def forward( shape[3] = 0 else: shape[1] = 0 - inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) + inputs[input_name] = Tensor( + model_inputs.get_element_type(), shape.get_shape() + ) else: # past_key_values are not used explicitly, instead they are handled inside the model if past_key_values is None: @@ -121,7 +123,8 @@ def forward( attention_mask = np.array(attention_mask) else: attention_mask = np.ones( - (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype + (input_ids.shape[0], input_ids.shape[1] + past_len), + dtype=inputs["input_ids"].dtype, ) if "attention_mask" in self.input_names: @@ -138,21 +141,27 @@ def forward( inputs["position_ids"] = position_ids - if hasattr(self, 'next_beam_idx') and "beam_idx" in self.input_names: - inputs['beam_idx'] = self.next_beam_idx + if hasattr(self, "next_beam_idx") and "beam_idx" in self.input_names: + inputs["beam_idx"] = self.next_beam_idx # Run inference self.request.start_async(inputs, share_inputs=True) self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + logits = torch.from_numpy(self.request.get_tensor("logits").data).to( + self.device + ) if not self.stateful: if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + past_key_values = tuple( + self.request.get_tensor(key).data + for key in self.key_value_output_names + ) # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( - past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) + past_key_values[i : i + self.num_pkv] + for i in range(0, len(past_key_values), self.num_pkv) ) else: past_key_values = None @@ -163,9 +172,11 @@ def forward( class OVLDMSuperResolutionPipeline(DiffusionPipeline): def __init__(self, model_path: Path, core: Core, device: str): super().__init__() - self.vqvae = core.compile_model(model_path / 'vqvae.xml', device) - self.unet = core.compile_model(model_path / 'unet.xml', device) - self.scheduler = LMSDiscreteScheduler.from_config(model_path / 'scheduler_config.json') + self.vqvae = core.compile_model(model_path / "vqvae.xml", device) + self.unet = core.compile_model(model_path / "unet.xml", device) + self.scheduler = LMSDiscreteScheduler.from_config( + model_path / "scheduler_config.json" + ) self._unet_output = self.unet.output(0) self._vqvae_output = self.vqvae.output(0) @@ -177,12 +188,12 @@ def __call__( num_inference_steps: Optional[int] = 100, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - output_type: Optional[str] = 'pil', + output_type: Optional[str] = "pil", return_dict: bool = True, tm_list: Optional[List] = None, **kwargs, ) -> Union[Tuple, ImagePipelineOutput]: - r''' + r""" Args: image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the 
@@ -207,7 +218,7 @@ def __call__( Returns: [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - ''' + """ image = image if isinstance(image, PIL.Image.Image): @@ -215,7 +226,9 @@ def __call__( elif isinstance(image, torch.Tensor): batch_size = image.shape[0] else: - raise ValueError(f'`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}') + raise ValueError( + f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}" + ) if isinstance(image, PIL.Image.Image): image = self.preprocess(image) @@ -232,8 +245,8 @@ def __call__( latents = latents * self.scheduler.init_noise_sigma latents = latents.numpy() extra_kwargs = {} - if 'eta' in set(inspect.signature(self.scheduler.step).parameters.keys()): - extra_kwargs['eta'] = eta + if "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()): + extra_kwargs["eta"] = eta for t in timesteps_tensor: # concat latents and low resolution image in the channel dimension. @@ -244,7 +257,9 @@ def __call__( noise_pred = self.unet([latents_input, t])[self._unet_output] tm_list.append(time.perf_counter() - tic) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents))['prev_sample'].numpy() + latents = self.scheduler.step( + torch.from_numpy(noise_pred), t, torch.from_numpy(latents) + )["prev_sample"].numpy() # decode the image latents with the VQVAE tic = time.perf_counter() @@ -253,7 +268,7 @@ def __call__( image = image / 2 + 0.5 image = image.transpose(0, 2, 3, 1) - if output_type == 'pil': + if output_type == "pil": image = self.numpy_to_pil(image) return image @@ -261,7 +276,7 @@ def __call__( def preprocess(image): w, h = image.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - image = image.resize((w, h), resample=PIL_INTERPOLATION['lanczos']) + image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) image = np.array(image).astype(np.float32) / 255.0 image = image[None].transpose(0, 3, 1, 2) image = torch.from_numpy(image) @@ -281,17 +296,19 @@ def __init__( self, model: Model, config: PretrainedConfig = None, - device: str = 'CPU', + device: str = "CPU", dynamic_shapes: bool = True, ov_config: Optional[Dict[str, str]] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - super().__init__(model, config, device, dynamic_shapes, ov_config, model_save_dir, **kwargs) + super().__init__( + model, config, device, dynamic_shapes, ov_config, model_save_dir, **kwargs + ) self.is_v1 = False if not self.stateful and not self.key_value_input_names: self.is_v1 = True - self.key_value_input_names = ['past_key_values'] + self.key_value_input_names = ["past_key_values"] self.key_value_output_names = [o.any_name for o in self.model.outputs[1:]] def prepare_inputs_for_generation( @@ -305,10 +322,12 @@ def prepare_inputs_for_generation( ) -> dict: if not self.is_v1: return super().prepare_inputs_for_generation( - input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, position_ids=position_ids, past=past, - **kwargs + **kwargs, ) batch_size, seq_length = input_ids.shape mask = self.mask_token_id @@ -336,22 +355,31 @@ def prepare_inputs_for_generation( 
context_lengths = [seq.index(self.bos_token_id) for seq in seqs] if self.position_encoding_2d: # position_encoding_2d = True position_ids = torch.tensor( - [[mask_position, seq_length - context_length] for mask_position, context_length in zip(mask_positions, context_lengths)], + [ + [mask_position, seq_length - context_length] + for mask_position, context_length in zip( + mask_positions, context_lengths + ) + ], dtype=torch.long, device=input_ids.device, ).unsqueeze(-1) else: - position_ids = torch.tensor([mask_position for mask_position in mask_positions], dtype=torch.long, device=input_ids.device).unsqueeze(-1) + position_ids = torch.tensor( + [mask_position for mask_position in mask_positions], + dtype=torch.long, + device=input_ids.device, + ).unsqueeze(-1) if past is None: past = self.get_past_key_values(past_key_values) return { - 'input_ids': last_token, - 'past_key_values': past, - 'position_ids': position_ids, - 'attention_mask': attention_mask, - 'use_cache': self.use_cache, - 'token_type_ids': None, + "input_ids": last_token, + "past_key_values": past, + "position_ids": position_ids, + "attention_mask": attention_mask, + "use_cache": self.use_cache, + "token_type_ids": None, } else: # First Step Inference @@ -374,14 +402,16 @@ def prepare_inputs_for_generation( past_key_values = np.zeros((self.num_layers, 2, 0, 1, 32, 128)) # numpy does not support bf16, pretending f16, should change to bf16 if self._pkv_precision == Type.bf16: - past_key_values = Tensor(past_key_values, past_key_values.shape, Type.bf16) + past_key_values = Tensor( + past_key_values, past_key_values.shape, Type.bf16 + ) return { - 'input_ids': input_ids, - 'position_ids': position_ids, - 'attention_mask': attention_mask, - 'past_key_values': past_key_values, - 'use_cache': self.use_cache, - 'token_type_ids': None, + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "use_cache": self.use_cache, + "token_type_ids": None, } def get_masks(self, input_ids, device): @@ -402,14 +432,21 @@ def get_position_ids(self, input_ids, mask_positions, device, use_gmasks=None): use_gmasks = [False] * batch_size context_lengths = [seq.tolist().index(self.bos_token_id) for seq in input_ids] if self.position_encoding_2d: - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + position_ids = ( + torch.arange(seq_length, dtype=torch.long, device=device) + .unsqueeze(0) + .repeat(batch_size, 1) + ) for i, context_length in enumerate(context_lengths): position_ids[i, context_length:] = mask_positions[i] block_position_ids = [ torch.cat( ( torch.zeros(context_length, dtype=torch.long, device=device), - torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1, + torch.arange( + seq_length - context_length, dtype=torch.long, device=device + ) + + 1, ) ) for context_length in context_lengths @@ -417,7 +454,11 @@ def get_position_ids(self, input_ids, mask_positions, device, use_gmasks=None): block_position_ids = torch.stack(block_position_ids, dim=0) position_ids = torch.stack((position_ids, block_position_ids), dim=1) else: - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + position_ids = ( + torch.arange(seq_length, dtype=torch.long, device=device) + .unsqueeze(0) + .repeat(batch_size, 1) + ) for i, context_length in enumerate(context_lengths): if not use_gmasks[i]: position_ids[context_length:] = mask_positions[i] @@ 
-440,34 +481,46 @@ def forward( past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, **kwargs, ) -> CausalLMOutputWithPast: - + if not self.is_v1: - return super().forward(input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, **kwargs) + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + **kwargs, + ) self.compile() inputs = {} if past_key_values is not None: - inputs['past_key_values'] = past_key_values - inputs['input_ids'] = np.array(input_ids) + inputs["past_key_values"] = past_key_values + inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed - if 'attention_mask' in self.input_names and attention_mask is not None: - inputs['attention_mask'] = np.array(attention_mask) + if "attention_mask" in self.input_names and attention_mask is not None: + inputs["attention_mask"] = np.array(attention_mask) - if 'position_ids' in kwargs and kwargs['position_ids'] is not None: - inputs['position_ids'] = np.array(kwargs['position_ids']) + if "position_ids" in kwargs and kwargs["position_ids"] is not None: + inputs["position_ids"] = np.array(kwargs["position_ids"]) # Run inference self.request.start_async(inputs, share_inputs=True) self.request.wait() - logits = torch.from_numpy(self.request.get_tensor('logits').data).to(self.device) + logits = torch.from_numpy(self.request.get_tensor("logits").data).to( + self.device + ) if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + past_key_values = tuple( + self.request.get_tensor(key).data for key in self.key_value_output_names + ) # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) - past_key_values = tuple(past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv)) + past_key_values = tuple( + past_key_values[i : i + self.num_pkv] + for i in range(0, len(past_key_values), self.num_pkv) + ) else: past_key_values = None diff --git a/llm_bench/python/llm_bench_utils/ov_utils.py b/llm_bench/python/llm_bench_utils/ov_utils.py index b9434c5f3d..99fe389031 100644 --- a/llm_bench/python/llm_bench_utils/ov_utils.py +++ b/llm_bench/python/llm_bench_utils/ov_utils.py @@ -10,35 +10,53 @@ import time import types from llm_bench_utils.hook_common import get_bench_hook -from llm_bench_utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES +from llm_bench_utils.config_class import ( + OV_MODEL_CLASSES_MAPPING, + TOKENIZE_CLASSES_MAPPING, + DEFAULT_MODEL_CLASSES, +) import openvino.runtime.opset13 as opset def generate_simplified(self, *args, **kwargs): if len(args): - raise Exception(f'Not empty args is not supported in generate_simplified, given: {args}') + raise Exception( + f"Not empty args is not supported in generate_simplified, given: {args}" + ) # TODO: Check other ignored parameters and report about them - log.warning('Termination criteria is not supported in overridden generate, max_new_tokens only matters') + log.warning( + "Termination criteria is not supported in overridden generate, max_new_tokens only matters" + ) # TODO: Check if unsupported kwargs are provided - input_ids = kwargs['input_ids'] - attention_mask = kwargs['attention_mask'] + input_ids = kwargs["input_ids"] + attention_mask = 
kwargs["attention_mask"] - assert kwargs['num_beams'] == 1, "Overridden generate doesn't support num_beams > 1" + assert kwargs["num_beams"] == 1, "Overridden generate doesn't support num_beams > 1" past_key_values = None - for _i in range(kwargs['max_new_tokens']): - outputs = self(input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, use_cache=True) + for _i in range(kwargs["max_new_tokens"]): + outputs = self( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=True, + ) - next_tokens = outputs.logits # logits is an old name from original model, when interprocessing is fused it is a token + next_tokens = ( + outputs.logits + ) # logits is an old name from original model, when interprocessing is fused it is a token # TODO: Apply termination criteria in addition to max_new_tokens # TODO: Doing the cat with input_ids here, we will 'uncat' it later in the next forward, # avoid doing it by passible next_tokens (without cat) directly to the next forward input_ids = torch.cat([input_ids, next_tokens], dim=-1) - attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1) + attention_mask = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], + dim=-1, + ) # Depending on whether we are in stateful mode, past_key_values may or may not represent meaningful values, # need to pass them anyway to identify the first iteration past_key_values = outputs.past_key_values @@ -50,14 +68,20 @@ def patch_decoding_strategy(hf_model, patch_methods, **kwargs): """Fuse post-processing as an extra ops into a model.""" ov_model = hf_model.model - if kwargs.get('fuse_decoding_strategy', False): + if kwargs.get("fuse_decoding_strategy", False): ppp = ov.preprocess.PrePostProcessor(ov_model) - assert kwargs['num_beams'] == 1, "Parameter fuse_decoding_strategy doesn't support beam_search, set num_beams to 1" + assert ( + kwargs["num_beams"] == 1 + ), "Parameter fuse_decoding_strategy doesn't support beam_search, set num_beams to 1" def greedy_search(input_port): - next_token = opset.gather(input_port, opset.constant(-1), opset.constant(1)) # take last logits only (makes sense at the first iteration only) - topk = opset.topk(next_token, opset.constant(1), axis=-1, mode='max', sort='none').output(1) + next_token = opset.gather( + input_port, opset.constant(-1), opset.constant(1) + ) # take last logits only (makes sense at the first iteration only) + topk = opset.topk( + next_token, opset.constant(1), axis=-1, mode="max", sort="none" + ).output(1) return topk ppp.output(0).postprocess().custom(greedy_search) @@ -70,9 +94,9 @@ def greedy_search(input_port): def save_model(hf_model, **kwargs): - xml_file_name = kwargs['save_prepared_model'] + xml_file_name = kwargs["save_prepared_model"] if xml_file_name is not None: - log.info(f'Saving prepared OpenVINO model to {xml_file_name} ...') + log.info(f"Saving prepared OpenVINO model to {xml_file_name} ...") ov.save_model(hf_model.model, xml_file_name) @@ -89,7 +113,9 @@ def build_ov_tokenizer(hf_tokenizer): log.warn("OV Tokenizer is unavailable, tokenizer conversion will be skipped") return hf_tokenizer - ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) + ov_tokenizer, ov_detokenizer = convert_tokenizer( + hf_tokenizer, with_detokenizer=True + ) return build_ov_tokenizer_wrapper(hf_tokenizer, ov_tokenizer, ov_detokenizer) @@ -115,7 +141,9 @@ def decode_ov_tokenizer(self, token_ids, 
*args, **kwargs): hf_tokenizer.encode = types.MethodType(encode_ov_tokenizer, hf_tokenizer) hf_tokenizer.__call__ = types.MethodType(encode_ov_tokenizer_full, hf_tokenizer) - hf_tokenizer.batch_decode = types.MethodType(batch_decode_ov_tokenizer, hf_tokenizer) + hf_tokenizer.batch_decode = types.MethodType( + batch_decode_ov_tokenizer, hf_tokenizer + ) hf_tokenizer.decode = types.MethodType(decode_ov_tokenizer, hf_tokenizer) return hf_tokenizer @@ -127,32 +155,42 @@ def create_text_gen_model(model_path, device, **kwargs): - device: can be CPU or GPU - model_type: """ - default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']] - model_type = kwargs.get('model_type', default_model_type) - model_class = OV_MODEL_CLASSES_MAPPING.get(model_type, OV_MODEL_CLASSES_MAPPING[default_model_type]) - token_class = TOKENIZE_CLASSES_MAPPING.get(model_type, TOKENIZE_CLASSES_MAPPING[default_model_type]) + default_model_type = DEFAULT_MODEL_CLASSES[kwargs["use_case"]] + model_type = kwargs.get("model_type", default_model_type) + model_class = OV_MODEL_CLASSES_MAPPING.get( + model_type, OV_MODEL_CLASSES_MAPPING[default_model_type] + ) + token_class = TOKENIZE_CLASSES_MAPPING.get( + model_type, TOKENIZE_CLASSES_MAPPING[default_model_type] + ) model_path = Path(model_path) # specify the model path - if model_path.name.endswith('xml'): + if model_path.name.endswith("xml"): model_path = model_path.parents[2] - ov_config = kwargs['config'] + ov_config = kwargs["config"] model_path_existed = Path(model_path).exists() # load model if not model_path_existed: - raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') + raise RuntimeError(f"==Failure ==: model path:{model_path} does not exist") else: if kwargs.get("genai", False) and is_genai_available(log_msg=True): - if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"], OV_MODEL_CLASSES_MAPPING["chatglm"]]: + if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"]]: log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. 
Will be switched to default bencmarking") else: - return create_genai_text_gen_model(model_path, device, ov_config, **kwargs) + return create_genai_text_gen_model( + model_path, device, ov_config, **kwargs + ) remote_code = False try: - model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=False) + model_config = AutoConfig.from_pretrained( + model_path, trust_remote_code=False + ) except Exception: - model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model_config = AutoConfig.from_pretrained( + model_path, trust_remote_code=True + ) remote_code = True start = time.perf_counter() ov_model = model_class.from_pretrained( @@ -161,14 +199,14 @@ def create_text_gen_model(model_path, device, **kwargs): ov_config=ov_config, config=model_config, stateful=kwargs.get("stateful", None), - trust_remote_code=remote_code + trust_remote_code=remote_code, ) - if not isinstance(ov_model, OV_MODEL_CLASSES_MAPPING['t5']): + if not isinstance(ov_model, OV_MODEL_CLASSES_MAPPING["t5"]): patch_inter_processing_and_compile(ov_model, **kwargs) end = time.perf_counter() - bench_hook = get_bench_hook(kwargs['num_beams'], ov_model) + bench_hook = get_bench_hook(kwargs["num_beams"], ov_model) from_pretrained_time = end - start - log.info(f'From pretrained time: {from_pretrained_time:.2f}s') + log.info(f"From pretrained time: {from_pretrained_time:.2f}s") # load token tokenizer = token_class.from_pretrained(model_path, trust_remote_code=True) if kwargs.get("convert_tokenizer", False): @@ -181,7 +219,10 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): import openvino_genai from transformers import AutoTokenizer - if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists(): + if ( + not (model_path / "openvino_tokenizer.xml").exists() + or not (model_path / "openvino_detokenizer.xml").exists() + ): convert_ov_tokenizer(model_path) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) @@ -200,7 +241,7 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): start = time.perf_counter() llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper(), ov_config) end = time.perf_counter() - log.info(f'Pipeline initialization time: {end - start:.2f}s') + log.info(f"Pipeline initialization time: {end - start:.2f}s") class TokenStreamer(openvino_genai.StreamerBase): def __init__(self, tokenizer): @@ -239,45 +280,50 @@ def convert_ov_tokenizer(tokenizer_path): def create_image_gen_model(model_path, device, **kwargs): - default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']] - model_type = kwargs.get('model_type', default_model_type) + default_model_type = DEFAULT_MODEL_CLASSES[kwargs["use_case"]] + model_type = kwargs.get("model_type", default_model_type) model_class = OV_MODEL_CLASSES_MAPPING[model_type] model_path = Path(model_path) - ov_config = kwargs['config'] + ov_config = kwargs["config"] if not Path(model_path).exists(): - raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') + raise RuntimeError(f"==Failure ==: model path:{model_path} does not exist") else: start = time.perf_counter() - ov_model = model_class.from_pretrained(model_path, device=device, ov_config=ov_config) + ov_model = model_class.from_pretrained( + model_path, device=device, ov_config=ov_config + ) end = time.perf_counter() from_pretrained_time = end - start - log.info(f'From pretrained time: {from_pretrained_time:.2f}s') + log.info(f"From pretrained 
time: {from_pretrained_time:.2f}s") return ov_model, from_pretrained_time def create_ldm_super_resolution_model(model_path, device, **kwargs): core = Core() - ov_config = kwargs['config'] + ov_config = kwargs["config"] core.set_property(ov_config) - default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']] - model_type = kwargs.get('model_type', default_model_type) + default_model_type = DEFAULT_MODEL_CLASSES[kwargs["use_case"]] + model_type = kwargs.get("model_type", default_model_type) model_class = OV_MODEL_CLASSES_MAPPING[model_type] model_path = Path(model_path) start = time.perf_counter() ov_model = model_class(model_path, core, device.upper()) end = time.perf_counter() from_pretrained_time = end - start - log.info(f'From pretrained time: {from_pretrained_time:.2f}s') + log.info(f"From pretrained time: {from_pretrained_time:.2f}s") return ov_model, from_pretrained_time def is_genai_available(log_msg=False): import importlib + try: - importlib.import_module('openvino_genai') + importlib.import_module("openvino_genai") except ImportError as ex: if log_msg: - log.warning("Attempt to load OpenVINO GenaAI package failed. Please install openvino_genai package. Full error message available in debug mode") + log.warning( + "Attempt to load OpenVINO GenaAI package failed. Please install openvino_genai package. Full error message available in debug mode" + ) log.warning(ex) return False return True diff --git a/llm_bench/python/llm_bench_utils/pt_utils.py b/llm_bench/python/llm_bench_utils/pt_utils.py index d9f530a179..518884898c 100644 --- a/llm_bench/python/llm_bench_utils/pt_utils.py +++ b/llm_bench/python/llm_bench_utils/pt_utils.py @@ -3,7 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 from pathlib import Path import torch -from llm_bench_utils.config_class import PT_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES +from llm_bench_utils.config_class import ( + PT_MODEL_CLASSES_MAPPING, + TOKENIZE_CLASSES_MAPPING, + DEFAULT_MODEL_CLASSES, +) import os import time import logging as log @@ -13,41 +17,73 @@ def set_bf16(model, device, **kwargs): try: - if len(kwargs['config']) > 0 and kwargs['config'].get('PREC_BF16') and kwargs['config']['PREC_BF16'] is True: + if ( + len(kwargs["config"]) > 0 + and kwargs["config"].get("PREC_BF16") + and kwargs["config"]["PREC_BF16"] is True + ): model = model.to(device.lower(), dtype=torch.bfloat16) - log.info('Set inference precision to bf16') + log.info("Set inference precision to bf16") except Exception: - log.error('Catch exception for setting inference precision to bf16.') - raise RuntimeError('Set prec_bf16 fail.') + log.error("Catch exception for setting inference precision to bf16.") + raise RuntimeError("Set prec_bf16 fail.") return model -def torch_compile_child_module(model, child_modules, backend='openvino', dynamic=None, options=None): +def torch_compile_child_module( + model, child_modules, backend="openvino", dynamic=None, options=None +): if len(child_modules) == 1: - setattr(model, child_modules[0], torch.compile(getattr(model, child_modules[0]), backend=backend, dynamic=dynamic, fullgraph=True, options=options)) + setattr( + model, + child_modules[0], + torch.compile( + getattr(model, child_modules[0]), + backend=backend, + dynamic=dynamic, + fullgraph=True, + options=options, + ), + ) return model - setattr(model, child_modules[0], torch_compile_child_module(getattr(model, child_modules[0]), child_modules[1:], backend, dynamic, options)) + setattr( + model, + child_modules[0], + torch_compile_child_module( + 
getattr(model, child_modules[0]), + child_modules[1:], + backend, + dynamic, + options, + ), + ) return model -def run_torch_compile(model, backend='openvino', dynamic=None, options=None, child_modules=None): - if backend == 'pytorch': - log.info(f'Running torch.compile() with {backend} backend') +def run_torch_compile( + model, backend="openvino", dynamic=None, options=None, child_modules=None +): + if backend == "pytorch": + log.info(f"Running torch.compile() with {backend} backend") start = time.perf_counter() compiled_model = torch.compile(model) end = time.perf_counter() compile_time = end - start - log.info(f'Compiling model via torch.compile() took: {compile_time}') + log.info(f"Compiling model via torch.compile() took: {compile_time}") else: - log.info(f'Running torch.compile() with {backend} backend') + log.info(f"Running torch.compile() with {backend} backend") start = time.perf_counter() if child_modules and len(child_modules) > 0: - compiled_model = torch_compile_child_module(model, child_modules, backend, dynamic, options) + compiled_model = torch_compile_child_module( + model, child_modules, backend, dynamic, options + ) else: - compiled_model = torch.compile(model, backend=backend, dynamic=dynamic, options=options) + compiled_model = torch.compile( + model, backend=backend, dynamic=dynamic, options=options + ) end = time.perf_counter() compile_time = end - start - log.info(f'Compiling model via torch.compile() took: {compile_time}') + log.info(f"Compiling model via torch.compile() took: {compile_time}") return compiled_model @@ -56,67 +92,90 @@ def create_text_gen_model(model_path, device, **kwargs): from_pretrain_time = 0 if model_path.exists(): if model_path.is_dir() and len(os.listdir(model_path)) != 0: - log.info(f'Load text model from model path:{model_path}') - default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']] - model_type = kwargs.get('model_type', default_model_type) - model_class = PT_MODEL_CLASSES_MAPPING.get(model_type, PT_MODEL_CLASSES_MAPPING[default_model_type]) - token_class = TOKENIZE_CLASSES_MAPPING.get(model_type, TOKENIZE_CLASSES_MAPPING[default_model_type]) + log.info(f"Load text model from model path:{model_path}") + default_model_type = DEFAULT_MODEL_CLASSES[kwargs["use_case"]] + model_type = kwargs.get("model_type", default_model_type) + model_class = PT_MODEL_CLASSES_MAPPING.get( + model_type, PT_MODEL_CLASSES_MAPPING[default_model_type] + ) + token_class = TOKENIZE_CLASSES_MAPPING.get( + model_type, TOKENIZE_CLASSES_MAPPING[default_model_type] + ) start = time.perf_counter() - if model_type == 'chatglm': - model = model_class.from_pretrained(model_path, trust_remote_code=True).to('cpu', dtype=float) + if model_type == "chatglm": + model = model_class.from_pretrained( + model_path, trust_remote_code=True + ).to("cpu", dtype=float) else: model = model_class.from_pretrained(model_path, trust_remote_code=True) tokenizer = token_class.from_pretrained(model_path, trust_remote_code=True) end = time.perf_counter() from_pretrain_time = end - start else: - raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty') + raise RuntimeError( + f"==Failure ==: model path:{model_path} is not directory or directory is empty" + ) else: - raise RuntimeError(f'==Failure ==: model path:{model_path} is not exist') + raise RuntimeError(f"==Failure ==: model path:{model_path} is not exist") - log.info(f'model path:{model_path}, from pretrained time: {from_pretrain_time:.2f}s') + log.info( + f"model path:{model_path}, from 
pretrained time: {from_pretrain_time:.2f}s" + ) if device is not None: - gptjfclm = 'transformers.models.gptj.modeling_gptj.GPTJForCausalLM' - lfclm = 'transformers.models.llama.modeling_llama.LlamaForCausalLM' - bfclm = 'transformers.models.bloom.modeling_bloom.BloomForCausalLM' - gpt2lmhm = 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel' - gptneoxclm = 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM' - chatglmfcg = 'transformers_modules.pytorch_original.modeling_chatglm.ChatGLMForConditionalGeneration' + gptjfclm = "transformers.models.gptj.modeling_gptj.GPTJForCausalLM" + lfclm = "transformers.models.llama.modeling_llama.LlamaForCausalLM" + bfclm = "transformers.models.bloom.modeling_bloom.BloomForCausalLM" + gpt2lmhm = "transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel" + gptneoxclm = "transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM" + chatglmfcg = "transformers_modules.pytorch_original.modeling_chatglm.ChatGLMForConditionalGeneration" real_base_model_name = str(type(model)).lower() - log.info(f'Real base model={real_base_model_name}') + log.info(f"Real base model={real_base_model_name}") # bfclm will trigger generate crash. # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch - if device.upper() == 'GPU': - device = torch.device('cuda') if torch.cuda.is_available() else log.info('CUDA device is unavailable') + if device.upper() == "GPU": + device = ( + torch.device("cuda") + if torch.cuda.is_available() + else log.info("CUDA device is unavailable") + ) else: device = torch.device(device.lower()) - log.info(f'Torch device was set to: {device}') + log.info(f"Torch device was set to: {device}") - if any(x in real_base_model_name for x in [gptjfclm, lfclm, bfclm, gpt2lmhm, gptneoxclm, chatglmfcg]): + if any( + x in real_base_model_name + for x in [gptjfclm, lfclm, bfclm, gpt2lmhm, gptneoxclm, chatglmfcg] + ): model = set_bf16(model, device, **kwargs) else: - if len(kwargs['config']) > 0 and kwargs['config'].get('PREC_BF16') and kwargs['config']['PREC_BF16'] is True: - log.info('Param [bf16/prec_bf16] will not work.') + if ( + len(kwargs["config"]) > 0 + and kwargs["config"].get("PREC_BF16") + and kwargs["config"]["PREC_BF16"] is True + ): + log.info("Param [bf16/prec_bf16] will not work.") model.to(device) else: - raise RuntimeError('==Failure ==: no device to load') + raise RuntimeError("==Failure ==: no device to load") - bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], model) + bench_hook = hook_common.get_bench_hook(kwargs["num_beams"], model) - if kwargs['torch_compile_backend']: - backend = kwargs['torch_compile_backend'] + if kwargs["torch_compile_backend"]: + backend = kwargs["torch_compile_backend"] dynamic = None options = None child_modules = None - if kwargs['torch_compile_dynamic']: - dynamic = kwargs['torch_compile_dynamic'] - if kwargs['torch_compile_options']: - options = json.loads(kwargs['torch_compile_options']) - if kwargs['torch_compile_input_module']: - child_modules = kwargs['torch_compile_input_module'].split(".") - compiled_model = run_torch_compile(model, backend, dynamic, options, child_modules) + if kwargs["torch_compile_dynamic"]: + dynamic = kwargs["torch_compile_dynamic"] + if kwargs["torch_compile_options"]: + options = json.loads(kwargs["torch_compile_options"]) + if kwargs["torch_compile_input_module"]: + child_modules = kwargs["torch_compile_input_module"].split(".") + compiled_model = run_torch_compile( + model, backend, dynamic, 
options, child_modules + ) model = compiled_model return model, tokenizer, from_pretrain_time, bench_hook, False @@ -126,34 +185,42 @@ def create_image_gen_model(model_path, device, **kwargs): from_pretrain_time = 0 if model_path.exists(): if model_path.is_dir() and len(os.listdir(model_path)) != 0: - log.info(f'Load image model from model path:{model_path}') - model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']] + log.info(f"Load image model from model path:{model_path}") + model_type = DEFAULT_MODEL_CLASSES[kwargs["use_case"]] model_class = PT_MODEL_CLASSES_MAPPING[model_type] start = time.perf_counter() pipe = model_class.from_pretrained(model_path) end = time.perf_counter() from_pretrain_time = end - start else: - raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty') + raise RuntimeError( + f"==Failure ==: model path:{model_path} is not directory or directory is empty" + ) else: - raise RuntimeError(f'==Failure ==: model path:{model_path} is not exist') + raise RuntimeError(f"==Failure ==: model path:{model_path} is not exist") - log.info(f'Model path:{model_path}, from pretrained time: {from_pretrain_time:.2f}s') + log.info( + f"Model path:{model_path}, from pretrained time: {from_pretrain_time:.2f}s" + ) if device: # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch - if device.upper() == 'GPU': - device = torch.device('cuda') if torch.cuda.is_available() else log.info('CUDA device is unavailable') + if device.upper() == "GPU": + device = ( + torch.device("cuda") + if torch.cuda.is_available() + else log.info("CUDA device is unavailable") + ) else: device = torch.device(device.lower()) - log.info(f'Torch device was set to: {device}') + log.info(f"Torch device was set to: {device}") pipe.to(device) else: - raise RuntimeError('==Failure ==: no device to load') + raise RuntimeError("==Failure ==: no device to load") - if kwargs['torch_compile_backend']: - backend = kwargs['torch_compile_backend'] + if kwargs["torch_compile_backend"]: + backend = kwargs["torch_compile_backend"] compiled_model = run_torch_compile(pipe, backend) pipe = compiled_model return pipe, from_pretrain_time @@ -164,34 +231,42 @@ def create_ldm_super_resolution_model(model_path, device, **kwargs): from_pretrain_time = 0 if model_path.exists(): if model_path.is_dir() and len(os.listdir(model_path)) != 0: - log.info(f'Load image model from model path:{model_path}') - model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']] + log.info(f"Load image model from model path:{model_path}") + model_type = DEFAULT_MODEL_CLASSES[kwargs["use_case"]] model_class = PT_MODEL_CLASSES_MAPPING[model_type] start = time.perf_counter() pipe = model_class.from_pretrained(model_path) end = time.perf_counter() from_pretrain_time = end - start else: - raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty') + raise RuntimeError( + f"==Failure ==: model path:{model_path} is not directory or directory is empty" + ) else: - raise RuntimeError(f'==Failure ==: model path:{model_path} is not exist') + raise RuntimeError(f"==Failure ==: model path:{model_path} is not exist") - log.info(f'Model path:{model_path}, from pretrained time: {from_pretrain_time:.2f}s') + log.info( + f"Model path:{model_path}, from pretrained time: {from_pretrain_time:.2f}s" + ) if device: # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch - if device.upper() == 'GPU': - 
device = torch.device('cuda') if torch.cuda.is_available() else log.info('CUDA device is unavailable') + if device.upper() == "GPU": + device = ( + torch.device("cuda") + if torch.cuda.is_available() + else log.info("CUDA device is unavailable") + ) else: device = torch.device(device.lower()) - log.info(f'Torch device was set to: {device}') + log.info(f"Torch device was set to: {device}") pipe.to(device) else: - raise RuntimeError('==Failure ==: no device to load') + raise RuntimeError("==Failure ==: no device to load") - if kwargs['torch_compile_backend']: - backend = kwargs['torch_compile_backend'] + if kwargs["torch_compile_backend"]: + backend = kwargs["torch_compile_backend"] compiled_model = run_torch_compile(pipe, backend) pipe = compiled_model return pipe, from_pretrain_time diff --git a/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py b/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py index 12fc726f38..a7bb54f9dc 100644 --- a/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py +++ b/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py @@ -6,7 +6,13 @@ from whowhatbench.wwb import load_dataset from optimum.intel.openvino import OVModelForCausalLM -from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationConfig, CacheEvictionConfig, AggregationMode +from openvino_genai import ( + ContinuousBatchingPipeline, + SchedulerConfig, + GenerationConfig, + CacheEvictionConfig, + AggregationMode, +) from openvino_tokenizers import convert_tokenizer from openvino import serialize @@ -18,12 +24,16 @@ MAX_SEQUENCES = 100 -model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) +model = OVModelForCausalLM.from_pretrained( + model_id, export=True, trust_remote_code=True +) tokenizer = AutoTokenizer.from_pretrained(model_id) model_path = PosixPath(tempfile.gettempdir()) / model_id model.save_pretrained(model_path) -ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True) +ov_tokenizer, ov_detokenizer = convert_tokenizer( + tokenizer, with_detokenizer=True, skip_special_tokens=True +) serialize(ov_tokenizer, model_path / "openvino_tokenizer.xml") serialize(ov_detokenizer, model_path / "openvino_detokenizer.xml") @@ -48,24 +58,39 @@ generation_config.num_return_sequences = 1 generation_config.max_new_tokens = MAX_NEW_TOKENS -data = load_dataset(path='squad', name=None, split='validation')["context"] +data = load_dataset(path="squad", name=None, split="validation")["context"] data_dict = {"questions": list(dict({k: None for k in data}).keys())[:MAX_SEQUENCES]} -model_cb_noopt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_noopt, "CPU", {}) -model_cb_opt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {}) +model_cb_noopt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config_noopt, "CPU", {} +) +model_cb_opt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {} +) -GT_DATA_FILE = 'gt_data.csv' +GT_DATA_FILE = "gt_data.csv" if os.path.exists(GT_DATA_FILE): - evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, gt_data=GT_DATA_FILE, tokenizer=tokenizer, - test_data=data_dict, generation_config=generation_config, - max_new_tokens=MAX_NEW_TOKENS, seqs_per_request=3) + evaluator = whowhatbench.Evaluator( + base_model=model_cb_noopt, + gt_data=GT_DATA_FILE, + 
tokenizer=tokenizer, + test_data=data_dict, + generation_config=generation_config, + max_new_tokens=MAX_NEW_TOKENS, + seqs_per_request=3, + ) else: - evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict, - generation_config=generation_config, max_new_tokens=MAX_NEW_TOKENS, - seqs_per_request=3) - evaluator.dump_gt('gt_data.csv') + evaluator = whowhatbench.Evaluator( + base_model=model_cb_noopt, + tokenizer=tokenizer, + test_data=data_dict, + generation_config=generation_config, + max_new_tokens=MAX_NEW_TOKENS, + seqs_per_request=3, + ) + evaluator.dump_gt("gt_data.csv") all_metrics_per_question, all_metrics = evaluator.score(model_cb_opt) @@ -89,8 +114,18 @@ pipeline_opt_metrics = model_cb_opt.get_metrics() pipeline_noopt_metrics = model_cb_noopt.get_metrics() -print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}") -print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}") -max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage) -avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage) -print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x") +print( + f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}" +) +print( + f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}" +) +max_optimization_ratio = ( + pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage +) +avg_optimization_ratio = ( + pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage +) +print( + f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x" +) diff --git a/llm_bench/python/who_what_benchmark/tests/test_cli.py b/llm_bench/python/who_what_benchmark/tests/test_cli.py index 8110e98335..1a8c91d0bd 100644 --- a/llm_bench/python/who_what_benchmark/tests/test_cli.py +++ b/llm_bench/python/who_what_benchmark/tests/test_cli.py @@ -16,11 +16,7 @@ def run_wwb(args): logger.info(" ".join(["wwb"] + args)) - result = subprocess.run( - ["wwb"] + args, - capture_output=True, - text=True - ) + result = subprocess.run(["wwb"] + args, capture_output=True, text=True) logger.info(result) return result @@ -55,12 +51,18 @@ def teardown_module(): def test_target_model(): - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU" - ]) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + ] + ) assert result.returncode == 0 assert "Metrics for model" in result.stderr @@ -72,15 +74,24 @@ def test_gt_data(): with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: temp_file_name = tmpfile.name - result = run_wwb([ - "--base-model", base_model_path, - "--gt-data", temp_file_name, - "--dataset", "EleutherAI/lambada_openai,en", - "--dataset-field", "text", - "--split", "test", - "--num-samples", "2", - "--device", "CPU" - ]) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--gt-data", + temp_file_name, + "--dataset", + "EleutherAI/lambada_openai,en", + "--dataset-field", + "text", + "--split", + "test", + "--num-samples", + 
"2", + "--device", + "CPU", + ] + ) data = pd.read_csv(temp_file_name) os.remove(temp_file_name) @@ -90,13 +101,20 @@ def test_gt_data(): def test_output_directory(): with tempfile.TemporaryDirectory() as temp_dir: - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU", - "--output", temp_dir - ]) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--output", + temp_dir, + ] + ) assert result.returncode == 0 assert "Metrics for model" in result.stderr assert os.path.exists(os.path.join(temp_dir, "metrics_per_qustion.csv")) @@ -104,13 +122,19 @@ def test_output_directory(): def test_verbose(): - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU", - "--verbose" - ]) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--verbose", + ] + ) assert result.returncode == 0 assert "## Diff " in result.stderr @@ -119,12 +143,18 @@ def test_language_autodetect(): with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: temp_file_name = tmpfile.name - result = run_wwb([ - "--base-model", "Qwen/Qwen2-0.5B", - "--gt-data", temp_file_name, - "--num-samples", "2", - "--device", "CPU" - ]) + result = run_wwb( + [ + "--base-model", + "Qwen/Qwen2-0.5B", + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + ] + ) data = pd.read_csv(temp_file_name) os.remove(temp_file_name) @@ -136,13 +166,19 @@ def test_hf_model(): with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: temp_file_name = tmpfile.name - result = run_wwb([ - "--base-model", model_id, - "--gt-data", temp_file_name, - "--num-samples", "2", - "--device", "CPU", - "--hf" - ]) + result = run_wwb( + [ + "--base-model", + model_id, + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + "--hf", + ] + ) data = pd.read_csv(temp_file_name) os.remove(temp_file_name) @@ -151,13 +187,19 @@ def test_hf_model(): def test_genai_model(): - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU", - "--genai" - ]) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--genai", + ] + ) assert result.returncode == 0 assert "Metrics for model" in result.stderr assert "## Reference text" not in result.stderr diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py b/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py index 86f428ddd7..ea15f8285f 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py @@ -1,4 +1,5 @@ """Who what benchmark APIs.""" + from .evaluator import Evaluator __all__ = ["Evaluator"] diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py b/llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py index bb0d17e34e..b520cd8387 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py @@ -6,7 +6,7 @@ from .whowhat_metrics import DivergencyMetric, SimilarityMetric default_data = { - "en" : { + "en": { "questions": [ "Who is Mark 
Twain?", "Who is William Shakespeare?", @@ -43,8 +43,7 @@ "谁是威廉-莎士比亚?", "阿加莎-克里斯蒂是谁?", "芭芭拉-卡特兰是谁?", - "丹妮尔-斯蒂尔是谁?" - "谁是哈罗德-罗宾斯?", + "丹妮尔-斯蒂尔是谁?" "谁是哈罗德-罗宾斯?", "乔治-西默农是谁?", "伊妮德-布莱顿是谁?", "西德尼-谢尔顿是谁?", @@ -189,8 +188,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"): return res +<<<<<<< Updated upstream def _generate_data(self, model, gen_answer_fn=None, generation_config=None): def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question): +======= + def _generate_data(self, model, gen_answer_fn=None): + def default_gen_answer( + model, tokenizer, question, max_new_tokens, crop_question + ): +>>>>>>> Stashed changes inputs = self.tokenizer(question, return_tensors="pt") tokens = model.generate(**inputs, max_new_tokens=max_new_tokens) @@ -211,15 +217,22 @@ def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question data = pd.DataFrame.from_dict(data) else: if self.language is None: - print("No language detecting in the base model or ground truth data. Taking language from target model.") + print( + "No language detecting in the base model or ground truth data. Taking language from target model." + ) self.language = autodetect_language(model) data = pd.DataFrame.from_dict(default_data[self.language]) questions = data["questions"] answers = [] - prompts = questions.values if self.num_samples is None else questions.values[:self.num_samples] + prompts = ( + questions.values + if self.num_samples is None + else questions.values[: self.num_samples] + ) +<<<<<<< Updated upstream if generation_config is None: for q in tqdm(prompts, desc="Evaluate pipeline"): answers.append(gen_answer_fn(model, self.tokenizer, q, self.max_new_tokens, self._crop_question)) @@ -235,6 +248,14 @@ def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question answers.append(ans.m_generation_ids[0]) batch.clear() +======= + for q in tqdm(prompts, desc="Evaluate pipeline"): + answers.append( + gen_answer_fn( + model, self.tokenizer, q, self.max_new_tokens, self._crop_question + ) + ) +>>>>>>> Stashed changes res_data = {"questions": list(prompts), "answers": answers} df = pd.DataFrame(res_data) diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py b/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py index 83157e05ca..8d8208e455 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py @@ -1,6 +1,7 @@ """ Metrics for text similarity """ + from difflib import SequenceMatcher import numpy as np @@ -68,9 +69,7 @@ def evaluate_divergency(tokenizer, data_gold, data_prediction): fdt_list.append(fdt) num_matched = sum(block.size for block in blocks) - sdt = ( - len(b_indexes) - num_matched - ) + sdt = len(b_indexes) - num_matched sdt_list.append(sdt) sdt_norm = sdt / len(b_indexes) sdtn_list.append(sdt_norm) diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py index 8efca22059..cf6d3bc300 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py @@ -16,17 +16,20 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -TasksManager._SUPPORTED_MODEL_TYPE["stablelm-epoch"] = TasksManager._SUPPORTED_MODEL_TYPE["llama"] +TasksManager._SUPPORTED_MODEL_TYPE["stablelm-epoch"] = ( + TasksManager._SUPPORTED_MODEL_TYPE["llama"] +) 
NormalizedConfigManager._conf["stablelm-epoch"] = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", ) -class GenAIModelWrapper(): +class GenAIModelWrapper: """ A helper class to store additional attributes for GenAI models """ + def __init__(self, model, model_dir): self.model = model self.config = AutoConfig.from_pretrained(model_dir) @@ -51,7 +54,9 @@ def load_genai_pipeline(model_dir, device="CPU"): def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False): if use_hf: logger.info("Using HF Transformers API") - return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device_map=device.lower()) + return AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device_map=device.lower() + ) if use_genai: return load_genai_pipeline(model_id, device) @@ -62,7 +67,9 @@ def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=F else: ov_options = None try: - model = OVModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device=device, ov_config=ov_options) + model = OVModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_options + ) except ValueError: config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) model = OVModelForCausalLM.from_pretrained( @@ -71,7 +78,7 @@ def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=F trust_remote_code=True, use_cache=True, device=device, - ov_config=ov_options + ov_config=ov_options, ) return model @@ -279,7 +286,9 @@ def main(): language=args.language, ) else: - base_model = load_model(args.base_model, args.device, args.ov_config, args.hf, args.genai) + base_model = load_model( + args.base_model, args.device, args.ov_config, args.hf, args.genai + ) evaluator = Evaluator( base_model=base_model, test_data=prompts, @@ -287,15 +296,19 @@ def main(): similarity_model_id=args.text_encoder, num_samples=args.num_samples, language=args.language, - gen_answer_fn=genai_gen_answer if args.genai else None + gen_answer_fn=genai_gen_answer if args.genai else None, ) if args.gt_data: evaluator.dump_gt(args.gt_data) del base_model if args.target_model: - target_model = load_model(args.target_model, args.device, args.ov_config, args.hf, args.genai) - all_metrics_per_question, all_metrics = evaluator.score(target_model, genai_gen_answer if args.genai else None) + target_model = load_model( + args.target_model, args.device, args.ov_config, args.hf, args.genai + ) + all_metrics_per_question, all_metrics = evaluator.score( + target_model, genai_gen_answer if args.genai else None + ) logger.info("Metrics for model: %s", args.target_model) logger.info(all_metrics) @@ -314,14 +327,18 @@ def main(): ref_text = "" actual_text = "" diff = "" - for l1, l2 in zip(e["source_model"].splitlines(), e["optimized_model"].splitlines()): + for l1, l2 in zip( + e["source_model"].splitlines(), e["optimized_model"].splitlines() + ): if l1 == "" and l2 == "": continue ref_text += l1 + "\n" actual_text += l2 + "\n" diff += diff_strings(l1, l2) + "\n" - logger.info("--------------------------------------------------------------------------------------") + logger.info( + "--------------------------------------------------------------------------------------" + ) logger.info("## Reference text %d:\n%s", i + 1, ref_text) logger.info("## Actual text %d:\n%s", i + 1, actual_text) logger.info("## Diff %d: ", i + 1) diff --git a/pyproject.toml b/pyproject.toml index 
b7a23efa98..4eebdb8e4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,40 @@ dependencies = [
     "openvino_tokenizers~=2024.5.0.0.dev"
 ]
 
+[tool.black]
+exclude = '''
+(
+    README\.md|
+    SECURITY\.md|
+    LICENSE|
+    third-party-programs\.txt|
+    tests/python_tests/README\.md|
+    llm_bench/python/README\.md|
+    llm_bench/python/doc/IMAGE_GEN\.md|
+    llm_bench/python/doc/NOTES\.md|
+    llm_bench/python/who_what_benchmark/README\.md|
+    samples/cpp/beam_search_casual_lm/README\.md|
+    samples/cpp/benchmark_genai/README\.md|
+    samples/cpp/chat_sample/README\.md|
+    samples/cpp/greedy_casual_lm/README\.md|
+    samples/cpp/multinomial_causal_lm/README\.md|
+    samples/cpp/prompt_lookup_decoding_lm/README\.md|
+    samples/cpp/speculative_decoding_lm/README\.md|
+    samples/cpp/stable_diffusion/README\.md|
+    samples/cpp/whisper_speech_recognition/README\.md|
+    samples/python/beam_search_casual_lm/README\.md|
+    samples/python/benchmark_genai/README\.md|
+    samples/python/chat_sample/README\.md|
+    samples/python/greedy_casual_lm/README\.md|
+    samples/python/multinomial_causal_lm/README\.md|
+    samples/python/whisper_speech_recognition/README\.md|
+    src/README\.md|
+    src/docs/BUILD\.md|
+    src/docs/DOCKER\.md|
+    src/docs/SUPPORTED_MODELS\.md
+)
+'''
+
 [tool.py-build-cmake.module]
 directory = "src/python"
 
diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
index 16b8b76175..2ab173796f 100755
--- a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
+++ b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
@@ -8,11 +8,11 @@
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('model_dir')
-    parser.add_argument('prompts', nargs='+')
+    parser.add_argument("model_dir")
+    parser.add_argument("prompts", nargs="+")
     args = parser.parse_args()
 
-    device = 'CPU' # GPU can be used as well
+    device = "CPU"  # GPU can be used as well
     pipe = openvino_genai.LLMPipeline(args.model_dir, device)
 
     config = openvino_genai.GenerationConfig()
@@ -25,5 +25,5 @@ def main():
     print(beams)
 
 
-if '__main__' == __name__:
+if "__main__" == __name__:
     main()
diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py
index 9851483880..1aeb2c6760 100755
--- a/samples/python/benchmark_genai/benchmark_genai.py
+++ b/samples/python/benchmark_genai/benchmark_genai.py
@@ -4,46 +4,74 @@
 import argparse
 import openvino_genai as ov_genai
 
+
 def main():
     parser = argparse.ArgumentParser(description="Help command")
-    parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
-    parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
-    parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
-    parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
-    parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
+    parser.add_argument(
+        "-m", "--model", type=str, help="Path to model and tokenizers base directory"
+    )
+    parser.add_argument(
+        "-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt"
+    )
+    parser.add_argument(
+        "-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations"
+    )
+    parser.add_argument(
+        "-n", "--num_iter", type=int, default=2, help="Number of iterations"
+    )
+    parser.add_argument(
+        "-mt",
+        "--max_new_tokens",
+        type=int,
default=20, + help="Maximal number of new tokens", + ) parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") - + args = parser.parse_args() - # Perf metrics is stored in DecodedResults. + # Perf metrics is stored in DecodedResults. # In order to get DecodedResults instead of a string input should be a list. prompt = [args.prompt] model_path = args.model device = args.device num_warmup = args.num_warmup num_iter = args.num_iter - + config = ov_genai.GenerationConfig() config.max_new_tokens = args.max_new_tokens pipe = ov_genai.LLMPipeline(model_path, device) - + for _ in range(num_warmup): pipe.generate(prompt, config) - + res = pipe.generate(prompt, config) perf_metrics = res.perf_metrics for _ in range(num_iter - 1): res = pipe.generate(prompt, config) perf_metrics += res.perf_metrics - + print(f"Load time: {perf_metrics.get_load_time():.2f} ms") - print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms") - print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms") - print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms") - print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms") - print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms") - print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s") + print( + f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms" + ) + print( + f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms" + ) + print( + f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms" + ) + print( + f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms" + ) + print( + f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms" + ) + print( + f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s" + ) + if __name__ == "__main__": main() diff --git a/samples/python/chat_sample/chat_sample.py b/samples/python/chat_sample/chat_sample.py index eee66fb71d..f76ef2ace9 100755 --- a/samples/python/chat_sample/chat_sample.py +++ b/samples/python/chat_sample/chat_sample.py @@ -7,7 +7,7 @@ def streamer(subword): - print(subword, end='', flush=True) + print(subword, end="", flush=True) # Return flag corresponds whether generation should be stopped. # False means continue generation. 
return False @@ -15,10 +15,10 @@ def streamer(subword): def main(): parser = argparse.ArgumentParser() - parser.add_argument('model_dir') + parser.add_argument("model_dir") args = parser.parse_args() - device = 'CPU' # GPU can be used as well + device = "CPU" # GPU can be used as well pipe = openvino_genai.LLMPipeline(args.model_dir, device) config = openvino_genai.GenerationConfig() @@ -27,13 +27,13 @@ def main(): pipe.start_chat() while True: try: - prompt = input('question:\n') + prompt = input("question:\n") except EOFError: break pipe.generate(prompt, config, streamer) - print('\n----------') + print("\n----------") pipe.finish_chat() -if '__main__' == __name__: +if "__main__" == __name__: main() diff --git a/samples/python/greedy_causal_lm/greedy_causal_lm.py b/samples/python/greedy_causal_lm/greedy_causal_lm.py index 983195c696..cdd14250ba 100755 --- a/samples/python/greedy_causal_lm/greedy_causal_lm.py +++ b/samples/python/greedy_causal_lm/greedy_causal_lm.py @@ -8,11 +8,11 @@ def main(): parser = argparse.ArgumentParser() - parser.add_argument('model_dir') - parser.add_argument('prompt') + parser.add_argument("model_dir") + parser.add_argument("prompt") args = parser.parse_args() - device = 'CPU' # GPU can be used as well + device = "CPU" # GPU can be used as well pipe = openvino_genai.LLMPipeline(args.model_dir, device) config = openvino_genai.GenerationConfig() @@ -21,5 +21,5 @@ def main(): print(pipe.generate(args.prompt, config)) -if '__main__' == __name__: +if "__main__" == __name__: main() diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py index 6300320264..b8c5f0f5f6 100755 --- a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py +++ b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py @@ -11,18 +11,18 @@ class IterableStreamer(openvino_genai.StreamerBase): """ A custom streamer class for handling token streaming and detokenization with buffering. - + Attributes: tokenizer (Tokenizer): The tokenizer used for encoding and decoding tokens. tokens_cache (list): A buffer to accumulate tokens for detokenization. text_queue (Queue): A synchronized queue for storing decoded text chunks. print_len (int): The length of the printed text to manage incremental decoding. """ - + def __init__(self, tokenizer): """ Initializes the IterableStreamer with the given tokenizer. - + Args: tokenizer (Tokenizer): The tokenizer to use for encoding and decoding tokens. """ @@ -37,35 +37,37 @@ def __iter__(self): Returns the iterator object itself. """ return self - + def __next__(self): """ Returns the next value from the text queue. - + Returns: str: The next decoded text chunk. - + Raises: StopIteration: If there are no more elements in the queue. """ - value = self.text_queue.get() # get() will be blocked until a token is available. + value = ( + self.text_queue.get() + ) # get() will be blocked until a token is available. if value is None: raise StopIteration return value - + def get_stop_flag(self): """ Checks whether the generation process should be stopped. - + Returns: bool: Always returns False in this implementation. """ return False - + def put_word(self, word: str): """ Puts a word into the text queue. - + Args: word (str): The word to put into the queue. """ @@ -74,20 +76,20 @@ def put_word(self, word: str): def put(self, token_id: int) -> bool: """ Processes a token and manages the decoding buffer. Adds decoded text to the queue. 
- + Args: token_id (int): The token_id to process. - + Returns: bool: True if generation should be stopped, False otherwise. - """ + """ self.tokens_cache.append(token_id) text = self.tokenizer.decode(self.tokens_cache) - word = '' - if len(text) > self.print_len and '\n' == text[-1]: + word = "" + if len(text) > self.print_len and "\n" == text[-1]: # Flush the cache after the new line symbol. - word = text[self.print_len:] + word = text[self.print_len :] self.tokens_cache = [] self.print_len = 0 elif len(text) >= 3 and text[-3:] == chr(65533): @@ -96,10 +98,10 @@ def put(self, token_id: int) -> bool: elif len(text) > self.print_len: # It is possible to have a shorter text after adding new token. # Print to output only if text lengh is increaesed. - word = text[self.print_len:] + word = text[self.print_len :] self.print_len = len(text) - self.put_word(word) - + self.put_word(word) + if self.get_stop_flag(): # When generation is stopped from streamer then end is not called, need to call it here manually. self.end() @@ -113,7 +115,7 @@ def end(self): """ text = self.tokenizer.decode(self.tokens_cache) if len(text) > self.print_len: - word = text[self.print_len:] + word = text[self.print_len :] self.put_word(word) self.tokens_cache = [] self.print_len = 0 @@ -122,31 +124,34 @@ def end(self): def main(): parser = argparse.ArgumentParser() - parser.add_argument('model_dir') - parser.add_argument('prompt') + parser.add_argument("model_dir") + parser.add_argument("prompt") args = parser.parse_args() - device = 'CPU' # GPU can be used as well + device = "CPU" # GPU can be used as well pipe = openvino_genai.LLMPipeline(args.model_dir, device) - + text_print_streamer = IterableStreamer(pipe.get_tokenizer()) + def token_printer(): # Getting next elements from iterable will be blocked until a new token is available. for word in text_print_streamer: - print(word, end='', flush=True) + print(word, end="", flush=True) + printer_thread = threading.Thread(target=token_printer, daemon=True) printer_thread.start() - + config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 config.do_sample = True config.top_p = 0.9 config.top_k = 30 - # Since the streamer is set, the results will be printed + # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. 
pipe.generate(args.prompt, config, text_print_streamer) printer_thread.join() -if '__main__' == __name__: + +if "__main__" == __name__: main() diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index d22b17ebf5..0f49eceb5c 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -7,19 +7,26 @@ from optimum.intel import OVModelForCausalLM from pathlib import Path -from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationResult, GenerationConfig +from openvino_genai import ( + ContinuousBatchingPipeline, + SchedulerConfig, + GenerationResult, + GenerationConfig, +) from transformers import AutoTokenizer, AutoModelForCausalLM from transformers import GenerationConfig as HFGenerationConfig from typing import List, Tuple TESTS_ROOT = Path(__file__).parent + def get_greedy() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 generation_config.max_new_tokens = 30 return generation_config + def get_greedy_with_min_and_max_tokens() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -27,6 +34,7 @@ def get_greedy_with_min_and_max_tokens() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_greedy_with_repetition_penalty() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -34,6 +42,7 @@ def get_greedy_with_repetition_penalty() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_greedy_with_penalties() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -42,6 +51,7 @@ def get_greedy_with_penalties() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_greedy_with_min_and_max_tokens() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -49,15 +59,17 @@ def get_greedy_with_min_and_max_tokens() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_greedy_with_single_stop_string() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 generation_config.min_new_tokens = 15 generation_config.max_new_tokens = 50 - generation_config.stop_strings = {"anag"} # expected match on "manage" + generation_config.stop_strings = {"anag"} # expected match on "manage" generation_config.include_stop_str_in_output = True return generation_config + def get_greedy_with_multiple_stop_strings() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -67,6 +79,7 @@ def get_greedy_with_multiple_stop_strings() -> GenerationConfig: generation_config.include_stop_str_in_output = True return generation_config + def get_greedy_with_multiple_stop_strings_no_match() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -76,6 +89,7 @@ def get_greedy_with_multiple_stop_strings_no_match() -> GenerationConfig: generation_config.include_stop_str_in_output = True return generation_config + def get_beam_search() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -85,6 +99,7 @@ def get_beam_search() -> GenerationConfig: generation_config.num_return_sequences = generation_config.num_beams return generation_config + def 
get_beam_search_min_and_max_tokens() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -95,6 +110,7 @@ def get_beam_search_min_and_max_tokens() -> GenerationConfig: generation_config.num_return_sequences = generation_config.num_beams return generation_config + def get_beam_search_with_single_stop_string() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -105,6 +121,7 @@ def get_beam_search_with_single_stop_string() -> GenerationConfig: generation_config.include_stop_str_in_output = True return generation_config + def get_beam_search_with_multiple_stop_strings() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -115,6 +132,7 @@ def get_beam_search_with_multiple_stop_strings() -> GenerationConfig: generation_config.include_stop_str_in_output = True return generation_config + def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -125,6 +143,7 @@ def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig: generation_config.include_stop_str_in_output = True return generation_config + def get_multinomial_temperature() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -133,6 +152,7 @@ def get_multinomial_temperature() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_multinomial_temperature_and_num_return_sequence() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -141,6 +161,7 @@ def get_multinomial_temperature_and_num_return_sequence() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_multinomial_temperature_and_top_p() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -150,6 +171,7 @@ def get_multinomial_temperature_and_top_p() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_multinomial_temperature_and_top_k() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -159,6 +181,7 @@ def get_multinomial_temperature_and_top_k() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_multinomial_temperature_top_p_and_top_k() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -169,6 +192,7 @@ def get_multinomial_temperature_top_p_and_top_k() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_multinomial_temperature_and_repetition_penalty() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -178,6 +202,7 @@ def get_multinomial_temperature_and_repetition_penalty() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_multinomial_all_parameters() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -189,6 +214,7 @@ def get_multinomial_all_parameters() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_multinomial_temperature_and_frequence_penalty() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -198,6 +224,7 @@ def 
get_multinomial_temperature_and_frequence_penalty() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_multinomial_temperature_and_presence_penalty() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -207,6 +234,7 @@ def get_multinomial_temperature_and_presence_penalty() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config + def get_multinomial_max_and_min_token() -> GenerationConfig: multinomial = GenerationConfig() multinomial.do_sample = True @@ -220,12 +248,13 @@ def get_multinomial_max_and_min_token() -> GenerationConfig: multinomial.max_new_tokens = 30 return multinomial + def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]: prompts = [ "What is OpenVINO?", "How are you?", "What is your name?", - "Tell me something about Canada" + "Tell me something about Canada", ] generation_configs = [ get_greedy(), @@ -260,38 +289,37 @@ def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig: def convert_to_hf( - default_generation_config : HFGenerationConfig, - generation_config : GenerationConfig + default_generation_config: HFGenerationConfig, generation_config: GenerationConfig ) -> HFGenerationConfig: kwargs = {} # generic parameters - kwargs['max_length'] = generation_config.max_length + kwargs["max_length"] = generation_config.max_length # has higher priority than 'max_length' - kwargs['max_new_tokens'] = generation_config.max_new_tokens + kwargs["max_new_tokens"] = generation_config.max_new_tokens if generation_config.stop_strings: - kwargs['stop_strings'] = generation_config.stop_strings + kwargs["stop_strings"] = generation_config.stop_strings # copy default parameters - kwargs['eos_token_id'] = default_generation_config.eos_token_id - kwargs['pad_token_id'] = default_generation_config.pad_token_id - kwargs['repetition_penalty'] = generation_config.repetition_penalty + kwargs["eos_token_id"] = default_generation_config.eos_token_id + kwargs["pad_token_id"] = default_generation_config.pad_token_id + kwargs["repetition_penalty"] = generation_config.repetition_penalty if generation_config.num_beams > 1: # beam search case - kwargs['num_beam_groups'] = generation_config.num_beam_groups - kwargs['num_beams'] = generation_config.num_beams - kwargs['diversity_penalty'] = generation_config.diversity_penalty - kwargs['length_penalty'] = generation_config.length_penalty - kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size - kwargs['num_return_sequences'] = generation_config.num_return_sequences - kwargs['output_scores'] = True + kwargs["num_beam_groups"] = generation_config.num_beam_groups + kwargs["num_beams"] = generation_config.num_beams + kwargs["diversity_penalty"] = generation_config.diversity_penalty + kwargs["length_penalty"] = generation_config.length_penalty + kwargs["no_repeat_ngram_size"] = generation_config.no_repeat_ngram_size + kwargs["num_return_sequences"] = generation_config.num_return_sequences + kwargs["output_scores"] = True elif generation_config.do_sample: # mulitinomial - kwargs['temperature'] = generation_config.temperature - kwargs['top_k'] = generation_config.top_k - kwargs['top_p'] = generation_config.top_p - kwargs['do_sample'] = generation_config.do_sample + kwargs["temperature"] = generation_config.temperature + kwargs["top_k"] = generation_config.top_k + kwargs["top_p"] = generation_config.top_p + kwargs["do_sample"] = generation_config.do_sample else: # greedy pass @@ -309,16 +337,29 @@ def 
run_hugging_face( generation_results = [] for prompt, generation_config in zip(prompts, generation_configs): inputs = hf_tokenizer(prompt, return_tensors="pt") - prompt_len = inputs['input_ids'].numel() - generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config), - return_dict_in_generate=True, tokenizer=hf_tokenizer) - all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) + prompt_len = inputs["input_ids"].numel() + generate_outputs = model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + generation_config=convert_to_hf(model.generation_config, generation_config), + return_dict_in_generate=True, + tokenizer=hf_tokenizer, + ) + all_text_batch = hf_tokenizer.batch_decode( + [ + generated_ids[prompt_len:] + for generated_ids in generate_outputs.sequences + ], + skip_special_tokens=True, + ) generation_result = GenerationResult() generation_result.m_generation_ids = all_text_batch # sequences_scores are available only for beam search case if generation_config.is_beam_search(): - generation_result.m_scores = [score for score in generate_outputs.sequences_scores] + generation_result.m_scores = [ + score for score in generate_outputs.sequences_scores + ] generation_results.append(generation_result) del hf_tokenizer @@ -328,12 +369,14 @@ def run_hugging_face( def run_continuous_batching( - model_path : Path, - scheduler_config : SchedulerConfig, + model_path: Path, + scheduler_config: SchedulerConfig, prompts: List[str], - generation_configs : List[GenerationConfig] + generation_configs: List[GenerationConfig], ) -> List[GenerationResult]: - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}, {}) + pipe = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config, "CPU", {}, {} + ) output = pipe.generate(prompts, generation_configs) del pipe shutil.rmtree(model_path) @@ -346,13 +389,17 @@ def get_models_list(file_name: str): for model_name in f: model_name = model_name.strip() # skip comment in model scope file - if model_name.startswith('#'): + if model_name.startswith("#"): continue models.append(model_name) return models -def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): +def compare_results( + hf_result: GenerationResult, + ov_result: GenerationResult, + generation_config: GenerationConfig, +): if generation_config.is_beam_search(): assert len(hf_result.m_scores) == len(ov_result.m_scores) for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores): @@ -363,58 +410,111 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): assert hf_text == ov_text + def save_ov_model_from_optimum(model, hf_tokenizer, model_path: Path): model.save_pretrained(model_path) # convert tokenizers as well from openvino_tokenizers import convert_tokenizer from openvino import serialize - tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, skip_special_tokens=True) + + tokenizer, detokenizer = convert_tokenizer( + hf_tokenizer, with_detokenizer=True, skip_special_tokens=True + ) serialize(tokenizer, model_path / "openvino_tokenizer.xml") serialize(detokenizer, model_path / 
"openvino_detokenizer.xml") -def get_model_and_tokenizer(model_id: str, use_optimum = True): + +def get_model_and_tokenizer(model_id: str, use_optimum=True): hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \ - AutoModelForCausalLM.from_pretrained(model_id) + model = ( + OVModelForCausalLM.from_pretrained( + model_id, export=True, trust_remote_code=True + ) + if use_optimum + else AutoModelForCausalLM.from_pretrained(model_id) + ) return model, hf_tokenizer -def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): + +def generate_and_compare_with_hf( + model_id: str, + prompts: List[str], + generation_configs: List[GenerationConfig], + scheduler_config: SchedulerConfig, + tmp_path: Path, +): use_optimum = True - model_path : Path = tmp_path / model_id + model_path: Path = tmp_path / model_id model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum) if use_optimum: save_ov_model_from_optimum(model, hf_tokenizer, model_path) - hf_results = run_hugging_face(model=model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs) - _generate_and_compare_with_reference_results(model_path, prompts, hf_results, generation_configs, scheduler_config) + hf_results = run_hugging_face( + model=model, + hf_tokenizer=hf_tokenizer, + prompts=prompts, + generation_configs=generation_configs, + ) + _generate_and_compare_with_reference_results( + model_path, prompts, hf_results, generation_configs, scheduler_config + ) -def _generate_and_compare_with_reference_results(model_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): - ov_results : List[GenerationResult] = run_continuous_batching(model_path, scheduler_config, prompts, generation_configs) +def _generate_and_compare_with_reference_results( + model_path: Path, + prompts: List[str], + reference_results: List[GenerationResult], + generation_configs: List[GenerationConfig], + scheduler_config: SchedulerConfig, +): + ov_results: List[GenerationResult] = run_continuous_batching( + model_path, scheduler_config, prompts, generation_configs + ) assert len(prompts) == len(reference_results) assert len(prompts) == len(ov_results) - for prompt, ref_result, ov_result, generation_config in zip(prompts, reference_results, ov_results, generation_configs): - print(f"Prompt = {prompt}\nref result = {ref_result}\nOV result = {ov_result.m_generation_ids}") + for prompt, ref_result, ov_result, generation_config in zip( + prompts, reference_results, ov_results, generation_configs + ): + print( + f"Prompt = {prompt}\nref result = {ref_result}\nOV result = {ov_result.m_generation_ids}" + ) compare_results(ref_result, ov_result, generation_config) -def generate_and_compare_with_reference_text(model_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): - ov_results : List[GenerationResult] = run_continuous_batching(model_path, scheduler_config, prompts, generation_configs) +def generate_and_compare_with_reference_text( + model_path: Path, + prompts: List[str], + reference_texts_per_prompt: List[List[str]], + generation_configs: List[GenerationConfig], + scheduler_config: SchedulerConfig, +): + 
ov_results: List[GenerationResult] = run_continuous_batching( + model_path, scheduler_config, prompts, generation_configs + ) assert len(prompts) == len(reference_texts_per_prompt) assert len(prompts) == len(ov_results) - for prompt, ref_texts_for_this_prompt, ov_result, generation_config in zip(prompts, reference_texts_per_prompt, ov_results, generation_configs): - print(f"Prompt = {prompt}\nref text = {ref_texts_for_this_prompt}\nOV result = {ov_result.m_generation_ids}") + for prompt, ref_texts_for_this_prompt, ov_result, generation_config in zip( + prompts, reference_texts_per_prompt, ov_results, generation_configs + ): + print( + f"Prompt = {prompt}\nref text = {ref_texts_for_this_prompt}\nOV result = {ov_result.m_generation_ids}" + ) assert len(ref_texts_for_this_prompt) == len(ov_result.m_generation_ids) - for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids): + for ref_text, ov_text in zip( + ref_texts_for_this_prompt, ov_result.m_generation_ids + ): assert ref_text == ov_text -def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): + +def run_test_pipeline( + tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config=None +): prompts, generation_configs = get_test_dataset() scheduler_config = get_scheduler_config(scheduler_params) @@ -422,7 +522,16 @@ def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = Non generation_config.rng_seed = 0 generation_configs = [generation_config] * len(prompts) - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + generate_and_compare_with_hf( + model_id, prompts, generation_configs, scheduler_config, tmp_path + ) -DEFAULT_SCHEDULER_CONFIG = get_scheduler_config({"num_kv_blocks": 300, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}) +DEFAULT_SCHEDULER_CONFIG = get_scheduler_config( + { + "num_kv_blocks": 300, + "dynamic_split_fuse": True, + "max_num_batched_tokens": 256, + "max_num_seqs": 256, + } +) diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py index f98f47ecf3..eef3865ef0 100644 --- a/tests/python_tests/conftest.py +++ b/tests/python_tests/conftest.py @@ -2,23 +2,24 @@ def pytest_make_parametrize_id(config, val, argname): - if argname in ['prompt', 'prompts', 'batched_prompts']: - return f'{val}' - elif argname == 'model_descr': + if argname in ["prompt", "prompts", "batched_prompts"]: + return f"{val}" + elif argname == "model_descr": return f"{val[0]}" - elif argname == 'chat_config': + elif argname == "chat_config": return f"{val[0]}" - elif argname in ['stop_criteria', 'generation_config']: + elif argname in ["stop_criteria", "generation_config"]: return str(val) elif isinstance(val, (int, float, str)): - return f'{argname}={val}' + return f"{argname}={val}" return None + def pytest_addoption(parser): parser.addoption("--model_ids", help="Select models to run") + def pytest_configure(config: pytest.Config): - marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly' + marker = "precommit" if config.getoption("-m") == "precommit" else "nightly" pytest.run_marker = marker - pytest.selected_model_ids = config.getoption('--model_ids', default=None) - + pytest.selected_model_ids = config.getoption("--model_ids", default=None) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index b437eee007..08243d6554 100644 --- 
a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -32,7 +32,6 @@ def get_models_list(): "HuggingFaceH4/zephyr-7b-beta", "ikala/redpajama-3b-chat", "mistralai/Mistral-7B-v0.1", - # "meta-llama/Llama-2-7b-chat-hf", # Cannot be downloaded without access token # "google/gemma-2b-it", # Cannot be downloaded without access token. # "google/gemma-7b-it", # Cannot be downloaded without access token. @@ -49,12 +48,16 @@ def get_models_list(): model_ids = precommit_models else: model_ids = nightly_models - + if pytest.selected_model_ids: - model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + model_ids = [ + model_id + for model_id in model_ids + if model_id in pytest.selected_model_ids.split(" ") + ] # pytest.set_trace() - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + prefix = pathlib.Path(os.getenv("GENAI_MODELS_PATH_PREFIX", "")) + return [(model_id, prefix / model_id.split("/")[1]) for model_id in model_ids] def get_whisper_models_list(tiny_only=False): @@ -75,10 +78,14 @@ def get_whisper_models_list(tiny_only=False): model_ids = nightly_models if pytest.selected_model_ids: - model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + model_ids = [ + model_id + for model_id in model_ids + if model_id in pytest.selected_model_ids.split(" ") + ] # pytest.set_trace() - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + prefix = pathlib.Path(os.getenv("GENAI_MODELS_PATH_PREFIX", "")) + return [(model_id, prefix / model_id.split("/")[1]) for model_id in model_ids] def get_chat_models_list(): @@ -98,20 +105,19 @@ def get_chat_models_list(): model_ids = precommit_models else: model_ids = nightly_models - - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + + prefix = pathlib.Path(os.getenv("GENAI_MODELS_PATH_PREFIX", "")) + return [(model_id, prefix / model_id.split("/")[1]) for model_id in model_ids] def get_chat_templates(): - # Returns chat templates saved in tokenizer_configs.py, + # Returns chat templates saved in tokenizer_configs.py, # but skips some models that currently are not processed correctly. skipped_models = { # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template. # Need to enable and unskip, since it's preset in continious batching and has >100 000 downloads. "openchat/openchat-3.5-0106", - # These models fail even on HF so no need to check if applying chat matches. "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", "codellama/CodeLlama-34b-Instruct-hf", @@ -123,15 +129,14 @@ def get_chat_templates(): "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k", "AliAbdelrasheed/maqa_llama_4bit", "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored", - # TODO: Need to support chat templates in more models: CVS-145963 # Either ov_genai is unable to parse chat_template or results do not match with HF. 
"meta-llama/Meta-Llama-3-8B-Instruct", - "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp + "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp "mosaicml/mpt-30b-chat", - "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp - "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp - "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp + "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp + "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp + "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp "openchat/openchat-3.5-0106", "casperhansen/llama-3-70b-instruct-awq", "TheBloke/deepseek-coder-33B-instruct-GPTQ", @@ -142,82 +147,93 @@ def get_chat_templates(): "maywell/Synatra-Mixtral-8x7B", "MediaTek-Research/Breeze-7B-Instruct-v1_0", "bofenghuang/vigostral-7b-chat", - "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp + "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp "openchat/openchat-3.6-8b-20240522", "tenyx/TenyxChat-7B-v1", "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2", "yam-peleg/Hebrew-Gemma-11B-V2", - "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError + "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError "nlpai-lab/KULLM3", "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1", - "MediaTek-Research/Breeze-7B-Instruct-v0_1", - "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError + "MediaTek-Research/Breeze-7B-Instruct-v0_1", + "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError "MLP-KTLim/llama-3-Korean-Bllossom-8B", - "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp + "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp "codellama/CodeLlama-70b-Instruct-hf", - "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp - "BramVanroy/Llama-2-13b-chat-dutch" + "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp + "BramVanroy/Llama-2-13b-chat-dutch", } from tokenizer_configs import get_tokenizer_configs - return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models] + + return [ + (k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models + ] @functools.lru_cache(1) def read_model(params, **tokenizer_kwargs): model_id, path = params - + from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) if (path / "openvino_model.xml").exists(): - opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, - compile=False, device='CPU') + opt_model = OVModelForCausalLM.from_pretrained( + path, trust_remote_code=True, compile=False, device="CPU" + ) else: - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, - with_detokenizer=True, - **tokenizer_kwargs) + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( + tokenizer, with_detokenizer=True, **tokenizer_kwargs + ) openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") - + # to store tokenizer config jsons with special tokens tokenizer.save_pretrained(path) - - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, 
trust_remote_code=True, - compile=False, device='CPU', load_in_8bit=False) + + opt_model = OVModelForCausalLM.from_pretrained( + model_id, + export=True, + trust_remote_code=True, + compile=False, + device="CPU", + load_in_8bit=False, + ) opt_model.generation_config.save_pretrained(path) opt_model.config.save_pretrained(path) opt_model.save_pretrained(path) - + return ( model_id, path, tokenizer, opt_model, - ov_genai.LLMPipeline(str(path), device='CPU', config={"ENABLE_MMAP": False}), + ov_genai.LLMPipeline(str(path), device="CPU", config={"ENABLE_MMAP": False}), ) # in OpenVINO GenAI this parameter is called stop_criteria, -# while in HF it's called early_stopping. +# while in HF it's called early_stopping. # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" STOP_CRITERIA_MAP = { - ov_genai.StopCriteria.NEVER: "never", - ov_genai.StopCriteria.EARLY: True, - ov_genai.StopCriteria.HEURISTIC: False + ov_genai.StopCriteria.NEVER: "never", + ov_genai.StopCriteria.EARLY: True, + ov_genai.StopCriteria.HEURISTIC: False, } @pytest.fixture(scope="module") def model_tmp_path(tmpdir_factory): model_id, path, _, _, _ = read_model(get_models_list()[0]) - temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) + temp_path = tmpdir_factory.mktemp(model_id.replace("/", "_")) # copy openvino converted model and tokenizers - for pattern in ['*.xml', '*.bin']: + for pattern in ["*.xml", "*.bin"]: for src_file in path.glob(pattern): if src_file.is_file(): - shutil.copy(src_file, temp_path / src_file.name) + shutil.copy(src_file, temp_path / src_file.name) yield model_id, Path(temp_path) @@ -228,7 +244,7 @@ def load_tok(configs: List[Tuple], temp_path): json_file.unlink() for config_json, config_name in configs: - with (temp_path / config_name).open('w') as f: + with (temp_path / config_name).open("w") as f: json.dump(config_json, f) return ov_genai.Tokenizer(str(temp_path), {}) @@ -240,7 +256,7 @@ def load_pipe(configs: List[Tuple], temp_path): json_file.unlink() for config_json, config_name in configs: - with (temp_path / config_name).open('w') as f: + with (temp_path / config_name).open("w") as f: json.dump(config_json, f) return ov_genai.LLMPipeline(str(temp_path)) @@ -249,4 +265,9 @@ def load_pipe(configs: List[Tuple], temp_path): def get_continuous_batching(path): scheduler_config = ov_genai.SchedulerConfig() scheduler_config.cache_size = 1 - return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), device='CPU', config={"scheduler_config": scheduler_config}) + return ov_genai.LLMPipeline( + str(path), + ov_genai.Tokenizer(str(path)), + device="CPU", + config={"scheduler_config": scheduler_config}, + ) diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py index 6fdf5446c7..7067c67e10 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_cache_optimizations.py @@ -10,7 +10,14 @@ import whowhatbench from optimum.intel.openvino import OVModelForCausalLM -from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationResult, GenerationConfig, CacheEvictionConfig, AggregationMode +from openvino_genai import ( + ContinuousBatchingPipeline, + SchedulerConfig, + GenerationResult, + GenerationConfig, + CacheEvictionConfig, + AggregationMode, +) from openvino_tokenizers import convert_tokenizer from openvino import serialize @@ -19,11 +26,12 @@ from common import TESTS_ROOT -def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: 
- file_path = TESTS_ROOT / 'data' / file_name - with open(file_path, 'r') as f: +def load_prompts_dataset(file_name: str) -> Dict[str, List[str]]: + file_path = TESTS_ROOT / "data" / file_name + with open(file_path, "r") as f: return {"questions": [s for s in f]} + def get_scheduler_config(num_kv_blocks: int) -> SchedulerConfig: scheduler_config = SchedulerConfig() scheduler_config.num_kv_blocks = num_kv_blocks @@ -33,6 +41,7 @@ def get_scheduler_config(num_kv_blocks: int) -> SchedulerConfig: scheduler_config.use_cache_eviction = False return scheduler_config + @dataclass class ConvertedModel: model: OVModelForCausalLM @@ -40,14 +49,18 @@ class ConvertedModel: model_path: Path -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def converted_model(tmp_path_factory): model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) + model = OVModelForCausalLM.from_pretrained( + model_id, export=True, trust_remote_code=True + ) tokenizer = AutoTokenizer.from_pretrained(model_id) model_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id model.save_pretrained(model_path) - ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True) + ov_tokenizer, ov_detokenizer = convert_tokenizer( + tokenizer, with_detokenizer=True, skip_special_tokens=True + ) serialize(ov_tokenizer, model_path / "openvino_tokenizer.xml") serialize(ov_detokenizer, model_path / "openvino_detokenizer.xml") converted_model = ConvertedModel(model, tokenizer, model_path) @@ -64,39 +77,69 @@ class CacheOptTestStruct: use_cache_eviction: bool cache_eviction_config: Optional[CacheEvictionConfig] similarity_threshold: float - avg_cache_usage_optimization_ratio: float # expecting no less than these optimization ratios + avg_cache_usage_optimization_ratio: ( + float # expecting no less than these optimization ratios + ) max_cache_usage_optimization_ratio: float -SHORT_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=32, max_cache_size=128, aggregation_mode=AggregationMode.NORM_SUM) +SHORT_CACHE_EVICTION_CONFIG = CacheEvictionConfig( + start_size=32, + recent_size=32, + max_cache_size=128, + aggregation_mode=AggregationMode.NORM_SUM, +) + @pytest.mark.precommit -@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="doesn't work on win due to optimum-intel export bug, segfault on mac") -@pytest.mark.parametrize("test_struct", [ - # prompts + generation length are longer than the eviction arena, eviction expected w/ impact to similarity - CacheOptTestStruct(prompt_file="long_prompts.txt", max_new_tokens=128, num_kv_blocks=100, use_cache_eviction=True, - cache_eviction_config=SHORT_CACHE_EVICTION_CONFIG, - similarity_threshold=0.8, - max_cache_usage_optimization_ratio=1.8, - avg_cache_usage_optimization_ratio=1.35), - - # prompts + generation length are shorter than the eviction arena, no eviction expected - CacheOptTestStruct(prompt_file="short_prompts.txt", max_new_tokens=32, num_kv_blocks=100, use_cache_eviction=True, - cache_eviction_config=SHORT_CACHE_EVICTION_CONFIG, - similarity_threshold=0.98, - max_cache_usage_optimization_ratio=0.95, # no improvement expected - avg_cache_usage_optimization_ratio=0.95), - - # short prompts, long generation - eviction expected - CacheOptTestStruct(prompt_file="short_prompts.txt", max_new_tokens=384, num_kv_blocks=100, use_cache_eviction=True, - cache_eviction_config=SHORT_CACHE_EVICTION_CONFIG, - 
similarity_threshold=0.94, - max_cache_usage_optimization_ratio=1.75, - avg_cache_usage_optimization_ratio=1.35), - -]) -@pytest.mark.parametrize("enable_prefix_caching", [True, False]) # prefix caching shouldn't impact similarity -def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, test_struct, enable_prefix_caching): +@pytest.mark.skipif( + sys.platform in ("win32", "darwin"), + reason="doesn't work on win due to optimum-intel export bug, segfault on mac", +) +@pytest.mark.parametrize( + "test_struct", + [ + # prompts + generation length are longer than the eviction arena, eviction expected w/ impact to similarity + CacheOptTestStruct( + prompt_file="long_prompts.txt", + max_new_tokens=128, + num_kv_blocks=100, + use_cache_eviction=True, + cache_eviction_config=SHORT_CACHE_EVICTION_CONFIG, + similarity_threshold=0.8, + max_cache_usage_optimization_ratio=1.8, + avg_cache_usage_optimization_ratio=1.35, + ), + # prompts + generation length are shorter than the eviction arena, no eviction expected + CacheOptTestStruct( + prompt_file="short_prompts.txt", + max_new_tokens=32, + num_kv_blocks=100, + use_cache_eviction=True, + cache_eviction_config=SHORT_CACHE_EVICTION_CONFIG, + similarity_threshold=0.98, + max_cache_usage_optimization_ratio=0.95, # no improvement expected + avg_cache_usage_optimization_ratio=0.95, + ), + # short prompts, long generation - eviction expected + CacheOptTestStruct( + prompt_file="short_prompts.txt", + max_new_tokens=384, + num_kv_blocks=100, + use_cache_eviction=True, + cache_eviction_config=SHORT_CACHE_EVICTION_CONFIG, + similarity_threshold=0.94, + max_cache_usage_optimization_ratio=1.75, + avg_cache_usage_optimization_ratio=1.35, + ), + ], +) +@pytest.mark.parametrize( + "enable_prefix_caching", [True, False] +) # prefix caching shouldn't impact similarity +def test_cache_optimized_generation_is_similar_to_unoptimized( + converted_model, test_struct, enable_prefix_caching +): seqs_per_request = 5 scheduler_config = get_scheduler_config(test_struct.num_kv_blocks) @@ -111,30 +154,49 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t scheduler_config_opt.enable_prefix_caching = enable_prefix_caching model_path = converted_model.model_path - model_cb_noopt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}) - model_cb_opt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {}) + model_cb_noopt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config, "CPU", {} + ) + model_cb_opt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {} + ) tokenizer = converted_model.tokenizer data_dict = load_prompts_dataset(test_struct.prompt_file) - evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict, - generation_config=generation_config, - generation_config_base=generation_config, - max_new_tokens=test_struct.max_new_tokens, seqs_per_request=seqs_per_request) + evaluator = whowhatbench.Evaluator( + base_model=model_cb_noopt, + tokenizer=tokenizer, + test_data=data_dict, + generation_config=generation_config, + generation_config_base=generation_config, + max_new_tokens=test_struct.max_new_tokens, + seqs_per_request=seqs_per_request, + ) _, all_metrics = evaluator.score(model_cb_opt) - similarity_metric = float(all_metrics['similarity'][0]) + similarity_metric = float(all_metrics["similarity"][0]) pipeline_opt_metrics = 
model_cb_opt.get_metrics()
     pipeline_noopt_metrics = model_cb_noopt.get_metrics()
 
     print(f"Similarity: {similarity_metric}")
-    print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
-    print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
-    max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
-    avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
-    print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
+    print(
+        f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}"
+    )
+    print(
+        f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}"
+    )
+    max_optimization_ratio = (
+        pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage
+    )
+    avg_optimization_ratio = (
+        pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage
+    )
+    print(
+        f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x"
+    )
 
     assert similarity_metric > test_struct.similarity_threshold
     assert max_optimization_ratio >= test_struct.max_cache_usage_optimization_ratio
@@ -142,5 +204,3 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
 
     del model_cb_opt
     del model_cb_noopt
-
-
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index b68de6372d..da913b1fa0 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -20,15 +20,21 @@
 
 configs = [
     dict(max_new_tokens=20),
-    dict(num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0)
+    dict(
+        num_beam_groups=3,
+        num_beams=15,
+        num_return_sequences=1,
+        max_new_tokens=10,
+        diversity_penalty=1.0,
+    ),
 ]
 
 
 quenstions = [
-    '1+1=',
-    'What is the previous answer?',
-    'Why is the Sun yellow?',
-    'What was my first question?'
+    "1+1=",
+    "What is the previous answer?",
+    "Why is the Sun yellow?",
+    "What was my first question?",
 ]
 
 
@@ -37,34 +43,44 @@
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_chat_compare_with_HF(model_descr, generation_config: Dict):
-    device = 'CPU'
+    device = "CPU"
     chat_history_hf = []
     chat_history_ov = []
-    chat_prompt = ''
+    chat_prompt = ""
 
     # Will set add_special_tokens=False inside pipeline when start_chat() is called.
-    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    model_id, path, tokenizer, model_opt, pipe = read_model(
+        (model_descr[0], model_descr[1] / "_test_chat")
+    )
 
     pipe.start_chat()
     for prompt in quenstions:
-        chat_history_hf.append({'role': 'user', 'content': prompt})
-        chat_history_ov.append({'role': 'user', 'content': prompt})
-
-        chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
-        tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
-
-        answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None)
-        answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
-        chat_history_hf.append({'role': 'assistant', 'content': answer_str})
+        chat_history_hf.append({"role": "user", "content": prompt})
+        chat_history_ov.append({"role": "user", "content": prompt})
+
+        chat_prompt = tokenizer.apply_chat_template(
+            chat_history_hf, tokenize=False, add_generation_prompt=True
+        )
+        tokenized = tokenizer(
+            chat_prompt, return_tensors="pt", add_special_tokens=False
+        )
+
+        answer = model_opt.generate(
+            **tokenized, **generation_config, do_sample=False, repetition_penalty=None
+        )
+        answer_str = tokenizer.decode(
+            answer[0, tokenized["input_ids"].numel() :], skip_special_tokens=True
+        )
+        chat_history_hf.append({"role": "assistant", "content": answer_str})
 
         answer_ov = pipe.generate(prompt, **generation_config)
-        chat_history_ov.append({'role': 'assistant', 'content': answer_ov})
+        chat_history_ov.append({"role": "assistant", "content": answer_ov})
 
     pipe.finish_chat()
-
+
     if chat_history_ov != chat_history_hf:
-        print(f'hf_output: {chat_history_hf}')
-        print(f'ov_output: {chat_history_ov}')
+        print(f"hf_output: {chat_history_hf}")
+        print(f"ov_output: {chat_history_ov}")
     assert chat_history_ov == chat_history_hf
 
 
@@ -74,33 +90,45 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
 @pytest.mark.nightly
 def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict):
     # compares with HF when history in ov_genai is save as a text
-    device = 'CPU'
+    device = "CPU"
     chat_history_hf = []
     chat_history_ov = []
-    chat_prompt = ''
-
+    chat_prompt = ""
+
     # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True.
     # Need to regenerate openvino_tokenizer/detokenizer.
- model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - + model_id, path, tokenizer, model_opt, pipe = read_model( + (model_descr[0], model_descr[1] / "_test_chat"), add_special_tokens=False + ) + for prompt in quenstions: - chat_history_hf.append({'role': 'user', 'content': prompt}) - chat_history_ov.append({'role': 'user', 'content': prompt}) - - chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - - answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None) - answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) - chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - - chat_prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) + chat_history_hf.append({"role": "user", "content": prompt}) + chat_history_ov.append({"role": "user", "content": prompt}) + + chat_prompt = tokenizer.apply_chat_template( + chat_history_hf, tokenize=False, add_generation_prompt=True + ) + tokenized = tokenizer( + chat_prompt, return_tensors="pt", add_special_tokens=False + ) + + answer = model_opt.generate( + **tokenized, **generation_config, do_sample=False, repetition_penalty=None + ) + answer_str = tokenizer.decode( + answer[0, tokenized["input_ids"].numel() :], skip_special_tokens=True + ) + chat_history_hf.append({"role": "assistant", "content": answer_str}) + + chat_prompt = pipe.get_tokenizer().apply_chat_template( + chat_history_ov, add_generation_prompt=True + ) answer_ov = pipe.generate(chat_prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) - + chat_history_ov.append({"role": "assistant", "content": answer_ov}) + if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') + print(f"hf_output: {chat_history_hf}") + print(f"ov_output: {chat_history_ov}") assert chat_history_ov == chat_history_hf @@ -110,63 +146,75 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict) @pytest.mark.nightly def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict): # Check that when history is stored in KV cache results are the same as when history stored in a text. - device ='CPU' - + device = "CPU" + chat_history_with_kv_cache = [] chat_history_ov = [] - + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. # Need to regenerate openvino_tokenizer/detokenizer. 
- model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - pipe_with_kv_cache = ov_genai.LLMPipeline(str(path), device, config={"ENABLE_MMAP": False}) - + model_id, path, tokenizer, model_opt, pipe = read_model( + (model_descr[0], model_descr[1] / "_test_chat"), add_special_tokens=False + ) + pipe_with_kv_cache = ov_genai.LLMPipeline( + str(path), device, config={"ENABLE_MMAP": False} + ) + pipe_with_kv_cache.start_chat() for question in quenstions: - chat_history_with_kv_cache.append({'role': 'user', 'content': question}) + chat_history_with_kv_cache.append({"role": "user", "content": question}) answer = pipe_with_kv_cache.generate(question, **generation_config) - chat_history_with_kv_cache.append({'role': 'assistant', 'content': answer}) - - chat_history_ov.append({'role': 'user', 'content': question}) - prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) + chat_history_with_kv_cache.append({"role": "assistant", "content": answer}) + + chat_history_ov.append({"role": "user", "content": question}) + prompt = pipe.get_tokenizer().apply_chat_template( + chat_history_ov, add_generation_prompt=True + ) answer = pipe.generate(prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer}) + chat_history_ov.append({"role": "assistant", "content": answer}) pipe_with_kv_cache.finish_chat() if chat_history_ov != chat_history_with_kv_cache: - print(f'kvcache_hist: {chat_history_with_kv_cache}') - print(f'text_history: {chat_history_ov}') + print(f"kvcache_hist: {chat_history_with_kv_cache}") + print(f"text_history: {chat_history_ov}") assert chat_history_ov == chat_history_with_kv_cache conversation = [ - {'role': 'user', 'content': '1+1='}, - {'role': 'assistant', 'content': '1 + 1 = 2'}, - {'role': 'user', 'content': 'What is the previous answer?'}, - {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. Please ask me your next question.'}, - {'role': 'user', 'content': 'Why is the sun yellow?'}, - {'role': 'assistant', 'content': 'Because it emits yeloow light.'}, - {'role': 'user', 'content': 'What was my first question?'}, + {"role": "user", "content": "1+1="}, + {"role": "assistant", "content": "1 + 1 = 2"}, + {"role": "user", "content": "What is the previous answer?"}, + { + "role": "assistant", + "content": "The previous answer was: 1 + 1 = 2. Please ask me your next question.", + }, + {"role": "user", "content": "Why is the sun yellow?"}, + {"role": "assistant", "content": "Because it emits yeloow light."}, + {"role": "user", "content": "What was my first question?"}, ] + + @pytest.mark.precommit @pytest.mark.nightly -@pytest.mark.parametrize('chat_config', get_chat_templates()) +@pytest.mark.parametrize("chat_config", get_chat_templates()) def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): tokenizer_config = chat_config[1] # Will load openvino_model for tiny-random-phi as a placeholder # but indeed only Tokenizer and apply_chat_template will be tested. 
model_id, path, tokenizer, opt_model, pipe = read_model(get_models_list()[0]) - - full_history_str_hf = tokenizer.apply_chat_template(conversation, - add_generation_prompt=False, - tokenize=False, - **tokenizer_config) - + + full_history_str_hf = tokenizer.apply_chat_template( + conversation, add_generation_prompt=False, tokenize=False, **tokenizer_config + ) + tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) - full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False) + full_history_str = tok.apply_chat_template( + conversation, add_generation_prompt=False + ) if full_history_str != full_history_str_hf: - print(f'hf reference: {full_history_str_hf}') - print(f'ov_genai out: {full_history_str}') + print(f"hf reference: {full_history_str_hf}") + print(f"ov_genai out: {full_history_str}") assert full_history_str == full_history_str_hf @@ -174,7 +222,9 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): - model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat')) + model_id, path, tokenizer, model, stateful = read_model( + (model_descr[0], model_descr[1] / "_test_chat") + ) cb = get_continuous_batching(path) stateful.start_chat() cb.start_chat() @@ -185,12 +235,17 @@ def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Di # Test that finish_chat() doesn't fail just in case. cb.finish_chat() + @pytest.mark.precommit @pytest.mark.nightly def test_set_chat_template(): model_descr = get_chat_models_list()[0] - model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - pipe.get_tokenizer().set_chat_template("{% for message in messages %}{{ message['content'] }}{% endfor %}") + model_id, path, tokenizer, model_opt, pipe = read_model( + (model_descr[0], model_descr[1] / "_test_chat") + ) + pipe.get_tokenizer().set_chat_template( + "{% for message in messages %}{{ message['content'] }}{% endfor %}" + ) pipe.start_chat() generated = pipe.generate("a", max_new_tokens=1) pipe.finish_chat() diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index f80729d425..902fd1c45b 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -13,49 +13,69 @@ import torch import math from ov_genai_test_utils import ( - get_models_list, - read_model, + get_models_list, + read_model, load_pipe, - load_tok, - model_tmp_path, - STOP_CRITERIA_MAP, + load_tok, + model_tmp_path, + STOP_CRITERIA_MAP, get_continuous_batching, ) -def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): +def run_hf_ov_genai_comparison_batched( + model_descr, generation_config: Dict, prompts: Union[str, List[str]] +): model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects - num_beams = config['num_beams'] if 'num_beams' in config else 1 - config['num_return_sequences'] = num_beams - + num_beams = config["num_beams"] if "num_beams" in config else 1 + config["num_return_sequences"] = num_beams + if not isinstance(prompts, list): prompts = [prompts] - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + if 
"do_sample" not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set exlicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. - config['do_sample'] = False - config['repetition_penalty'] = None - + config["do_sample"] = False + config["repetition_penalty"] = None + generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) + if generation_config_hf.get("stop_criteria"): + generation_config_hf["early_stopping"] = STOP_CRITERIA_MAP[ + generation_config_hf.pop("stop_criteria") + ] + generation_config_hf.pop("ignore_eos", None) # Encode the batch of prompts tokenizer.padding_side = "left" - encoded_prompts = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True) - prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask'] - - hf_encoded_outputs = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) + encoded_prompts = tokenizer( + prompts, + return_tensors="pt", + padding=True, + truncation=True, + add_special_tokens=True, + ) + prompt_ids, attention_mask = ( + encoded_prompts["input_ids"], + encoded_prompts["attention_mask"], + ) + + hf_encoded_outputs = model.generate( + prompt_ids, attention_mask=attention_mask, **generation_config_hf + ) hf_outputs = [] for idx, hf_encoded_out in enumerate(hf_encoded_outputs): prompt_count = idx // num_beams - hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) + hf_outputs.append( + tokenizer.decode( + hf_encoded_out[prompt_ids[prompt_count].shape[0] :], + skip_special_tokens=True, + ) + ) ov_outputs = pipe.generate(prompts, **config).texts @@ -63,69 +83,83 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro ov_outputs.sort() for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)): if hf_output != ov_output: - print(f'hf_output: {hf_output}') - print(f'ov_output: {ov_output}') + print(f"hf_output: {hf_output}") + print(f"ov_output: {ov_output}") assert hf_output == ov_output + def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str): model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + if "do_sample" not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. 
- config['do_sample'] = False - config['repetition_penalty'] = None + config["do_sample"] = False + config["repetition_penalty"] = None generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) - - encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) + if generation_config_hf.get("stop_criteria"): + generation_config_hf["early_stopping"] = STOP_CRITERIA_MAP[ + generation_config_hf.pop("stop_criteria") + ] + generation_config_hf.pop("ignore_eos", None) + + encoded_prompt = tokenizer.encode( + prompt, return_tensors="pt", add_special_tokens=True + ) hf_encoded_output = model.generate(encoded_prompt, **generation_config_hf) - hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:], skip_special_tokens=True) + hf_output = tokenizer.decode( + hf_encoded_output[0, encoded_prompt.shape[1] :], skip_special_tokens=True + ) ov_output = pipe.generate(prompt, **config) - if config.get('num_return_sequences', 1) > 1: + if config.get("num_return_sequences", 1) > 1: assert hf_output in ov_output.texts else: if hf_output != ov_output: - print(f'hf_output: {hf_output}') - print(f'ov_output: {ov_output}') + print(f"hf_output: {hf_output}") + print(f"ov_output: {ov_output}") assert hf_output == ov_output + def hf_ov_genai_tensors_comparison( - model_descr, - generation_config: Dict, - input_ids: np.ndarray, - attention_mask: Optional[np.array] = None - ): - device = 'CPU' + model_descr, + generation_config: Dict, + input_ids: np.ndarray, + attention_mask: Optional[np.array] = None, +): + device = "CPU" model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + if "do_sample" not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set exlicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. 
- config['do_sample'] = False - config['repetition_penalty'] = None - + config["do_sample"] = False + config["repetition_penalty"] = None + generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) - + if generation_config_hf.get("stop_criteria"): + generation_config_hf["early_stopping"] = STOP_CRITERIA_MAP[ + generation_config_hf.pop("stop_criteria") + ] + generation_config_hf.pop("ignore_eos", None) + if attention_mask is not None: - inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) - inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) + inputs_ov = ov_genai.TokenizedInputs( + ov.Tensor(input_ids), ov.Tensor(attention_mask) + ) + inputs_hf = dict( + inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask) + ) else: inputs_hf = dict(inputs=torch.tensor(input_ids)) inputs_ov = ov.Tensor(input_ids) @@ -135,19 +169,57 @@ def hf_ov_genai_tensors_comparison( pipe = ov_genai.LLMPipeline(str(path), device) ov_output = pipe.generate(inputs_ov, **config) - hf_res = hf_output[0, input_ids.shape[1]:].numpy() + hf_res = hf_output[0, input_ids.shape[1] :].numpy() ov_res = np.array(ov_output.tokens, dtype=np.int64) assert np.all(ov_res == hf_res) test_cases = [ - (dict(max_new_tokens=20), 'table is made of'), - (dict(max_new_tokens=20), '你好! 你好嗎?'), - (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), + (dict(max_new_tokens=20), "table is made of"), + (dict(max_new_tokens=20), "你好! 
你好嗎?"), + ( + dict( + num_beam_groups=3, + num_beams=15, + num_return_sequences=15, + max_new_tokens=30, + diversity_penalty=1.0, + ), + "Alan Turing was a", + ), + ( + dict( + num_beam_groups=2, + num_beams=8, + num_return_sequences=8, + max_new_tokens=20, + diversity_penalty=1.0, + ), + "table is made of", + ), + ( + dict( + num_beam_groups=2, + num_beams=8, + num_return_sequences=8, + max_new_tokens=20, + diversity_penalty=1.0, + ), + "The Sun is yellow because", + ), + ( + dict( + num_beam_groups=2, + num_beams=8, + num_return_sequences=8, + max_new_tokens=20, + diversity_penalty=1.5, + ), + "The Sun is yellow because", + ), ] + + @pytest.mark.parametrize("generation_config,prompt", test_cases) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @@ -155,62 +227,73 @@ def hf_ov_genai_tensors_comparison( def test_decoding(model_descr, generation_config, prompt): run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + input_tensors_list = [ # input_ids, attention_mask (np.array([[1, 4, 42]], dtype=np.int64), None), (np.array([[1, 4, 42]], dtype=np.int64), np.array([[1, 1, 1]], dtype=np.int64)), ] + + @pytest.mark.parametrize("inputs", input_tensors_list) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly def test_ov_tensors(model_descr, inputs): - hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs) + hf_ov_genai_tensors_comparison( + read_model(model_descr), dict(max_new_tokens=20), *inputs + ) prompts = [ - 'table is made of', - '你好! 你好嗎?', - 'Alan Turing was a', - 'The Sun is yellow because', - ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a'] + "table is made of", + "你好! 你好嗎?", + "Alan Turing was a", + "The Sun is yellow because", + ["The Sun is yellow because", "Alan Turing was a", "Alan Turing was a"], ] + + @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.parametrize("prompt", prompts) @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.xfail( - raises=TypeError, + raises=TypeError, reason="pybind was unable to find ov::Tensor from openvino yet", strict=False, - condition=sys.platform in ["linux", "win32"] + condition=sys.platform in ["linux", "win32"], ) def test_genai_tokenizer_encode(model_descr, prompt): model_id, path, tokenizer, model, pipe = read_model(model_descr) tok = pipe.get_tokenizer() - + encoded_ov = tok.encode(prompt).input_ids.data if isinstance(prompt, list): - encoded_hf = tokenizer.batch_encode_plus(prompt)['input_ids'] + encoded_hf = tokenizer.batch_encode_plus(prompt)["input_ids"] for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf): assert np.all(tokens_ov == tokens_hf) else: encoded_hf = tokenizer.encode(prompt) assert np.all(encoded_hf == encoded_ov[0]) + encoded_prompts = [ [1, 1591, 338, 1754, 310], - [1, 17102, 323, 3864, 471, 263], - + [1, 17102, 323, 3864, 471, 263], # chineze characters [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882], - # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token [3113, 264, 364, 267], - # batched tokens - [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102, 323, 3864, 471, 263]] + [ + [1, 1591, 338, 1754, 310], + [1, 1591, 338, 1754, 310], + [1, 17102, 323, 3864, 471, 263], + ], ] + + @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.parametrize("encoded_prompt", encoded_prompts) @pytest.mark.precommit @@ -218,7 +301,7 @@ def 
test_genai_tokenizer_decode(model_descr, encoded_prompt): model_id, path, tokenizer, model, pipe = read_model(model_descr) tok = pipe.get_tokenizer() decoded_ov = tok.decode(encoded_prompt) - + if isinstance(encoded_prompt[0], list): decoded_hf = tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True) for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf): @@ -231,45 +314,62 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt): test_configs = [ dict(max_new_tokens=20), dict(max_new_tokens=200, ignore_eos=True), - dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) + dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0), ] batched_prompts = [ - ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], - ['hello', 'Here is the longest nowel ever: '], - ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], - ['table is made', 'table is made [force left pad tokens]'] + [ + "table is made", + "They sky is blue because", + "Difference between Jupiter and Mars is that", + ], + ["hello", "Here is the longest nowel ever: "], + ["Alan Turing was a", "return 0", "你好! 你好嗎?"], + ["table is made", "table is made [force left pad tokens]"], ] + + @pytest.mark.parametrize("generation_config", test_configs) @pytest.mark.parametrize("prompts", batched_prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly def test_multibatch(model_descr, generation_config, prompts): - run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) + run_hf_ov_genai_comparison_batched( + read_model(model_descr), generation_config, prompts + ) + + +prompts = [ + "The Sun is yellow because", + "Difference between Jupiter and Mars is that", + "table is made of", +] -prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] @pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) @pytest.mark.parametrize("group_size", [5, 3, 10]) @pytest.mark.parametrize("max_new_tokens", [20, 15]) -@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) +@pytest.mark.parametrize("diversity_penalty", [1.0, 1.5]) @pytest.mark.parametrize("prompt", prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_beam_search_decoding(model_descr, num_beam_groups, group_size, - max_new_tokens, diversity_penalty, prompt): +def test_beam_search_decoding( + model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt +): generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=diversity_penalty, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=diversity_penalty, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, ) run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) -@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) +@pytest.mark.parametrize( + "stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC] +) @pytest.mark.parametrize("prompt", prompts) @pytest.mark.parametrize("max_new_tokens", [10, 80]) @pytest.mark.parametrize("model_descr", get_models_list()) @@ -278,14 +378,14 @@ def 
test_beam_search_decoding(model_descr, num_beam_groups, group_size, def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): # todo: with EARLY stop_criteria looks like HF return unvalid out with sentence # while genai ends sentence with - if (stop_criteria == StopCriteria.EARLY): + if stop_criteria == StopCriteria.EARLY: pytest.skip() generation_config = dict( - num_beam_groups=2, - num_beams=2 * 3, - diversity_penalty=1.0, - num_return_sequences=2 * 3, - max_new_tokens=max_new_tokens, + num_beam_groups=2, + num_beams=2 * 3, + diversity_penalty=1.0, + num_return_sequences=2 * 3, + max_new_tokens=max_new_tokens, stop_criteria=stop_criteria, ) run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) @@ -298,14 +398,15 @@ def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): @pytest.mark.parametrize("prompt", prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.nightly -def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, - max_new_tokens, prompt): +def test_beam_search_long_sentences( + model_descr, num_beam_groups, group_size, max_new_tokens, prompt +): generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=1.0, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=1.0, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, ) run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) @@ -314,53 +415,64 @@ def user_defined_callback(subword): print(subword) -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.parametrize( + "callback", [print, user_defined_callback, lambda subword: print(subword)] +) @pytest.mark.precommit @pytest.mark.nightly def test_callback_one_string(callback): pipe = read_model(get_models_list()[0])[4] generation_config = pipe.get_generation_config() generation_config.max_new_tokens = 10 - pipe.generate('table is made of', generation_config, callback) + pipe.generate("table is made of", generation_config, callback) -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.parametrize( + "callback", [print, user_defined_callback, lambda subword: print(subword)] +) @pytest.mark.precommit @pytest.mark.nightly def test_callback_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): - pipe.generate(['1', '2'], ov_genai.GenerationConfig(), callback) + pipe.generate(["1", "2"], ov_genai.GenerationConfig(), callback) -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.parametrize( + "callback", [print, user_defined_callback, lambda subword: print(subword)] +) @pytest.mark.precommit @pytest.mark.nightly def test_callback_kwargs_one_string(callback): pipe = read_model(get_models_list()[0])[4] - pipe.generate('table is made of', max_new_tokens=10, streamer=callback) + pipe.generate("table is made of", max_new_tokens=10, streamer=callback) + -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.parametrize( + "callback", [print, user_defined_callback, lambda subword: print(subword)] +) @pytest.mark.precommit @pytest.mark.nightly 
@pytest.mark.parametrize("model_descr", get_models_list()) def test_callback_decoding_metallama(model_descr, callback): # On metallam this prompt generates output which can shorten after adding new tokens. # Test that streamer correctly handles such cases. - prompt = 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature' - if model_descr[0] != 'meta-llama/Meta-Llama-3-8B-Instruct': + prompt = "I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature" + if model_descr[0] != "meta-llama/Meta-Llama-3-8B-Instruct": pytest.skip() pipe = read_model(model_descr)[4] pipe.generate(prompt, max_new_tokens=300, streamer=callback) -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.parametrize( + "callback", [print, user_defined_callback, lambda subword: print(subword)] +) @pytest.mark.precommit @pytest.mark.nightly def test_callback_kwargs_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): - pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback) + pipe.generate(["1", "2"], max_new_tokens=10, streamer=callback) class Printer(ov_genai.StreamerBase): @@ -370,11 +482,15 @@ def __init__(self, tokenizer): # differences between Python’s MRO and C++’s mechanisms. ov_genai.StreamerBase.__init__(self) self.tokenizer = tokenizer + def put(self, token_id): # print(self.tokenizer.decode([token_id])) # Incorrect way to print, but easy to implement - print(token_id) # print only token because self.tokenizer.decode([token_id]) are not implemented yet + print( + token_id + ) # print only token because self.tokenizer.decode([token_id]) are not implemented yet + def end(self): - print('end') + print("end") @pytest.mark.precommit @@ -384,7 +500,7 @@ def test_streamer_one_string(): generation_config = pipe.get_generation_config() generation_config.max_new_tokens = 10 printer = Printer(pipe.get_tokenizer()) - pipe.generate('table is made of', generation_config, printer) + pipe.generate("table is made of", generation_config, printer) @pytest.mark.precommit @@ -393,7 +509,7 @@ def test_streamer_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) with pytest.raises(RuntimeError): - pipe.generate(['1', '2'], ov_genai.GenerationConfig(), printer) + pipe.generate(["1", "2"], ov_genai.GenerationConfig(), printer) @pytest.mark.precommit @@ -401,7 +517,9 @@ def test_streamer_batch_fail(): def test_streamer_kwargs_one_string(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) - pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer) + pipe.generate( + "table is made of", max_new_tokens=10, do_sample=False, streamer=printer + ) @pytest.mark.precommit @@ -410,26 +528,30 @@ def test_streamer_kwargs_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) with pytest.raises(RuntimeError): - pipe.generate('', num_beams=2, streamer=printer) + pipe.generate("", num_beams=2, streamer=printer) @pytest.mark.precommit @pytest.mark.nightly -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.parametrize( + "callback", [print, user_defined_callback, lambda subword: print(subword)] +) def 
test_operator_with_callback_one_string(callback): pipe = read_model(get_models_list()[0])[4] ten_tokens = pipe.get_generation_config() ten_tokens.max_new_tokens = 10 - pipe('talbe is made of', ten_tokens, callback) + pipe("talbe is made of", ten_tokens, callback) @pytest.mark.precommit @pytest.mark.nightly -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.parametrize( + "callback", [print, user_defined_callback, lambda subword: print(subword)] +) def test_operator_with_callback_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): - pipe(['1', '2'], ov_genai.GenerationConfig(), callback) + pipe(["1", "2"], ov_genai.GenerationConfig(), callback) @pytest.mark.precommit @@ -437,7 +559,7 @@ def test_operator_with_callback_batch_fail(callback): def test_operator_with_streamer_kwargs_one_string(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) - pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer) + pipe("hi", max_new_tokens=10, do_sample=True, streamer=printer) @pytest.mark.precommit @@ -446,43 +568,45 @@ def test_operator_with_streamer_kwargs_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) with pytest.raises(RuntimeError): - pipe('', num_beams=2, streamer=printer) + pipe("", num_beams=2, streamer=printer) @pytest.mark.precommit @pytest.mark.nightly def test_load_special_tokens_ids_1(model_tmp_path): # test when there is an available config.json - config_json = { + config_json = { "pad_token_id": 422, - "bos_token_id": 42, + "bos_token_id": 42, "eos_token_id": 37, } tok = load_tok([(config_json, "config.json")], model_tmp_path[1]) - assert tok.get_pad_token_id() == config_json['pad_token_id'] - assert tok.get_bos_token_id() == config_json['bos_token_id'] - assert tok.get_eos_token_id() == config_json['eos_token_id'] + assert tok.get_pad_token_id() == config_json["pad_token_id"] + assert tok.get_bos_token_id() == config_json["bos_token_id"] + assert tok.get_eos_token_id() == config_json["eos_token_id"] @pytest.mark.precommit @pytest.mark.nightly def test_load_special_tokens_str_2(model_tmp_path): # test with special_tokens_map - special_tokens_map_json = { + special_tokens_map_json = { "pad_token": {"content": ""}, "bos_token": {"content": ""}, "eos_token": {"content": ""}, } - tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1]) - assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"] - assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"] - assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"] + tok = load_tok( + [(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1] + ) + assert tok.get_pad_token() == special_tokens_map_json["pad_token"]["content"] + assert tok.get_bos_token() == special_tokens_map_json["bos_token"]["content"] + assert tok.get_eos_token() == special_tokens_map_json["eos_token"]["content"] @pytest.mark.precommit @pytest.mark.nightly def test_load_special_tokens_3_(model_tmp_path): - # special_tokens_map is not available + # special_tokens_map is not available # but tokenize_config.json exists # will load both string and integer representations tok_config_json = { @@ -497,9 +621,9 @@ def test_load_special_tokens_3_(model_tmp_path): } tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tmp_path[1]) - assert tok.get_pad_token() == 
tok_config_json['pad_token'] - assert tok.get_bos_token() == tok_config_json['bos_token'] - assert tok.get_eos_token() == tok_config_json['eos_token'] + assert tok.get_pad_token() == tok_config_json["pad_token"] + assert tok.get_bos_token() == tok_config_json["bos_token"] + assert tok.get_eos_token() == tok_config_json["eos_token"] assert tok.get_pad_token_id() == 422 assert tok.get_bos_token_id() == 37 @@ -512,78 +636,87 @@ def test_load_special_tokens_3(model_tmp_path): # both config.json is availabel and tokenizer_config.json available # check that it does not read int values from tokenizer_config.json if they are in config.json tok_config_json = { - "added_tokens_decoder": { - # integers differ from config.json to check they don't override config.json - "777": {"content": ""}, - "888": {"content": ""}, - "656": {"content": ""}, - }, - "pad_token": "", - "bos_token": "", - "eos_token": "", + "added_tokens_decoder": { + # integers differ from config.json to check they don't override config.json + "777": {"content": ""}, + "888": {"content": ""}, + "656": {"content": ""}, + }, + "pad_token": "", + "bos_token": "", + "eos_token": "", } - config_json = { + config_json = { "pad_token_id": 422, - "bos_token_id": 42, + "bos_token_id": 42, "eos_token_id": 37, } - configs = [ - (tok_config_json, "tokenizer_config.json"), - (config_json, "config.json") - ] + configs = [(tok_config_json, "tokenizer_config.json"), (config_json, "config.json")] tok = load_tok(configs, model_tmp_path[1]) - assert tok.get_pad_token_id() == config_json['pad_token_id'] - assert tok.get_bos_token_id() == config_json['bos_token_id'] - assert tok.get_eos_token_id() == config_json['eos_token_id'] + assert tok.get_pad_token_id() == config_json["pad_token_id"] + assert tok.get_bos_token_id() == config_json["bos_token_id"] + assert tok.get_eos_token_id() == config_json["eos_token_id"] - assert tok.get_pad_token() == tok_config_json['pad_token'] - assert tok.get_bos_token() == tok_config_json['bos_token'] - assert tok.get_eos_token() == tok_config_json['eos_token'] + assert tok.get_pad_token() == tok_config_json["pad_token"] + assert tok.get_bos_token() == tok_config_json["bos_token"] + assert tok.get_eos_token() == tok_config_json["eos_token"] @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.xfail( - raises=AssertionError, + raises=AssertionError, reason="CVS-143410 ov tokenizer should be aligned with hf", strict=False, ) def test_load_special_tokens_4(model_tmp_path): # only string representation is provided, find token integers by inference model_id, temp_path = model_tmp_path - tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_id, trust_remote_code=True + ) + special_tokens_map_json = {} token_str_int_map = {} - special_token_names = ['pad_token', 'bos_token', 'eos_token'] + special_token_names = ["pad_token", "bos_token", "eos_token"] for token_str in special_token_names: if hasattr(tokenizer, token_str): token_val = getattr(tokenizer, token_str) special_tokens_map_json.update({token_str: {"content": token_val}}) - token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0] + token_id = tokenizer(token_val, add_special_tokens=False)["input_ids"][0] token_str_int_map.update({token_str: token_id}) # since only string representations are present in the json will try to get by inference tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], temp_path) # check ids inferred correctly 
for special tokens existing if HF tokenizer - if 'pad_token' in token_str_int_map: - assert tok.get_pad_token_id() == token_str_int_map['pad_token'] - if 'bos_token' in token_str_int_map: - assert tok.get_bos_token_id() == token_str_int_map['bos_token'] - if 'eos_token' in token_str_int_map: - assert tok.get_eos_token_id() == token_str_int_map['eos_token'] + if "pad_token" in token_str_int_map: + assert tok.get_pad_token_id() == token_str_int_map["pad_token"] + if "bos_token" in token_str_int_map: + assert tok.get_bos_token_id() == token_str_int_map["bos_token"] + if "eos_token" in token_str_int_map: + assert tok.get_eos_token_id() == token_str_int_map["eos_token"] invalid_configs = [ dict(num_beam_groups=3, num_beams=15, do_sample=True), dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len - dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos - dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty - dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp - dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p - dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k + dict( + eos_token_id=42, ignore_eos=True + ), # no max_new_tokens, no max_len with ignore_eos + dict( + repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20 + ), # invalid penalty + dict( + temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20 + ), # invalid temp + dict( + top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20 + ), # invalid top_p + dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k ] + + @pytest.mark.parametrize("generation_config", invalid_configs) @pytest.mark.precommit @pytest.mark.nightly @@ -592,7 +725,7 @@ def test_invalid_configs(model_tmp_path, generation_config): config_json = {} pipe = load_pipe([(config_json, "config.json")], temp_path) with pytest.raises(RuntimeError): - pipe.generate('blah blah', **generation_config) + pipe.generate("blah blah", **generation_config) @pytest.mark.precommit @@ -605,25 +738,38 @@ def test_valid_configs(model_tmp_path): config.do_sample = True # no eos_token_id but it's loaded from config.json pipe.set_generation_config(config) + invalid_py_configs = [ dict(num_beam_groups=3, num_beams=15, do_sample=True), dict(unexisting_key_name=True), # no eos_token_id no max_new_tokens, no max_len - dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos - dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty - dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp - dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p - dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k + dict( + eos_token_id=42, ignore_eos=True + ), # no max_new_tokens, no max_len with ignore_eos + dict( + repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20 + ), # invalid penalty + dict( + temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20 + ), # invalid temp + dict( + top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20 + ), # invalid top_p + dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k ] + + @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.parametrize("generation_config", invalid_py_configs) def 
test_python_generation_config_validation(model_tmp_path, generation_config): model_id, temp_path = model_tmp_path pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) - + # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned # instead of RuntimeError, which is returned when GenerationConfig values are validated - return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError + return_exception_type = ( + ValueError if "unexisting_key_name" in generation_config else RuntimeError + ) with pytest.raises(return_exception_type): pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config)) @@ -633,11 +779,10 @@ def test_python_generation_config_validation(model_tmp_path, generation_config): def test_unicode_pybind_decoding_1(): # On this model this prompt generates unfinished utf string. # Test that pybind will not fail. - model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + model_id, path = "katuni4ka/tiny-random-phi3", Path("tiny-random-phi3") pipe = read_model((model_id, path))[4] - res_str = pipe.generate(',', max_new_tokens=4) - assert '�' == res_str[-1] - + res_str = pipe.generate(",", max_new_tokens=4) + assert "�" == res_str[-1] @pytest.mark.precommit @@ -645,10 +790,10 @@ def test_unicode_pybind_decoding_1(): def test_unicode_pybind_decoding_2(): # On this model this prompt generates unfinished utf string. # Test that pybind will not fail. - model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + model_id, path = "katuni4ka/tiny-random-phi3", Path("tiny-random-phi3") pipe = read_model((model_id, path))[4] res_str = pipe.generate([","], max_new_tokens=4) - assert '�' == res_str.texts[0][-1] + assert "�" == res_str.texts[0][-1] @pytest.mark.precommit @@ -656,22 +801,24 @@ def test_unicode_pybind_decoding_2(): def test_unicode_pybind_decoding_3(): # On this model this prompt generates unfinished utf-8 string # and streams it. Test that pybind will not fail while we pass string to python. - model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + model_id, path = "katuni4ka/tiny-random-phi3", Path("tiny-random-phi3") pipe = read_model((model_id, path))[4] res_str = [] pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) - assert '�' == res_str[-1] + assert "�" == res_str[-1] @pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") @pytest.mark.precommit @pytest.mark.nightly -@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") +@pytest.mark.skipif( + sys.platform.startswith("win"), reason="not enough space for this model on Win" +) def test_left_pad(): # test left pad tokenizer post processing implementation prompts = [ "The Sun is yellow because", - "The Sun is yellow because [force left pad tokens]" + "The Sun is yellow because [force left pad tokens]", ] models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) @@ -694,13 +841,14 @@ def test_left_pad(): @pytest.mark.parametrize("generation_config", test_configs) -@pytest.mark.parametrize("prompt", batched_prompts[1:]) # num_beams=15 diverges on the first prompt. +@pytest.mark.parametrize( + "prompt", batched_prompts[1:] +) # num_beams=15 diverges on the first prompt. 
@pytest.mark.precommit def test_continuous_batching_vs_stateful(prompt, generation_config): - model_id, path, tokenizer, model, stateful = read_model(( - "facebook/opt-125m", - Path("opt-125m") - )) + model_id, path, tokenizer, model, stateful = read_model( + ("facebook/opt-125m", Path("opt-125m")) + ) cb = get_continuous_batching(path) generated = cb.generate(prompt, **generation_config) reference = stateful.generate(prompt, **generation_config) @@ -710,86 +858,117 @@ def test_continuous_batching_vs_stateful(prompt, generation_config): for gen, ref in zip(generated.scores, reference.scores): assert math.isclose(gen, ref, abs_tol=0.0003) + @pytest.mark.parametrize("prompt", prompts) @pytest.mark.precommit def test_cb_streamer_vs_return_vs_stateful(prompt): - model_id, path, tokenizer, model, stateful = read_model(( - "facebook/opt-125m", - Path("opt-125m") - )) + model_id, path, tokenizer, model, stateful = read_model( + ("facebook/opt-125m", Path("opt-125m")) + ) cb = get_continuous_batching(path) streamed = [] - generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) + generated = cb.generate( + prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword) + ) reference = stateful.generate(prompt, max_new_tokens=20) assert generated == "".join(streamed) assert "".join(streamed) == reference -def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics: + +def run_perf_metrics_collection( + model_descr, generation_config: Dict, prompt: str +) -> ov_genai.PerfMetrics: model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + if "do_sample" not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. - config['do_sample'] = False - config['repetition_penalty'] = None + config["do_sample"] = False + config["repetition_penalty"] = None return pipe.generate([prompt], **config).perf_metrics test_cases = [ - (dict(max_new_tokens=20), 'table is made of'), + (dict(max_new_tokens=20), "table is made of"), ] + + @pytest.mark.parametrize("generation_config,prompt", test_cases) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly def test_perf_metrics(model_descr, generation_config, prompt): import time + start_time = time.perf_counter() - perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt) + perf_metrics = run_perf_metrics_collection( + read_model(model_descr), generation_config, prompt + ) total_time = (time.perf_counter() - start_time) * 1000 - + # Check that load time is adequate. load_time = perf_metrics.get_load_time() - assert load_time > 0 and load_time < 1000.0 - + assert load_time > 0 and load_time < 1000.0 + # Check that num input and generated tokens are adequate. 
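# A minimal usage sketch of the PerfMetrics accessors exercised in this test, assuming
# pipe comes from read_model() as above; the printed labels are illustrative only:
#
#     metrics = pipe.generate(["table is made of"], max_new_tokens=20).perf_metrics
#     print("load time, ms:", metrics.get_load_time())
#     print("generated tokens:", metrics.get_num_generated_tokens())
#     mean_ttft, std_ttft = metrics.get_ttft()    # unpacks to the same values as .mean / .std
#     mean_tpot, std_tpot = metrics.get_tpot()
#     print("TTFT %.1f ms, TPOT %.1f ms" % (mean_ttft, mean_tpot))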
num_generated_tokens = perf_metrics.get_num_generated_tokens() - assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens'] - + assert ( + num_generated_tokens > 0 + and num_generated_tokens <= generation_config["max_new_tokens"] + ) + num_input_tokens = perf_metrics.get_num_input_tokens() assert num_input_tokens > 0 and num_input_tokens <= len(prompt) mean_ttft, std_ttft = perf_metrics.get_ttft() - assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std) + assert (mean_ttft, std_ttft) == ( + perf_metrics.get_ttft().mean, + perf_metrics.get_ttft().std, + ) assert mean_ttft > 0 and mean_ttft < 1000.0 mean_tpot, std_tpot = perf_metrics.get_tpot() - assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std) + assert (mean_tpot, std_tpot) == ( + perf_metrics.get_tpot().mean, + perf_metrics.get_tpot().std, + ) assert mean_tpot > 0 and mean_ttft < 1000.0 mean_throughput, std_throughput = perf_metrics.get_throughput() - assert (mean_throughput, std_throughput) == (perf_metrics.get_throughput().mean, perf_metrics.get_throughput().std) + assert (mean_throughput, std_throughput) == ( + perf_metrics.get_throughput().mean, + perf_metrics.get_throughput().std, + ) assert mean_throughput > 0 and mean_throughput < 20000.0 - + mean_gen_duration, std_gen_duration = perf_metrics.get_generate_duration() - assert (mean_gen_duration, std_gen_duration) == (perf_metrics.get_generate_duration().mean, perf_metrics.get_generate_duration().std) + assert (mean_gen_duration, std_gen_duration) == ( + perf_metrics.get_generate_duration().mean, + perf_metrics.get_generate_duration().std, + ) assert mean_gen_duration > 0 and load_time + mean_gen_duration < total_time assert std_gen_duration == 0 mean_tok_duration, std_tok_duration = perf_metrics.get_tokenization_duration() - assert (mean_tok_duration, std_tok_duration) == (perf_metrics.get_tokenization_duration().mean, perf_metrics.get_tokenization_duration().std) + assert (mean_tok_duration, std_tok_duration) == ( + perf_metrics.get_tokenization_duration().mean, + perf_metrics.get_tokenization_duration().std, + ) assert mean_tok_duration > 0 and mean_tok_duration < mean_gen_duration assert std_tok_duration == 0 mean_detok_duration, std_detok_duration = perf_metrics.get_detokenization_duration() - assert (mean_detok_duration, std_detok_duration) == (perf_metrics.get_detokenization_duration().mean, perf_metrics.get_detokenization_duration().std) + assert (mean_detok_duration, std_detok_duration) == ( + perf_metrics.get_detokenization_duration().mean, + perf_metrics.get_detokenization_duration().std, + ) assert mean_detok_duration > 0 and mean_detok_duration < mean_gen_duration assert std_detok_duration == 0 - + # assert that calculating statistics manually from the raw counters we get the same restults as from PerfMetrics raw_metrics = perf_metrics.raw_metrics raw_dur = np.array(raw_metrics.generate_durations) / 1000 diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_preemption.py index 239ae6399c..daa0c8262f 100644 --- a/tests/python_tests/test_preemption.py +++ b/tests/python_tests/test_preemption.py @@ -4,10 +4,20 @@ import pytest from openvino_genai import GenerationConfig -from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ - get_scheduler_config, run_test_pipeline, get_beam_search, get_greedy, \ - get_multinomial_all_parameters, 
get_multinomial_temperature_and_num_return_sequence, \ - get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p +from common import ( + get_model_and_tokenizer, + save_ov_model_from_optimum, + generate_and_compare_with_reference_text, + get_scheduler_config, + run_test_pipeline, + get_beam_search, + get_greedy, + get_multinomial_all_parameters, + get_multinomial_temperature_and_num_return_sequence, + get_multinomial_temperature_and_top_k, + get_multinomial_temperature, + get_multinomial_temperature_and_top_p, +) from test_sampling import RandomSamplingTestStruct, get_current_plarform_ref_texts @@ -17,6 +27,7 @@ def get_greedy_seq_len_300() -> GenerationConfig: generation_config.max_new_tokens = 300 return generation_config + def get_beam_search_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -25,14 +36,67 @@ def get_beam_search_seq_len_300() -> GenerationConfig: generation_config.num_return_sequences = generation_config.num_beams return generation_config -scheduler_params_list = [({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 10, "block_size": 32, "dynamic_split_fuse": True}, get_greedy_seq_len_300()), - ({"num_kv_blocks": 10, "block_size": 32, "dynamic_split_fuse": False}, get_greedy_seq_len_300()), - ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), - ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), - ({"num_kv_blocks": 100, "block_size": 32, "dynamic_split_fuse": True}, get_beam_search_seq_len_300()), - ({"num_kv_blocks": 100, "block_size": 32, "dynamic_split_fuse": False}, get_beam_search_seq_len_300())] + +scheduler_params_list = [ + ( + { + "num_kv_blocks": 2, + "block_size": 32, + "dynamic_split_fuse": True, + "max_num_batched_tokens": 256, + "max_num_seqs": 256, + }, + get_greedy(), + ), + ( + { + "num_kv_blocks": 2, + "block_size": 32, + "dynamic_split_fuse": False, + "max_num_batched_tokens": 256, + "max_num_seqs": 256, + }, + get_greedy(), + ), + ( + {"num_kv_blocks": 10, "block_size": 32, "dynamic_split_fuse": True}, + get_greedy_seq_len_300(), + ), + ( + {"num_kv_blocks": 10, "block_size": 32, "dynamic_split_fuse": False}, + get_greedy_seq_len_300(), + ), + ( + { + "num_kv_blocks": 34, + "block_size": 32, + "dynamic_split_fuse": True, + "max_num_batched_tokens": 256, + "max_num_seqs": 256, + }, + get_beam_search(), + ), + ( + { + "num_kv_blocks": 34, + "block_size": 32, + "dynamic_split_fuse": False, + "max_num_batched_tokens": 256, + "max_num_seqs": 256, + }, + get_beam_search(), + ), + ( + {"num_kv_blocks": 100, "block_size": 32, "dynamic_split_fuse": True}, + get_beam_search_seq_len_300(), + ), + ( + {"num_kv_blocks": 100, "block_size": 32, "dynamic_split_fuse": False}, + get_beam_search_seq_len_300(), + ), +] + + @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_preemption(tmp_path, params): @@ -50,50 +114,68 @@ def test_preemption(tmp_path, params): "How are you?", "Tell me something about Canada?", ], - ref_texts=get_current_plarform_ref_texts({ - "linux": [ - [ - "\n\nOpenVINO is a live 
platform that allows users to create and manage a new library for open source applications.\n\nOpenVINO is" - ], - [ - " You're getting much better results from doing this, than you are by not doing this. I have a BH and I was so far" - ], - [ - "\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version." - ], - ], - "win32": [ - [ - "\n\nOpenVINO is a live platform that allows users to create and manage a new library of applications on the Virtuoso server, which can" - ], - [ - " You're getting much better results from doing this, than you are by not doing this. If you are truly trying to do something good," + ref_texts=get_current_plarform_ref_texts( + { + "linux": [ + [ + "\n\nOpenVINO is a live platform that allows users to create and manage a new library for open source applications.\n\nOpenVINO is" + ], + [ + " You're getting much better results from doing this, than you are by not doing this. I have a BH and I was so far" + ], + [ + "\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version." + ], ], - [ - "\nI'm from Canada, and I'm from the US, so I'm not sure what you're talking about.\nI'm Canadian and I" + "win32": [ + [ + "\n\nOpenVINO is a live platform that allows users to create and manage a new library of applications on the Virtuoso server, which can" + ], + [ + " You're getting much better results from doing this, than you are by not doing this. If you are truly trying to do something good," + ], + [ + "\nI'm from Canada, and I'm from the US, so I'm not sure what you're talking about.\nI'm Canadian and I" + ], ], - ], - }), + } + ), ) # todo: Anastasiia Pnevskaya: fix the test because it is hanging according max_new_tokens = std::numeric_limits::max() @pytest.mark.parametrize("dynamic_split_fuse", [True, False]) @pytest.mark.precommit -@pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally.") +@pytest.mark.skip( + reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally." 
+) def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): generation_configs = multinomial_params.generation_config for config in generation_configs: config.rng_seed = 0 config.max_new_tokens = 30 - model_id : str = "facebook/opt-125m" + model_id: str = "facebook/opt-125m" model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - model_path : Path = tmp_path / model_id + model_path: Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(model_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) + scheduler_config = get_scheduler_config( + { + "num_kv_blocks": 3, + "block_size": 32, + "dynamic_split_fuse": dynamic_split_fuse, + "max_num_batched_tokens": 256, + "max_num_seqs": 256, + } + ) + generate_and_compare_with_reference_text( + model_path, + multinomial_params.prompts, + multinomial_params.ref_texts, + generation_configs, + scheduler_config, + ) multinomial_params_n_seq = RandomSamplingTestStruct( @@ -107,72 +189,90 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): "What is the current", "Tell me something about UAE?", ], - ref_texts=get_current_plarform_ref_texts({ - "linux": [ - [ - "\nI've seen this expression used too many times without making sense.\nAs an AI engineer, and as a scientist, we should make everything easier" - ], - [ - " position of the Z-shaped groove?\n0.41\nWhat is the current position of the Z-shaped groove?\n0.11\n", - " status of all of this? I can't stop thinking about it.\nIt's been a while since I've seen it. I found it a", - " status of your blog? Do you accept feedback?\nYes, I’m happy to accept feedback at this time (I’m a" - ], - [ - "\nIt's in the middle of nowhere if you haven’t seen one yet! It might be more convenient there than anywhere else.. maybe take", - "\nUAE is a country with some great culture that has been living under Islamic oppression for almost 60 years now (including 20 years as part of Arab", - "\nNope, just wanted to say how awesome and beautiful it was when my brother came back from an adventure trip across Asia - our 2nd year", - "\nI don't know anything. I'm not sure what kind this sub wants though... but apparently they are pretty bad at making videos/photos", - ], - ], - "win32": [ - [ - "\nI've had a friend with the capacity to test this in his own words.\nThe big problem with real-world results is the economics of" + ref_texts=get_current_plarform_ref_texts( + { + "linux": [ + [ + "\nI've seen this expression used too many times without making sense.\nAs an AI engineer, and as a scientist, we should make everything easier" + ], + [ + " position of the Z-shaped groove?\n0.41\nWhat is the current position of the Z-shaped groove?\n0.11\n", + " status of all of this? I can't stop thinking about it.\nIt's been a while since I've seen it. I found it a", + " status of your blog? Do you accept feedback?\nYes, I’m happy to accept feedback at this time (I’m a", + ], + [ + "\nIt's in the middle of nowhere if you haven’t seen one yet! It might be more convenient there than anywhere else.. 
maybe take", + "\nUAE is a country with some great culture that has been living under Islamic oppression for almost 60 years now (including 20 years as part of Arab", + "\nNope, just wanted to say how awesome and beautiful it was when my brother came back from an adventure trip across Asia - our 2nd year", + "\nI don't know anything. I'm not sure what kind this sub wants though... but apparently they are pretty bad at making videos/photos", + ], ], - [ - " position of the patent application number of the present invention?\n\nIn the present invention, the present invention relates to an improved method for manufacturing a semic", - " status of your town? How many houses do you have?\nThere are about three houses in our town. The closest place to us is about 25", - " status of all the other passengers?\nWe're the only ones left, so no...\nI don't think they'll really leave.\nThey" + "win32": [ + [ + "\nI've had a friend with the capacity to test this in his own words.\nThe big problem with real-world results is the economics of" + ], + [ + " position of the patent application number of the present invention?\n\nIn the present invention, the present invention relates to an improved method for manufacturing a semic", + " status of your town? How many houses do you have?\nThere are about three houses in our town. The closest place to us is about 25", + " status of all the other passengers?\nWe're the only ones left, so no...\nI don't think they'll really leave.\nThey", + ], + [ + "\nI don't have any knowledge on them. We are based out near Dubai so hopefully they will take care of us soon enough :) thanks though :", + "\nUAE is not one of the richest countries in Asia but definitely among those most corrupt nations because this corruption (and its own endemic practices) still", + "\nNope, I'm just going through my first semester there right now and it was nice to see some people who were doing well haha - we", + "\nIt's a country where your parents can never give you anything at all! It also has an extremely low education system for many years... You", + ], ], - [ - "\nI don't have any knowledge on them. We are based out near Dubai so hopefully they will take care of us soon enough :) thanks though :", - "\nUAE is not one of the richest countries in Asia but definitely among those most corrupt nations because this corruption (and its own endemic practices) still", - "\nNope, I'm just going through my first semester there right now and it was nice to see some people who were doing well haha - we", - "\nIt's a country where your parents can never give you anything at all! It also has an extremely low education system for many years... You", + "darwin": [ + [ + "\nI've had a friend with the capacity to test this in his own words.\nThe big problem with real-world results is the rigidity" + ], + [ + " position of the patent application number of the present invention?\n\nIn the present invention, the present invention relates to an improved method for manufacturing a semic", + " status of your town? How many houses do you have?\nThere are about three houses in our town. The closest place to us is about 25", + " status of all the other passengers?\nWe're the only ones left, so no...\nI don't think they'll really leave.\nThey", + ], + [ + "\nI don't have any knowledge on them. 
We are based out near Dubai so hopefully they will take care of us soon enough :) thanks though :", + "\nUAE is not one of the richest countries in Asia but definitely among those most corrupt nations because this corruption (and its own endemic practices) still", + "\nNope, I'm just going through my first semester there right now and it was nice to see some people who were doing well haha - we", + "\nIt's a country where your parents can never give you anything at all! It also has an extremely low education system for many years... You", + ], ], - ], - "darwin": [ - [ - "\nI've had a friend with the capacity to test this in his own words.\nThe big problem with real-world results is the rigidity" - ], - [ - " position of the patent application number of the present invention?\n\nIn the present invention, the present invention relates to an improved method for manufacturing a semic", - " status of your town? How many houses do you have?\nThere are about three houses in our town. The closest place to us is about 25", - " status of all the other passengers?\nWe're the only ones left, so no...\nI don't think they'll really leave.\nThey" - ], - [ - "\nI don't have any knowledge on them. We are based out near Dubai so hopefully they will take care of us soon enough :) thanks though :", - "\nUAE is not one of the richest countries in Asia but definitely among those most corrupt nations because this corruption (and its own endemic practices) still", - "\nNope, I'm just going through my first semester there right now and it was nice to see some people who were doing well haha - we", - "\nIt's a country where your parents can never give you anything at all! It also has an extremely low education system for many years... You", - ], - ], - }), + } + ), ) @pytest.mark.parametrize("dynamic_split_fuse", [True, False]) @pytest.mark.precommit -@pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally.") +@pytest.mark.skip( + reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally." 
+) def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): generation_configs = multinomial_params_n_seq.generation_config for config in generation_configs: config.rng_seed = 0 - model_id : str = "facebook/opt-125m" + model_id: str = "facebook/opt-125m" model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - model_path : Path = tmp_path / model_id + model_path: Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) - scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(model_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) \ No newline at end of file + scheduler_config = get_scheduler_config( + { + "num_kv_blocks": 8, + "block_size": 32, + "dynamic_split_fuse": dynamic_split_fuse, + "max_num_batched_tokens": 256, + "max_num_seqs": 256, + } + ) + generate_and_compare_with_reference_text( + model_path, + multinomial_params_n_seq.prompts, + multinomial_params_n_seq.ref_texts, + generation_configs, + scheduler_config, + ) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index 1e7a1b81a5..a846198e3a 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -10,23 +10,52 @@ from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer from typing import List, TypedDict -from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \ - generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \ - get_greedy_with_penalties, get_multinomial_temperature, \ - get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ - get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \ - get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ - generate_and_compare_with_reference_text, get_greedy, get_greedy_with_min_and_max_tokens, \ - get_greedy_with_single_stop_string, get_greedy_with_multiple_stop_strings, get_greedy_with_multiple_stop_strings_no_match, \ - get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \ - get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \ - get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ - generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \ - run_continuous_batching +from common import ( + run_test_pipeline, + get_models_list, + get_model_and_tokenizer, + save_ov_model_from_optimum, + generate_and_compare_with_reference_text, + get_greedy, + get_beam_search, + get_multinomial_temperature, + get_greedy_with_penalties, + get_multinomial_temperature, + get_multinomial_temperature_and_top_k, + get_multinomial_temperature_and_top_p, + get_multinomial_temperature_top_p_and_top_k, + DEFAULT_SCHEDULER_CONFIG, + get_greedy_with_repetition_penalty, + get_multinomial_all_parameters, + get_multinomial_temperature_and_num_return_sequence, + 
generate_and_compare_with_reference_text, + get_greedy, + get_greedy_with_min_and_max_tokens, + get_greedy_with_single_stop_string, + get_greedy_with_multiple_stop_strings, + get_greedy_with_multiple_stop_strings_no_match, + get_beam_search, + get_beam_search_min_and_max_tokens, + get_beam_search_with_single_stop_string, + get_beam_search_with_multiple_stop_strings, + get_beam_search_with_multiple_stop_strings_no_match, + get_multinomial_max_and_min_token, + get_multinomial_temperature_and_frequence_penalty, + get_multinomial_temperature_and_presence_penalty, + generate_and_compare_with_hf, + get_multinomial_temperature_and_repetition_penalty, + get_scheduler_config, + run_continuous_batching, +) @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) +@pytest.mark.parametrize( + "model_id", + get_models_list( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit") + ), +) @pytest.mark.xfail( raises=RuntimeError, reason="Test fails with error: CPU: head size must be multiple of 16, current: X. CVS-145986.", @@ -37,70 +66,102 @@ def test_sampling_precommit(tmp_path, model_id): @pytest.mark.nightly -@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) +@pytest.mark.parametrize( + "model_id", + get_models_list( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly") + ), +) def test_sampling_nightly(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) + @pytest.mark.real_models -@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) +@pytest.mark.parametrize( + "model_id", + get_models_list( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), "models", "real_models" + ) + ), +) def test_real_models(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) @pytest.mark.precommit def test_eos_beam_search(tmp_path): - ''' + """ Current test checks that in case of beam search, some generation results explicitly have EOS token at the end, which is aligned with HF Example of current output: { -1.23264, that I don't know about. I don't know what you're talking about, but I'm pretty sure it's a Canadian thing. 
} - ''' + """ model_id = "facebook/opt-125m" prompts = ["Tell me something about Canada"] generation_configs = [get_beam_search()] scheduler_config = get_scheduler_config() - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + generate_and_compare_with_hf( + model_id, prompts, generation_configs, scheduler_config, tmp_path + ) @pytest.mark.precommit def test_eos_greedy(tmp_path): - ''' + """ Current test checks that in case of gready, some generation results explicitly have EOS token at the end, which is aligned with HF: Example of current output: { a software program } - ''' + """ model_id = "bigscience/bloomz-560m" prompts = ["What is OpenVINO?"] generation_configs = [get_greedy()] scheduler_config = get_scheduler_config() - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + generate_and_compare_with_hf( + model_id, prompts, generation_configs, scheduler_config, tmp_path + ) + @pytest.mark.precommit -@pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(), - get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), - get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), ], - ids=[ - "greedy", - "greedy_with_min_and_max_tokens", - "greedy_with_repetition_penalty", - "greedy_with_single_stop_string", - "greedy_with_multiple_stop_strings", - "greedy_with_multiple_stop_strings_no_match", - "beam", - "beam_search_min_and_max_tokens", - "beam_search_with_multiple_stop_strings_no_match", - ]) +@pytest.mark.parametrize( + "generation_config", + [ + get_greedy(), + get_greedy_with_min_and_max_tokens(), + get_greedy_with_repetition_penalty(), + get_greedy_with_single_stop_string(), + get_greedy_with_multiple_stop_strings(), + get_greedy_with_multiple_stop_strings_no_match(), + get_beam_search(), + get_beam_search_min_and_max_tokens(), + get_beam_search_with_multiple_stop_strings_no_match(), + ], + ids=[ + "greedy", + "greedy_with_min_and_max_tokens", + "greedy_with_repetition_penalty", + "greedy_with_single_stop_string", + "greedy_with_multiple_stop_strings", + "greedy_with_multiple_stop_strings_no_match", + "beam", + "beam_search_min_and_max_tokens", + "beam_search_with_multiple_stop_strings_no_match", + ], +) def test_individual_generation_configs_deterministic(tmp_path, generation_config): prompts = [ - "What is OpenVINO?", - ] + "What is OpenVINO?", + ] generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + model_id: str = "facebook/opt-125m" + generate_and_compare_with_hf( + model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path + ) + @pytest.mark.precommit @pytest.mark.xfail( @@ -108,18 +169,26 @@ def test_individual_generation_configs_deterministic(tmp_path, generation_config reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail. 
If it changes, these cases shall be merged to the test above.", strict=True, ) -@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings(),], - ids=[ - "beam_search_with_single_stop_string", - "beam_search_with_multiple_stop_strings", - ]) +@pytest.mark.parametrize( + "generation_config", + [ + get_beam_search_with_single_stop_string(), + get_beam_search_with_multiple_stop_strings(), + ], + ids=[ + "beam_search_with_single_stop_string", + "beam_search_with_multiple_stop_strings", + ], +) def test_beam_search_with_stop_string(tmp_path, generation_config): prompts = [ - "What is OpenVINO?", - ] + "What is OpenVINO?", + ] generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + model_id: str = "facebook/opt-125m" + generate_and_compare_with_hf( + model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path + ) class PlatformsRefTexts(TypedDict, total=False): @@ -160,18 +229,20 @@ class RandomSamplingTestStruct: RandomSamplingTestStruct( generation_config=get_multinomial_temperature_and_top_p(), prompts=["What is OpenVINO?"], - ref_texts=get_current_plarform_ref_texts({ - "linux": [ - [ - "\nOpenVINO is an online application that allows users to create, test, and analyze their own software using a collection of software packages. The application" - ] - ], - "win32": [ - [ - "\n\nOpenVINO is a software development platform designed to allow developers to develop and commercialize the most important software products on the web. OpenV" - ] - ], - }) + ref_texts=get_current_plarform_ref_texts( + { + "linux": [ + [ + "\nOpenVINO is an online application that allows users to create, test, and analyze their own software using a collection of software packages. The application" + ] + ], + "win32": [ + [ + "\n\nOpenVINO is a software development platform designed to allow developers to develop and commercialize the most important software products on the web. OpenV" + ] + ], + } + ), ), RandomSamplingTestStruct( generation_config=get_multinomial_temperature_and_top_k(), @@ -185,18 +256,20 @@ class RandomSamplingTestStruct: RandomSamplingTestStruct( generation_config=get_multinomial_temperature_top_p_and_top_k(), prompts=["What is OpenVINO?"], - ref_texts=get_current_plarform_ref_texts({ - "linux": [ - [ - "\nOpenVINO is an open source software that allows developers to create, manage, and distribute software. It is an open source project that allows developers" - ] - ], - "win32": [ - [ - "\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open" - ] - ], - }), + ref_texts=get_current_plarform_ref_texts( + { + "linux": [ + [ + "\nOpenVINO is an open source software that allows developers to create, manage, and distribute software. It is an open source project that allows developers" + ] + ], + "win32": [ + [ + "\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open" + ] + ], + } + ), ), RandomSamplingTestStruct( generation_config=get_multinomial_temperature_and_repetition_penalty(), @@ -214,31 +287,33 @@ class RandomSamplingTestStruct: [ " the exact same image?\nI've tried multiple times to find it, but I'm still not sure. I am sure it's the exact same", " your new house?\nAnywhere that has a GPS. 
It will be up to you.", - " your cat? He is more likely to be on the floor with him.\nTalduck" + " your cat? He is more likely to be on the floor with him.\nTalduck", ] ], ), RandomSamplingTestStruct( generation_config=get_multinomial_all_parameters(), prompts=["Tell me something about UAE"], - ref_texts=get_current_plarform_ref_texts({ - "linux": [ - [ - " and how it's not like we're all in the same boat right now lol (or even close) 😂😁! Just curious :) If", - "? You are my country... so what does our military do here?? What am i missing out on?? And why don't u tell us?", - "?\nThe U.S government has been doing quite well with foreign-made aircraft for many years under US administration....and they have very good reasons", - "? I think that is a bit of an anomaly, but you might want to ask yourself this question: Where can some young people from Dubai or Bahrain", - ] - ], - "win32": [ - [ - "? I think that is a bit of an anomaly, especially since there aren't many Americans living here (like us). What makes you say they've", - "? You are my country... so what does our future have to do with your problems?? \U0001f609\U0001f608\U0001f495 \U0001f5a4\ufffd", - "?\nThe U.S government has been doing quite well for decades now when compared strictly directly or indirectly as regards security issues.. They even made some", - " and how it's not like we're all in the same boat either! We had such fun meeting each other at different times this past summer :) It", - ] - ], - }), + ref_texts=get_current_plarform_ref_texts( + { + "linux": [ + [ + " and how it's not like we're all in the same boat right now lol (or even close) 😂😁! Just curious :) If", + "? You are my country... so what does our military do here?? What am i missing out on?? And why don't u tell us?", + "?\nThe U.S government has been doing quite well with foreign-made aircraft for many years under US administration....and they have very good reasons", + "? I think that is a bit of an anomaly, but you might want to ask yourself this question: Where can some young people from Dubai or Bahrain", + ] + ], + "win32": [ + [ + "? I think that is a bit of an anomaly, especially since there aren't many Americans living here (like us). What makes you say they've", + "? You are my country... so what does our future have to do with your problems?? \U0001f609\U0001f608\U0001f495 \U0001f5a4\ufffd", + "?\nThe U.S government has been doing quite well for decades now when compared strictly directly or indirectly as regards security issues.. They even made some", + " and how it's not like we're all in the same boat either! We had such fun meeting each other at different times this past summer :) It", + ] + ], + } + ), ), RandomSamplingTestStruct( generation_config=get_multinomial_temperature_and_presence_penalty(), @@ -270,61 +345,74 @@ class RandomSamplingTestStruct: RandomSamplingTestStruct( generation_config=get_multinomial_max_and_min_token(), prompts=["What is OpenVINO?"], - ref_texts=get_current_plarform_ref_texts({ - "linux": [ - [ - "\nOpenVINO is a Linux distro. It's not as simple as using the Linux distro itself. OpenVINO is essentially a dist", - "\nOpenVINO is an open-source open-source software that allows anyone to work with a virtual machine, from a smartphone to an iPhone,", - "\n\nOpenVINO is a social networking tool. OpenVINO is a free virtualization service that works at scale. The tool provides the ability", - ] - ], - "win32": [ - [ - "\nOpenVINO is the latest addition to the OpenVINO series of platforms. 
OpenVINO is an open source software development framework for all platforms", - "\nOpenVINO is a browser-based virtual assistant that enables developers and developers to quickly communicate with their own virtual machines. Using this virtual assistant,", - "\n\nOpenVINO is a program designed to help you find the best open source open source software. The program, which is a lightweight package and", - ] - ], - }), + ref_texts=get_current_plarform_ref_texts( + { + "linux": [ + [ + "\nOpenVINO is a Linux distro. It's not as simple as using the Linux distro itself. OpenVINO is essentially a dist", + "\nOpenVINO is an open-source open-source software that allows anyone to work with a virtual machine, from a smartphone to an iPhone,", + "\n\nOpenVINO is a social networking tool. OpenVINO is a free virtualization service that works at scale. The tool provides the ability", + ] + ], + "win32": [ + [ + "\nOpenVINO is the latest addition to the OpenVINO series of platforms. OpenVINO is an open source software development framework for all platforms", + "\nOpenVINO is a browser-based virtual assistant that enables developers and developers to quickly communicate with their own virtual machines. Using this virtual assistant,", + "\n\nOpenVINO is a program designed to help you find the best open source open source software. The program, which is a lightweight package and", + ] + ], + } + ), ), ] @pytest.mark.precommit -@pytest.mark.parametrize("test_struct", RANDOM_SAMPLING_TEST_CASES, - ids=["multinomial_temperature", - "multinomial_temperature_and_top_p", - "multinomial_temperature_and_top_k", - "multinomial_temperature_top_p_and_top_k", - "multinomial_temperature_and_repetition_penalty", - "multinomial_temperature_and_num_return_sequence", - "multinomial_all_parameters", - "multinomial_temperature_and_presence_penalty", - "multinomial_temperature_and_frequence_penalty", - "greedy_with_penalties", - "multinomial_max_and_min_token"]) -def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct): +@pytest.mark.parametrize( + "test_struct", + RANDOM_SAMPLING_TEST_CASES, + ids=[ + "multinomial_temperature", + "multinomial_temperature_and_top_p", + "multinomial_temperature_and_top_k", + "multinomial_temperature_top_p_and_top_k", + "multinomial_temperature_and_repetition_penalty", + "multinomial_temperature_and_num_return_sequence", + "multinomial_all_parameters", + "multinomial_temperature_and_presence_penalty", + "multinomial_temperature_and_frequence_penalty", + "greedy_with_penalties", + "multinomial_max_and_min_token", + ], +) +def test_individual_generation_configs_random( + tmp_path, test_struct: RandomSamplingTestStruct +): generation_config = test_struct.generation_config prompts = test_struct.prompts generation_config.rng_seed = 0 generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" + model_id: str = "facebook/opt-125m" model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - model_path : Path = tmp_path / model_id + model_path: Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) # run multinomial without comparison with reference - _ = run_continuous_batching(model_path, DEFAULT_SCHEDULER_CONFIG, prompts, generation_configs) + _ = run_continuous_batching( + model_path, DEFAULT_SCHEDULER_CONFIG, prompts, generation_configs + ) # Reference comparison is not performed as sampling results are non-deterministic. 
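# A rough cache-sizing sketch, assuming the same block accounting as the comment in
# test_preemption_with_multinomial_n_seq: a sequence needs about
# ceil((prompt_len + max_new_tokens) / block_size) KV blocks, e.g.
#     math.ceil((5 + 30) / 32) * (1 + 3 + 4)  # ~2 blocks per sequence, ~16 blocks for 8 sequences
# so configuring fewer blocks than that (num_kv_blocks=8 there, or num_kv_blocks=10 with
# ignore_eos in test_post_oom_health below) is what deliberately forces preemption and
# the out-of-cache recovery path under test.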
# Discrete_distribution impl depends on platform, model inference results may depend on CPU. - @pytest.mark.precommit -@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()]) +@pytest.mark.parametrize( + "sampling_config", + [get_greedy(), get_beam_search(), get_multinomial_all_parameters()], +) def test_post_oom_health(tmp_path, sampling_config): generation_config = sampling_config generation_config.ignore_eos = True @@ -334,20 +422,26 @@ def test_post_oom_health(tmp_path, sampling_config): # Low cache size to trigger OOM quickly scheduler_config.num_kv_blocks = 10 generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" + model_id: str = "facebook/opt-125m" model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - model_path : Path = tmp_path / model_id + model_path: Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {}) + pipe = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), + Tokenizer(model_path.absolute().as_posix(), {}), + scheduler_config, + "CPU", + {}, + ) # First run should return incomplete response output = pipe.generate(["What is OpenVINO?"], generation_configs) - assert (len(output)) - assert(len(output[0].m_generation_ids)) + assert len(output) + assert len(output[0].m_generation_ids) # Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM output = pipe.generate(["What is OpenVINO?"], generation_configs) - assert (len(output)) - assert(len(output[0].m_generation_ids)) + assert len(output) + assert len(output[0].m_generation_ids) del pipe shutil.rmtree(model_path) diff --git a/tests/python_tests/tokenizer_configs.py b/tests/python_tests/tokenizer_configs.py index 45d60f998d..eb457134c2 100644 --- a/tests/python_tests/tokenizer_configs.py +++ b/tests/python_tests/tokenizer_configs.py @@ -1,1012 +1,1011 @@ - def get_tokenizer_configs(): return { "meta-llama/Meta-Llama-3-8B-Instruct": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", }, "TheBloke/Mistral-7B-OpenOrca-GPTQ": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|im_end|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + 
message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|im_end|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "TinyLlama/TinyLlama-1.1B-Chat-v1.0": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", }, "upstage/SOLAR-10.7B-Instruct-v1.0": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}", }, "Nondzu/zephyr-speakleash-010-pl-3072-32-16-0.01": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' 
%}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", }, "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + 
content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False, + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False, + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False, + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}", }, "Qwen/Qwen1.5-0.5B": { - "bos_token": None, - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "<|endoftext|>", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% 
else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", }, "Felladrin/Llama-68M-Chat-v1": { - "bos_token": "<|im_start|>", - "eos_token": "<|im_end|>", - "pad_token": "<|im_end|>", - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "<|im_start|>", + "eos_token": "<|im_end|>", + "pad_token": "<|im_end|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "databricks/dbrx-instruct": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "<|pad|>", - "unk_token": "<|endoftext|>", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|pad|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. 
You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}", }, "speakleash/Bielik-7B-Instruct-v0.1": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + eos_token }}{% endif %}{% endfor %}", }, "internlm/internlm2-chat-7b": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% 
for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "Qwen/Qwen2-7B-Instruct": { - "bos_token": None, - "eos_token": "<|im_end|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "codellama/CodeLlama-34b-Instruct-hf": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] 
== 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}", }, "OpenBuddy/openbuddy-llama3-8b-v21.1-8k": { - "bos_token": None, - "eos_token": "<|end|>", - "pad_token": "<|pad|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}" + "bos_token": None, + "eos_token": "<|end|>", + "pad_token": "<|pad|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}", }, "mosaicml/mpt-30b-chat": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": None, - "unk_token": "<|endoftext|>", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}", }, "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "deepseek-ai/deepseek-coder-6.7b-instruct": { - "bos_token": { - "__type": "AddedToken", - "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "<|EOT|>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": { - "__type": "AddedToken", - "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" + "bos_token": { + "__type": "AddedToken", + "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "eos_token": { + "__type": "AddedToken", + "content": "<|EOT|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "pad_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", }, "deepseek-ai/deepseek-math-7b-rl": { - "bos_token": { - "__type": "AddedToken", - "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": { - "__type": "AddedToken", - "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}" + "bos_token": { + "__type": "AddedToken", + "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "eos_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", 
+ "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "pad_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", }, "FINGU-AI/FinguAI-Chat-v1": { - "bos_token": None, - "eos_token": "<|im_end|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "allenai/tulu-2-7b": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 
'<|assistant|>' }}\n{% endif %}\n{% endfor %}", }, "maldv/winter-garden-7b-alpha": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}", }, "mlabonne/NeuralMonarch-7B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}", }, "meta-llama/Llama-2-7b-chat-hf": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", }, "GritLM/GritLM-7B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 
'<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", }, "ishorn5/RTLCoder-Deepseek-v1.1": { - "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "unk_token": None, - "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n", }, "jondurbin/bagel-34b-v0.2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}", }, "openchat/openchat-3.5-0106": { - "bos_token": "", - "eos_token": "<|end_of_turn|>", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}" + "bos_token": "", + "eos_token": "<|end_of_turn|>", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}", }, "mobiuslabsgmbh/aanaphi2-v0.1": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "[PAD]", - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: ' + message['content'].strip() + '\n'}}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "[PAD]", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: ' + message['content'].strip() + '\n'}}{% endif %}{% endfor %}", }, "typeof/mistral-60m": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + 
message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}", }, "turboderp/Cat-Llama-3-70B-instruct": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|im_end|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|im_end|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "saltlux/Ko-Llama3-Luxia-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}", }, "h2oai/h2o-danube2-1.8b-chat": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if 
message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}", }, "abhishek/autotrain-llama3-70b-orpo-v1": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": None, - "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}", }, "casperhansen/llama-3-70b-instruct-awq": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", }, "01-ai/Yi-1.5-34B-Chat": { - "bos_token": "<|startoftext|>", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + "bos_token": "<|startoftext|>", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}", }, "allenai/OLMo-7B-Instruct": { - "bos_token": None, - "eos_token": "<|endoftext|>", - "pad_token": "<|padding|>", - "unk_token": None, - "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif 
message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|padding|>", + "unk_token": None, + "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", }, "TheBloke/deepseek-coder-33B-instruct-GPTQ": { - "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "eos_token": "<|EOT|>", - "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "unk_token": None, - "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set found_item = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<|EOT|>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set found_item = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n", }, "cognitivecomputations/dolphin-2.8-mistral-7b-v02": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "alexsobolev/IcaroLM": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "tokyotech-llm/Swallow-7b-instruct-v0.1": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 
0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '' + content.strip() + '' + eos_token }}{% endif %}{% endfor %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False, + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False, + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False, + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '' + content.strip() + '' + eos_token }}{% endif %}{% endfor %}", }, "instructlab/merlinite-7b-lab": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|pad|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|pad|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}", }, "microsoft/Phi-3-medium-128k-instruct": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|placeholder6|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif 
(message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|placeholder6|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", }, "katuni4ka/tiny-random-phi3": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", }, "microsoft/Phi-3-mini-128k-instruct": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|placeholder6|>", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|placeholder6|>", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", }, "VAGOsolutions/SauerkrautLM-Qwen-32b": { - "bos_token": None, - "eos_token": "<|im_end|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' 
%}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}", }, "AI-Sweden-Models/gpt-sw3-356m-instruct": { - "bos_token": None, - "eos_token": None, - "pad_token": None, - "unk_token": None, - "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:" + "bos_token": None, + "eos_token": None, + "pad_token": None, + "unk_token": None, + "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:", }, "google/gemma-7b-it": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", }, "ise-uiuc/Magicoder-S-DS-6.7B": { - "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "unk_token": None, - "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}" + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, 
+ "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}", }, "Deci/DeciLM-7B": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n' + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n' + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}", }, "katuni4ka/tiny-random-minicpm": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", }, "UnicomLLM/Unichat-llama3-Chinese-8B-28K": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content = 'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content = bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' }}{% endfor %}{{ 'Assistant:' }}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content = 'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content = bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' 
}}{% endfor %}{{ 'Assistant:' }}", }, "RLHFlow/LLaMA3-SFT": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}", }, "bofenghuang/vigogne-2-7b-chat": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False, + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False, + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False, + }, + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}", }, "aisingapore/sea-lion-7b-instruct": { - "bos_token": None, - "eos_token": "<|endoftext|>", - "pad_token": "<|padding|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}" + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|padding|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}", }, "microsoft/Phi-3-small-8k-instruct": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", }, "THUDM/cogvlm2-llama3-chat-19B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, 
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}", }, "tiiuae/falcon-11B": { - "bos_token": ">>", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n' + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}" + "bos_token": ">>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n' + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}", }, "Mihaiii/Pallas-0.5": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}", }, "prithivida/Asimov-7B-v2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}", }, "dreamgen/opus-v1.2-7b": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for 
message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}", }, "KnutJaegersberg/internlm-20b-llama": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:' + message['content'] + '\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:' + message['content'] + '\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}", }, "alpindale/WizardLM-2-8x22B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' }}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' }}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}", }, "yentinglin/Taiwan-LLM-7B-v2.0-base": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}", }, "maywell/Synatra-Mixtral-8x7B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}", }, "MediaTek-Research/Breeze-7B-Instruct-v1_0": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... 
or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", }, "MTSAIR/multi_verse_model": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}", }, "bofenghuang/vigostral-7b-chat": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", }, "SeaLLMs/SeaLLM-7B-v2.5": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "qnguyen3/Master-Yi-9B": { - "bos_token": "<|startoftext|>", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + "bos_token": "<|startoftext|>", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}", }, "meetkai/functionary-small-v2.5": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] 
}}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", }, "h2oai/h2o-danube-1.8b-chat": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}", }, "TheBloke/CodeLlama-70B-Instruct-AWQ": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles 
must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}", }, "FairMind/Phi-3-mini-4k-instruct-bnb-4bit-Ita": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", }, "ibm-granite/granite-8b-code-instruct": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n' + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n' + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}", }, "dicta-il/dictalm2.0-instruct": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", }, "nvidia/Llama3-ChatQA-1.5-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": 
"<|end_of_text|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}", }, "openchat/openchat-3.6-8b-20240522": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}", }, "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - 
"__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}", }, "tenyx/TenyxChat-7B-v1": { - "bos_token": "", - "eos_token": "<|end_of_turn|>", - "pad_token": "<|end_of_turn|>", - "unk_token": "", - "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "<|end_of_turn|>", + "pad_token": "<|end_of_turn|>", + "unk_token": "", + "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}", }, "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' 
}}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' }}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}", }, "SeaLLMs/SeaLLM-7B-v2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + ''}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + ''}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", }, "cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "<|im_end|>", - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "<|im_end|>", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}", }, "vaiv/llamion-14b-chat": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + 
message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}", }, "yam-peleg/Hebrew-Gemma-11B-V2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", }, "shenzhi-wang/Llama3-8B-Chinese-Chat": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|eot_id|>", - "unk_token": None, - "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' 
%}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", }, "ericzzz/falcon-rw-1b-chat": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": None, - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '[RESP] ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '[RESP] ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}", }, "NLPark/AnFeng_v3_Avocet": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}", }, "microsoft/Phi-3-vision-128k-instruct": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}", }, "jphme/em_german_leo_mistral": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: 
'}}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: '}}{% endif %}", }, "nlpai-lab/KULLM3": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. \ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. \uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. \ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. 
\uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. \uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", }, "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}", }, "MediaTek-Research/Breeze-7B-Instruct-v0_1": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... 
or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", }, "microsoft/DialoGPT-large": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": None, - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}", }, "meta-llama/Meta-Llama-Guard-2-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. \nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n\n\n\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. 
[/INST]\" }}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. \nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n\n\n\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]\" }}", }, "chinoll/Yi-6b-200k-dpo": { - "bos_token": "<|startoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}", }, "shanchen/llama3-8B-slerp-biomed-chat-chinese": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|eot_id|>", - "unk_token": None, - "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' 
%}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", }, "MLP-KTLim/llama-3-Korean-Bllossom-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", }, "UnfilteredAI/UNfilteredAI-1B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif 
message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}", }, "abacusai/Smaug-Mixtral-v0.1": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", }, "ProbeMedicalYonseiMAILab/medllama3-v20": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|eot_id|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}", }, "vinai/PhoGPT-4B-Chat": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if 
message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}", }, "lucyknada/microsoft_WizardLM-2-7B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in (messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in (messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}", }, "bigcode/starcoder2-15b-instruct-v0.1": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": None, - "unk_token": "<|endoftext|>", - "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}", }, "AliAbdelrasheed/maqa_llama_4bit": { - "bos_token": 
"<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|reserved_special_token_250|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|reserved_special_token_250|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", }, "lightonai/alfred-40b-1023": { - "bos_token": None, - "eos_token": "", - "pad_token": None, - "unk_token": None, - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'system' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'assistant' %}{{ '' + message['content'] + '' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '' }}{% endif %}{% endfor %}" + "bos_token": None, + "eos_token": "", + "pad_token": None, + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'system' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'assistant' %}{{ '' + message['content'] + '' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '' }}{% endif %}{% endfor %}", }, "aloobun/CosmicBun-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true 
-%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}", }, "Undi95/Mixtral-8x7B-MoE-RP-Story": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n", }, "TIGER-Lab/MAmmoTH2-8B-Plus": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|eot_id|>", - "unk_token": None, - "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{% set system_message = 'You are a helpful assistant.' 
%}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}", }, "codellama/CodeLlama-70b-Instruct-hf": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}", }, "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored": { - "bos_token": "", - "eos_token": "", - "pad_token": "[control_768]", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "[control_768]", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", }, "gorilla-llm/gorilla-openfunctions-v2": { - "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "eos_token": "<|EOT|>", - "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer 
science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<|EOT|>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", }, "ghost-x/ghost-7b-alpha": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n' + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute](' + message['content'] + ')' + eos_token }}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n' + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute](' + message['content'] + ')' + eos_token 
}}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", }, "winninghealth/WiNGPT2-Llama-3-8B-Chat": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a", }, "BramVanroy/Llama-2-13b-chat-dutch": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\n' + content.strip() + '\n<>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. 
Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\n' + content.strip() + '\n<>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", }, "THUDM/chatglm3-6b": { - "bos_token": None, - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}" + "bos_token": None, + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", }, "microsoft/Phi-3-mini-4k-instruct": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", }, "mistralai/Mistral-7B-Instruct-v0.1": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n 
{%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", }, "meta-llama/Meta-Llama-3.1-8B-Instruct": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", - } + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": '{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = "26 Jul 2024" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0][\'role\'] == \'system\' %}\n {%- set system_message = messages[0][\'content\']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = "" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- "<|start_header_id|>system<|end_header_id|>\\n\\n" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- "Environment: ipython\\n" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- "Tools: " + builtin_tools | reject(\'equalto\', \'code_interpreter\') | join(", ") + "\\n\\n"}}\n{%- endif %}\n{{- "Cutting Knowledge Date: December 2023\\n" }}\n{{- "Today Date: " + date_string + "\\n\\n" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}\n {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n {{- "Do not use variables.\\n\\n" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- "\\n\\n" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- "<|eot_id|>" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0][\'content\']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception("Cannot put tools in the first user message when there\'s no first user message!") }}\n{%- endif %}\n {{- \'<|start_header_id|>user<|end_header_id|>\\n\\n\' -}}\n {{- "Given the following functions, please respond with a JSON for a function call " }}\n {{- "with its proper arguments that best answers the given prompt.\\n\\n" }}\n {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n {{- "Do not use variables.\\n\\n" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- "\\n\\n" }}\n {%- endfor %}\n {{- first_user_message + "<|eot_id|>"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == \'ipython\' or message.role == \'tool\' or \'tool_calls\' in message) %}\n {{- \'<|start_header_id|>\' + message[\'role\'] + \'<|end_header_id|>\\n\\n\'+ message[\'content\'] | trim + \'<|eot_id|>\' }}\n {%- elif \'tool_calls\' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception("This model only supports single tool-calls at once!") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' -}}\n {{- "<|python_tag|>" + tool_call.name + ".call(" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + \'="\' + arg_val + \'"\' }}\n {%- if not loop.last %}\n {{- ", " }}\n {%- endif %}\n {%- endfor %}\n {{- ")" }}\n {%- else %}\n {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' -}}\n {{- \'{"name": "\' + tool_call.name + \'", \' }}\n {{- \'"parameters": \' }}\n {{- tool_call.arguments | tojson }}\n {{- "}" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we\'re in ipython mode #}\n {{- "<|eom_id|>" }}\n {%- else %}\n {{- "<|eot_id|>" }}\n {%- endif %}\n {%- elif message.role == "tool" or message.role == "ipython" %}\n {{- "<|start_header_id|>ipython<|end_header_id|>\\n\\n" }}\n {%- if 
message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- "<|eot_id|>" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' }}\n{%- endif %}\n', + }, } diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers deleted file mode 160000 index b6c36a3026..0000000000 --- a/thirdparty/openvino_tokenizers +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b6c36a302696329f008e4425c9d98c4e00194a24 diff --git a/tools/cacheviz/__init__.py b/tools/cacheviz/__init__.py index 88b5a71df7..88d510f769 100644 --- a/tools/cacheviz/__init__.py +++ b/tools/cacheviz/__init__.py @@ -1,3 +1,2 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - diff --git a/tools/cacheviz/cacheviz.py b/tools/cacheviz/cacheviz.py index f242a10c96..985c918714 100644 --- a/tools/cacheviz/cacheviz.py +++ b/tools/cacheviz/cacheviz.py @@ -24,7 +24,8 @@ import numpy as np import tqdm from matplotlib import patches -plt.switch_backend('TkAgg') + +plt.switch_backend("TkAgg") BLOCK_SIZE = 32 EVICTION_START_SIZE = 32 @@ -33,23 +34,35 @@ def is_evictable(logical_block_idx: int, total_occupied_logical_blocks: int): - assert(logical_block_idx < total_occupied_logical_blocks) - if total_occupied_logical_blocks <= (EVICTION_START_SIZE + EVICTION_EVICTABLE_SIZE + EVICTION_RECENT_SIZE) / BLOCK_SIZE: + assert logical_block_idx < total_occupied_logical_blocks + if ( + total_occupied_logical_blocks + <= (EVICTION_START_SIZE + EVICTION_EVICTABLE_SIZE + EVICTION_RECENT_SIZE) + / BLOCK_SIZE + ): return False logical_block_idx_in_tokens = logical_block_idx * BLOCK_SIZE - return EVICTION_START_SIZE <= logical_block_idx_in_tokens < EVICTION_START_SIZE + EVICTION_EVICTABLE_SIZE + return ( + EVICTION_START_SIZE + <= logical_block_idx_in_tokens + < EVICTION_START_SIZE + EVICTION_EVICTABLE_SIZE + ) def get_hashed_rgb_color(idx: int) -> str: - return '#' + hashlib.sha1(str(idx).encode()).hexdigest()[0:6] # nosec + return "#" + hashlib.sha1(str(idx).encode()).hexdigest()[0:6] # nosec @dataclass class StepDumpData: dump_file_name: str = None num_blocks: int = None - occupied_blocks: Dict[int, List[Tuple[int, int]]] = field(default_factory=lambda: defaultdict(list)) - occupied_blocks_per_sequence: Dict[int, List[int]] = field(default_factory=lambda: defaultdict(list)) + occupied_blocks: Dict[int, List[Tuple[int, int]]] = field( + default_factory=lambda: defaultdict(list) + ) + occupied_blocks_per_sequence: Dict[int, List[int]] = field( + default_factory=lambda: defaultdict(list) + ) sequence_groups: Dict[int, List[int]] = field(default_factory=dict) @@ -59,7 +72,7 @@ def load_data(dump_dir: pathlib.Path) -> List[StepDumpData]: step_file_names_dict: Dict[int, List[pathlib.Path]] = defaultdict(list) for f in dump_dir.iterdir(): - if f.is_file() and f.suffix == '.txt' and 'usage' not in f.name: + if f.is_file() and f.suffix == ".txt" and "usage" not in f.name: file_name = f.stem step_number = int(file_name.split("_")[-1]) step_file_names_dict[step_number].append(f) @@ -70,8 +83,11 @@ def load_data(dump_dir: pathlib.Path) -> List[StepDumpData]: exit(-1) print(f"Step files found: {num_step_files}") - step_file_names_in_order = [name_lex_sorted for _, names_for_step in sorted(step_file_names_dict.items()) for - name_lex_sorted in sorted(names_for_step)] + step_file_names_in_order = [ + name_lex_sorted + for _, names_for_step in 
sorted(step_file_names_dict.items()) + for name_lex_sorted in sorted(names_for_step) + ] for dump_file_name in tqdm.tqdm(step_file_names_in_order): collected_data = StepDumpData() @@ -86,15 +102,23 @@ def load_data(dump_dir: pathlib.Path) -> List[StepDumpData]: sequence_group_tokens = sequence_group_line.split() sequence_group_id = int(sequence_group_tokens[0]) sequence_group_seq_ids = [int(s) for s in sequence_group_tokens[1:]] - collected_data.sequence_groups[sequence_group_id] = sequence_group_seq_ids + collected_data.sequence_groups[sequence_group_id] = ( + sequence_group_seq_ids + ) - for (i, line) in enumerate(f): + for i, line in enumerate(f): tokens = line.split() - seq_id, block_idx, ref_count = int(tokens[0]), int(tokens[1]), int(tokens[2]) + seq_id, block_idx, ref_count = ( + int(tokens[0]), + int(tokens[1]), + int(tokens[2]), + ) if block_idx not in collected_data.occupied_blocks: collected_data.occupied_blocks[block_idx] = [(seq_id, ref_count)] else: - collected_data.occupied_blocks[block_idx].append((seq_id, ref_count)) + collected_data.occupied_blocks[block_idx].append( + (seq_id, ref_count) + ) collected_data.occupied_blocks_per_sequence[seq_id].append(block_idx) retval.append(collected_data) return retval @@ -110,7 +134,11 @@ def draw_from_step_data(plot_axes: plt.Axes, step_data: StepDumpData) -> plt.Axe occupied_blocks_per_sequence = step_data.occupied_blocks_per_sequence sequence_groups = step_data.sequence_groups - seq_id_to_sequence_group_id: Dict[int, int] = { seq_id: seq_group_id for seq_group_id, seq_id_list in sequence_groups.items() for seq_id in seq_id_list } + seq_id_to_sequence_group_id: Dict[int, int] = { + seq_id: seq_group_id + for seq_group_id, seq_id_list in sequence_groups.items() + for seq_id in seq_id_list + } nrows = 1 ncols = num_blocks // nrows @@ -127,7 +155,7 @@ def draw_from_step_data(plot_axes: plt.Axes, step_data: StepDumpData) -> plt.Axe for occupied_block_idx in occupied_blocks: vspan_from = patch_x_positions[occupied_block_idx] vspan_to = vspan_from + 1 - plot_axes.axvspan(vspan_from, vspan_to, alpha=0.5, color='gray') + plot_axes.axvspan(vspan_from, vspan_to, alpha=0.5, color="gray") max_ylim = 1 @@ -135,20 +163,29 @@ def draw_from_step_data(plot_axes: plt.Axes, step_data: StepDumpData) -> plt.Axe for block_idx, patch_xpos in enumerate(patch_x_positions): # Block table usage indicator (occupying position -1 on the Y axis) base_pos = (patch_xpos, -1.5) - base_face_color = '1' + base_face_color = "1" num_occupying_sequences = 0 - base_text_color = 'black' + base_text_color = "black" if block_idx in occupied_blocks: num_occupying_sequences = occupied_blocks[block_idx][0][1] base_face_color = str(1 / (2 * num_occupying_sequences)) - base_text_color = 'white' - sq = patches.Rectangle(base_pos, width, height, fill=True, facecolor=base_face_color, edgecolor='black') + base_text_color = "white" + sq = patches.Rectangle( + base_pos, + width, + height, + fill=True, + facecolor=base_face_color, + edgecolor="black", + ) plot_axes.add_patch(sq) # Mark the block with the number of occupying sequences text = str(num_occupying_sequences) center = (base_pos[0] + 0.5, base_pos[1] + 0.5) - plot_axes.annotate(text, center, ha='center', va='center', color=base_text_color) + plot_axes.annotate( + text, center, ha="center", va="center", color=base_text_color + ) if block_idx in occupied_blocks: for seq_idx, ref_count in occupied_blocks[block_idx]: @@ -159,21 +196,45 @@ def draw_from_step_data(plot_axes: plt.Axes, step_data: StepDumpData) -> plt.Axe seq_sq_pos = 
(base_pos[0], base_pos[1] + (seq_idx + 1)) max_ylim = max(max_ylim, seq_idx + 1) seq_color = get_hashed_rgb_color(seq_idx) - seq_group_color = get_hashed_rgb_color(-seq_id_to_sequence_group_id[seq_idx] - 1) - linestyle = 'solid' - logical_idx_in_seq = occupied_blocks_per_sequence[seq_idx].index(block_idx) - if is_evictable(logical_idx_in_seq, len(occupied_blocks_per_sequence[seq_idx])): - linestyle = 'dotted' - seq_sq = patches.Rectangle(seq_sq_pos, width, height, fill=True, facecolor=seq_color, edgecolor=seq_group_color, lw=3, - linestyle=linestyle) + seq_group_color = get_hashed_rgb_color( + -seq_id_to_sequence_group_id[seq_idx] - 1 + ) + linestyle = "solid" + logical_idx_in_seq = occupied_blocks_per_sequence[seq_idx].index( + block_idx + ) + if is_evictable( + logical_idx_in_seq, len(occupied_blocks_per_sequence[seq_idx]) + ): + linestyle = "dotted" + seq_sq = patches.Rectangle( + seq_sq_pos, + width, + height, + fill=True, + facecolor=seq_color, + edgecolor=seq_group_color, + lw=3, + linestyle=linestyle, + ) plot_axes.add_patch(seq_sq) - plot_axes.annotate(sequence_local_text, sequence_local_center, ha='center', va='center') + plot_axes.annotate( + sequence_local_text, sequence_local_center, ha="center", va="center" + ) # Display total blocks used on the right side of the plot - pos_on_right_of_plot_at_sequence_idx = (num_blocks, sequence_local_center[1]) - plot_axes.annotate(str(len(occupied_blocks_per_sequence[seq_idx])), pos_on_right_of_plot_at_sequence_idx, - ha='center', va='center', - color=seq_color, weight='bold') + pos_on_right_of_plot_at_sequence_idx = ( + num_blocks, + sequence_local_center[1], + ) + plot_axes.annotate( + str(len(occupied_blocks_per_sequence[seq_idx])), + pos_on_right_of_plot_at_sequence_idx, + ha="center", + va="center", + color=seq_color, + weight="bold", + ) # Set limits and ticks so that only integer ticks are visible and all the range is shown plot_axes.set_yticks(np.arange(max_ylim)) @@ -182,86 +243,127 @@ def draw_from_step_data(plot_axes: plt.Axes, step_data: StepDumpData) -> plt.Axe plot_axes.set_xlim(-0.5, num_blocks + 0.5) # Labels - plot_axes.set_xlabel('Block index') - plot_axes.set_ylabel('Sequence index') + plot_axes.set_xlabel("Block index") + plot_axes.set_ylabel("Sequence index") plot_axes.set_title(step_data.dump_file_name) # Legend for sequence group colors - plot_axes.legend(handles=[patches.Patch(facecolor=get_hashed_rgb_color(-seq_group_idx - 1), - label=f'Sequence group {seq_group_idx}') for seq_group_idx in - sequence_groups], loc='center left', bbox_to_anchor=(1, 0.5)) + plot_axes.legend( + handles=[ + patches.Patch( + facecolor=get_hashed_rgb_color(-seq_group_idx - 1), + label=f"Sequence group {seq_group_idx}", + ) + for seq_group_idx in sequence_groups + ], + loc="center left", + bbox_to_anchor=(1, 0.5), + ) return plot_axes -def load_and_draw_usage(plot_axes: plt.Axes, usage_dump_file: pathlib.Path, current_step: int, allocated_usage_series: List[float], eviction_relation='before') -> Tuple[plt.Axes, float, Tuple[List, List]]: +def load_and_draw_usage( + plot_axes: plt.Axes, + usage_dump_file: pathlib.Path, + current_step: int, + allocated_usage_series: List[float], + eviction_relation="before", +) -> Tuple[plt.Axes, float, Tuple[List, List]]: usage_values: Dict[int, Tuple[float, float]] = {} with open(usage_dump_file, "r") as f: while True: before_eviction_line = f.readline() after_eviction_line = f.readline() - if before_eviction_line is None or after_eviction_line is None or before_eviction_line == '' or after_eviction_line == 
'': + if ( + before_eviction_line is None + or after_eviction_line is None + or before_eviction_line == "" + or after_eviction_line == "" + ): break before_step_num, before_cache_usage = before_eviction_line.split() after_step_num, after_cache_usage = after_eviction_line.split() assert before_step_num == after_step_num step_num = int(before_step_num) - usage_values[step_num] = (float(before_cache_usage), float(after_cache_usage)) + usage_values[step_num] = ( + float(before_cache_usage), + float(after_cache_usage), + ) step_numbers = [k for k in usage_values.keys()] before_series = [v[0] for v in usage_values.values()] after_series = [v[1] for v in usage_values.values()] # plot "after" first so that it ends up under the "before" plot for better visibility of eviction - plot_axes.plot(step_numbers, after_series, color='blue') - plot_axes.plot(step_numbers, before_series, color='green') + plot_axes.plot(step_numbers, after_series, color="blue") + plot_axes.plot(step_numbers, before_series, color="green") allocated_usage_before_series = [v for v in allocated_usage_series[0::2]] allocated_usage_after_series = [v for v in allocated_usage_series[1::2]] - leaked_before_series = [r - a if (r - a) > 0 else 0 for r, a in zip(before_series, allocated_usage_before_series)] - leaked_after_series = [r - a if (r - a) > 0 else 0 for r, a in zip(after_series, allocated_usage_after_series)] - plot_axes.plot(step_numbers, leaked_after_series, color='orange') - plot_axes.plot(step_numbers, leaked_before_series, color='red') + leaked_before_series = [ + r - a if (r - a) > 0 else 0 + for r, a in zip(before_series, allocated_usage_before_series) + ] + leaked_after_series = [ + r - a if (r - a) > 0 else 0 + for r, a in zip(after_series, allocated_usage_after_series) + ] + plot_axes.plot(step_numbers, leaked_after_series, color="orange") + plot_axes.plot(step_numbers, leaked_before_series, color="red") plot_axes.set_yticks(np.arange(0, 100, 10)) plot_axes.set_ylim(0, 100) - plot_axes.grid(visible=True, which='major', axis='y') + plot_axes.grid(visible=True, which="major", axis="y") plot_axes.set_xticks(np.arange(0, step_num, 100)) plot_axes.set_xlim(0, step_num) # Labels - plot_axes.set_xlabel('Step') - plot_axes.set_ylabel('Cache usage, %') + plot_axes.set_xlabel("Step") + plot_axes.set_ylabel("Cache usage, %") - plot_axes.vlines(current_step, ymin=0, ymax=100, colors='red') + plot_axes.vlines(current_step, ymin=0, ymax=100, colors="red") - plot_axes.legend(['after eviction', 'before eviction', 'leaked (after eviction)', 'leaked (before eviction)']) + plot_axes.legend( + [ + "after eviction", + "before eviction", + "leaked (after eviction)", + "leaked (before eviction)", + ] + ) - if eviction_relation == 'before': + if eviction_relation == "before": reported_cache_usage = usage_values[current_step][0] allocated_usage_series = allocated_usage_before_series[current_step] - if eviction_relation == 'after': + if eviction_relation == "after": reported_cache_usage = usage_values[current_step][1] allocated_usage_series = allocated_usage_after_series[current_step] plot_axes.annotate( - f'Block table usage: {allocated_usage_series:.2f}% (occupied), {reported_cache_usage:.2f}% (reported)', - xy=(0.5, 0), xytext=(0, 10), - xycoords=('axes fraction', 'figure fraction'), - textcoords='offset points', - size=14, ha='center', va='bottom') + f"Block table usage: {allocated_usage_series:.2f}% (occupied), {reported_cache_usage:.2f}% (reported)", + xy=(0.5, 0), + xytext=(0, 10), + xycoords=("axes fraction", "figure fraction"), + 
textcoords="offset points", + size=14, + ha="center", + va="bottom", + ) def get_eviction_relation(dump_file_name: str) -> str: - return 'before' if 'before' in str(dump_file_name) else 'after' + return "before" if "before" in str(dump_file_name) else "after" def main(): parser = argparse.ArgumentParser() parser.add_argument("--dump_folder", help="Cache info dump folder", required=True) - parser.add_argument("--step", help="Step ID to show at startup", required=False, default=0, type=int) + parser.add_argument( + "--step", help="Step ID to show at startup", required=False, default=0, type=int + ) args = parser.parse_args() dump_folder = args.dump_folder @@ -271,51 +373,64 @@ def main(): fig = plt.figure(figsize=(10, 10)) fig.tight_layout() - plot_axes = fig.add_subplot(211, aspect='equal') + plot_axes = fig.add_subplot(211, aspect="equal") - current_file_idx_displayed: int = args.step * 2 # 2 files per step - before and after eviction + current_file_idx_displayed: int = ( + args.step * 2 + ) # 2 files per step - before and after eviction usage_dump_file = dump_folder_path / "cache_usage.txt" def on_press(event): nonlocal current_file_idx_displayed - if event.key == 'd' or event.key == 'right': + if event.key == "d" or event.key == "right": current_file_idx_displayed += 1 - elif event.key == 'a' or event.key == 'left': + elif event.key == "a" or event.key == "left": current_file_idx_displayed -= 1 - if event.key == 'alt+d' or event.key == 'alt+right': + if event.key == "alt+d" or event.key == "alt+right": current_file_idx_displayed += 10 * 2 - elif event.key == 'alt+a' or event.key == 'alt+left': + elif event.key == "alt+a" or event.key == "alt+left": current_file_idx_displayed -= 10 * 2 - if event.key == 'D' or event.key == 'shift+right': + if event.key == "D" or event.key == "shift+right": current_file_idx_displayed += 100 * 2 - elif event.key == 'A' or event.key == 'shift+left': + elif event.key == "A" or event.key == "shift+left": current_file_idx_displayed -= 100 * 2 current_file_idx_displayed %= len(step_data) - mode = get_eviction_relation(step_data[current_file_idx_displayed].dump_file_name) + mode = get_eviction_relation( + step_data[current_file_idx_displayed].dump_file_name + ) plot_axes.clear() draw_from_step_data(plot_axes, step_data[current_file_idx_displayed]) usage_plot_axes.clear() - load_and_draw_usage(usage_plot_axes, usage_dump_file, current_file_idx_displayed // 2, allocated_usage_series=allocated_usage_series, eviction_relation=mode) + load_and_draw_usage( + usage_plot_axes, + usage_dump_file, + current_file_idx_displayed // 2, + allocated_usage_series=allocated_usage_series, + eviction_relation=mode, + ) fig.canvas.draw_idle() - fig.canvas.mpl_connect('key_press_event', on_press) - usage_plot_axes = fig.add_subplot(212, aspect='auto') + fig.canvas.mpl_connect("key_press_event", on_press) + usage_plot_axes = fig.add_subplot(212, aspect="auto") curr_step_file_data = step_data[current_file_idx_displayed] mode = get_eviction_relation(curr_step_file_data.dump_file_name) draw_from_step_data(plot_axes, curr_step_file_data) - load_and_draw_usage(usage_plot_axes, usage_dump_file, args.step, allocated_usage_series=allocated_usage_series, eviction_relation=mode) + load_and_draw_usage( + usage_plot_axes, + usage_dump_file, + args.step, + allocated_usage_series=allocated_usage_series, + eviction_relation=mode, + ) plt.show() if __name__ == "__main__": main() - - -
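Note on the tokenizer_configs.py hunks above: the entries being reformatted are plain Jinja2 chat templates plus their special tokens. As a minimal sketch of what those strings represent, the snippet below renders the "microsoft/DialoGPT-large" entry shown in the hunks with stock Jinja2. It assumes only that the jinja2 package is installed; nothing in it is taken from this repository's own APIs, and the variable names are illustrative.

# Minimal sketch: render one chat_template entry from tokenizer_configs.py with Jinja2.
from jinja2 import Environment

def raise_exception(message):
    # The templates call raise_exception() on malformed role sequences; map it to a Python error.
    raise ValueError(message)

config = {
    "bos_token": "<|endoftext|>",
    "eos_token": "<|endoftext|>",
    "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
}

env = Environment()
env.globals["raise_exception"] = raise_exception
prompt = env.from_string(config["chat_template"]).render(
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    bos_token=config["bos_token"],
    eos_token=config["eos_token"],
    add_generation_prompt=True,
)
print(prompt)  # -> "Why is the sky blue?<|endoftext|>"

This is only meant to show what the dictionary values encode; the test suite applies these templates through its own utilities rather than raw Jinja2.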