diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index b8b222e347..4ea75e94c5 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -45,6 +45,10 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER }; * @param logprobs number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. * Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). * + * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. + * @param presence_penalty reduces absolute log prob if the token was generated at least once. + * @param frequency_penalty reduces absolute log prob as many times as the token was generated. + * * Beam search specific parameters: * @param num_beams number of beams for beam search. 1 disables beam search. * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -61,15 +65,13 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER }; * "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; * "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). * - * Random sampling parameters: + * Random (or multinomial) sampling parameters: + * @param do_sample whether or not to use multinomial random sampling instead of greedy or beam search decoding. * @param temperature the value used to modulate token probabilities for random sampling. * @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. - * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. - * @param presence_penalty reduces absolute log prob if the token was generated at least once. - * @param frequency_penalty reduces absolute log prob as many times as the token was generated. * @param rng_seed initializes random generator. + * @param num_return_sequences the number of sequences to generate from a single prompt. * * Assisting generation parameters: * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of dynamic strategy candidates number update.
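For context, here is a minimal sketch (not part of the patch) of how the regrouped penalty, sampling and grouped-beam-search parameters are typically driven from the Python `GenerationConfig` documented later in this diff. The model directory and prompt are placeholders, and the specific values are illustrative only:

```python
# Illustrative sketch, assuming an LLM already exported to OpenVINO IR under ./TinyLlama-1.1B-Chat-v1.0.
import openvino_genai

pipe = openvino_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0", "CPU")

# Generic penalties (now documented in the generic group) plus multinomial sampling parameters.
config = openvino_genai.GenerationConfig()
config.max_new_tokens = 64
config.repetition_penalty = 1.1   # 1.0 means no penalty
config.presence_penalty = 0.5
config.frequency_penalty = 0.5
config.do_sample = True
config.temperature = 0.8
config.top_p = 0.9
config.top_k = 50
config.rng_seed = 0
print(pipe.generate("What is OpenVINO?", config))

# Grouped beam search; per the validate() change below, diversity_penalty
# must be non-zero whenever num_beam_groups > 1.
beam_config = openvino_genai.GenerationConfig()
beam_config.max_new_tokens = 64
beam_config.num_beams = 4
beam_config.num_beam_groups = 2
beam_config.diversity_penalty = 1.0
beam_config.num_return_sequences = 2
results = pipe.generate(["What is OpenVINO?"], beam_config)
for text in results.texts:
    print(text)
```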
@@ -90,7 +92,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { size_t min_new_tokens = 0; bool echo = false; size_t logprobs = 0; - + std::set<std::string> stop_strings; // Default setting in vLLM (and OpenAI API) is not to include stop string in the output bool include_stop_str_in_output = false; diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 35ae92d605..4ff184547e 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -185,6 +185,9 @@ void GenerationConfig::validate() const { "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); if (is_beam_search()) { OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive"); + if (num_beam_groups > 1) { + OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, otherwise it falls back to non-grouped beam search"); + } } else { OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 3d27b23052..8510a8389f 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -361,10 +361,10 @@ class ContinuousBatchingPipeline: This class is used for generation with LLMs with continuous batching """ @typing.overload - def __init__(self, models_path: str, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None: + def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None: ... @typing.overload - def __init__(self, models_path: str, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None: + def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None: ... @typing.overload def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, sampling_params: GenerationConfig) -> GenerationHandle: @@ -522,17 +522,17 @@ class FluxTransformer2DModel: class GenerationConfig: """ - Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group - and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - Parameters: + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+ min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. ignore_eos: if set to true, then generation will not stop even if token is met. eos_token_id: token_id of (end of sentence) - min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. stop_strings: a set of strings that will cause pipeline to stop generating further tokens. include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. @@ -540,6 +540,10 @@ class GenerationConfig: logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -550,8 +554,8 @@ class GenerationConfig: length_penalty < 0.0 encourages shorter sequences. num_return_sequences: the number of sequences to return for grouped beam search decoding. no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. - stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: - "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). @@ -560,7 +564,7 @@ class GenerationConfig: top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + num_return_sequences: the number of sequences to generate from a single prompt. """ adapters: AdapterConfig | None assistant_confidence_threshold: float @@ -951,17 +955,17 @@ class LLMPipeline: :rtype: DecodedResults, EncodedResults, str - Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group - and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. 
For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - Parameters: + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. ignore_eos: if set to true, then generation will not stop even if token is met. eos_token_id: token_id of (end of sentence) - min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. stop_strings: a set of strings that will cause pipeline to stop generating further tokens. include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. @@ -969,6 +973,10 @@ class LLMPipeline: logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -979,8 +987,8 @@ class LLMPipeline: length_penalty < 0.0 encourages shorter sequences. num_return_sequences: the number of sequences to return for grouped beam search decoding. no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. - stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: - "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). @@ -989,7 +997,7 @@ class LLMPipeline: top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + num_return_sequences: the number of sequences to generate from a single prompt. 
""" @typing.overload def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None: @@ -1032,17 +1040,17 @@ class LLMPipeline: :rtype: DecodedResults, EncodedResults, str - Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group - and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - Parameters: + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. ignore_eos: if set to true, then generation will not stop even if token is met. eos_token_id: token_id of (end of sentence) - min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. stop_strings: a set of strings that will cause pipeline to stop generating further tokens. include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. @@ -1050,6 +1058,10 @@ class LLMPipeline: logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -1060,8 +1072,8 @@ class LLMPipeline: length_penalty < 0.0 encourages shorter sequences. num_return_sequences: the number of sequences to return for grouped beam search decoding. no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. - stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: - "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + stop_criteria: controls the stopping condition for grouped beam search. 
It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). @@ -1070,7 +1082,7 @@ class LLMPipeline: top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + num_return_sequences: the number of sequences to generate from a single prompt. """ def get_generation_config(self) -> GenerationConfig: ... @@ -1420,7 +1432,7 @@ class StopCriteria: """ StopCriteria controls the stopping condition for grouped beam search. - + The following values are possible: "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates. "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates. diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index b1a5c6cd2e..f49bcf29bd 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -20,7 +20,7 @@ namespace { auto stop_criteria_docstring = R"( StopCriteria controls the stopping condition for grouped beam search. - + The following values are possible: "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates. "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates. @@ -30,17 +30,17 @@ auto stop_criteria_docstring = R"( } // namespace char generation_config_docstring[] = R"( - Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group - and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - Parameters: + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. ignore_eos: if set to true, then generation will not stop even if token is met. eos_token_id: token_id of (end of sentence) - min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. stop_strings: a set of strings that will cause pipeline to stop generating further tokens. 
include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. @@ -48,6 +48,10 @@ char generation_config_docstring[] = R"( logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -58,8 +62,8 @@ char generation_config_docstring[] = R"( length_penalty < 0.0 encourages shorter sequences. num_return_sequences: the number of sequences to return for grouped beam search decoding. no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. - stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: - "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). @@ -68,7 +72,7 @@ char generation_config_docstring[] = R"( top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + num_return_sequences: the number of sequences to generate from a single prompt. 
)"; void init_generation_config(py::module_& m) { diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 5e1498588d..7e3c075405 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -289,10 +289,12 @@ def convert_to_hf( kwargs['max_length'] = generation_config.max_length # has higher priority than 'max_length' kwargs['max_new_tokens'] = generation_config.max_new_tokens + kwargs['min_new_tokens'] = generation_config.min_new_tokens if generation_config.stop_strings: kwargs['stop_strings'] = generation_config.stop_strings # copy default parameters + kwargs['bos_token_id'] = default_generation_config.bos_token_id kwargs['eos_token_id'] = default_generation_config.eos_token_id kwargs['pad_token_id'] = default_generation_config.pad_token_id kwargs['repetition_penalty'] = generation_config.repetition_penalty @@ -301,11 +303,12 @@ def convert_to_hf( # beam search case kwargs['num_beam_groups'] = generation_config.num_beam_groups kwargs['num_beams'] = generation_config.num_beams - kwargs['diversity_penalty'] = generation_config.diversity_penalty kwargs['length_penalty'] = generation_config.length_penalty kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size kwargs['num_return_sequences'] = generation_config.num_return_sequences kwargs['output_scores'] = True + if generation_config.num_beam_groups > 1: + kwargs['diversity_penalty'] = generation_config.diversity_penalty elif generation_config.do_sample: # mulitinomial kwargs['temperature'] = generation_config.temperature @@ -321,7 +324,7 @@ def convert_to_hf( def run_hugging_face( - model, + opt_model, hf_tokenizer, prompts: List[str], generation_configs: List[GenerationConfig], @@ -330,8 +333,9 @@ def run_hugging_face( for prompt, generation_config in zip(prompts, generation_configs): inputs = hf_tokenizer(prompt, return_tensors="pt") prompt_len = inputs['input_ids'].numel() - generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config), - return_dict_in_generate=True, tokenizer=hf_tokenizer) + generate_outputs = opt_model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], + generation_config=convert_to_hf(opt_model.generation_config, generation_config), + return_dict_in_generate=True, tokenizer=hf_tokenizer) all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) generation_result = GenerationResult() @@ -342,7 +346,7 @@ def run_hugging_face( generation_results.append(generation_result) del hf_tokenizer - del model + del opt_model return generation_results @@ -388,6 +392,14 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): assert hf_text == ov_text + +def get_hugging_face_model_and_tokenizer(model_id: str, use_optimum = True): + hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \ + AutoModelForCausalLM.from_pretrained(model_id) + return opt_model, hf_tokenizer + + def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path): model.save_pretrained(models_path) # convert tokenizers as well @@ -397,23 +409,6 @@ def save_ov_model_from_optimum(model, hf_tokenizer, models_path: 
Path): serialize(tokenizer, models_path / "openvino_tokenizer.xml") serialize(detokenizer, models_path / "openvino_detokenizer.xml") -def get_model_and_tokenizer(model_id: str, use_optimum = True): - hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \ - AutoModelForCausalLM.from_pretrained(model_id) - return model, hf_tokenizer - -def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): - use_optimum = True - models_path : Path = tmp_path / model_id - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum) - - if use_optimum: - save_ov_model_from_optimum(model, hf_tokenizer, models_path) - - hf_results = run_hugging_face(model=model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs) - _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config) - def _generate_and_compare_with_reference_results(models_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) @@ -426,19 +421,32 @@ def _generate_and_compare_with_reference_results(models_path: Path, prompts: Lis compare_results(ref_result, ov_result, generation_config) +def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): + use_optimum = True + models_path : Path = tmp_path / model_id + opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum) + + if use_optimum: + save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path) + + hf_results = run_hugging_face(opt_model=opt_model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs) + _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config) + + def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) assert len(prompts) == len(reference_texts_per_prompt) assert len(prompts) == len(ov_results) - for prompt, ref_texts_for_this_prompt, ov_result, generation_config in zip(prompts, reference_texts_per_prompt, ov_results, generation_configs): + for prompt, ref_texts_for_this_prompt, ov_result in zip(prompts, reference_texts_per_prompt, ov_results): print(f"Prompt = {prompt}\nref text = {ref_texts_for_this_prompt}\nOV result = {ov_result.m_generation_ids}") assert len(ref_texts_for_this_prompt) == len(ov_result.m_generation_ids) for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids): assert ref_text == ov_text + def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): prompts, generation_configs = get_test_dataset() scheduler_config = get_scheduler_config(scheduler_params) diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_preemption.py 
index 49d6c8f6b0..7c648e73dc 100644 --- a/tests/python_tests/test_preemption.py +++ b/tests/python_tests/test_preemption.py @@ -4,7 +4,7 @@ import pytest from openvino_genai import GenerationConfig -from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ +from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ get_scheduler_config, run_test_pipeline, get_beam_search, get_greedy, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p @@ -87,7 +87,7 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): config.rng_seed = 0 config.max_new_tokens = 30 model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) models_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, models_path) @@ -168,7 +168,7 @@ def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): for config in generation_configs: config.rng_seed = 0 model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) models_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, models_path) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index e2ebcc9aa2..fbcce76bf7 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -10,7 +10,7 @@ from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer from typing import List, TypedDict -from common import run_test_pipeline, read_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \ +from common import run_test_pipeline, read_models_list, get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \ generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \ get_greedy_with_penalties, get_multinomial_temperature, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ @@ -313,7 +313,7 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl generation_config.rng_seed = 0 generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) models_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, models_path) @@ -337,7 +337,7 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche scheduler_config.max_num_batched_tokens = max_num_batched_tokens generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) @@ -364,7 +364,7 @@ def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_t scheduler_config.max_num_batched_tokens = 
max_num_batched_tokens generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) @@ -392,7 +392,7 @@ def test_post_oom_health(tmp_path, sampling_config): scheduler_config.num_kv_blocks = 10 generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) models_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, models_path)
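
As a usage note (not part of the patch), a minimal sketch of how the renamed test helper is combined with the existing utilities from `tests/python_tests/common.py`, mirroring the pattern the patch applies in test_sampling.py and test_preemption.py; the prompt and configuration values are illustrative:

```python
# Sketch of the test-side pattern after the rename to get_hugging_face_model_and_tokenizer.
from pathlib import Path

from openvino_genai import GenerationConfig
from common import (get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum,
                    get_scheduler_config, run_continuous_batching)


def export_and_sample(tmp_path: Path):
    model_id = "facebook/opt-125m"
    # Renamed helper: returns the optimum-intel (or plain HF) model plus its tokenizer.
    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
    models_path = tmp_path / model_id
    save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path)

    config = GenerationConfig()
    config.do_sample = True
    config.rng_seed = 0          # the tests pin the seed for reproducibility
    config.max_new_tokens = 30
    return run_continuous_batching(models_path, get_scheduler_config(None),
                                   ["What is OpenVINO?"], [config])
```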