diff --git a/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp
index 6e0cb5034f..0561f88179 100644
--- a/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp
+++ b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp
@@ -39,16 +39,13 @@ int main(int argc, char* argv[]) try {
     const size_t num_prompts = result["num_prompts"].as<size_t>();
     const bool dynamic_split_fuse = result["dynamic_split_fuse"].as<bool>();
-    const std::string models_path = result["model"].as<std::string>();
+    const std::string models_path = "/home/panas/llm/models/TinyLlama-1.1B-Chat-v1.0/";
 
     // create dataset
 
     std::vector<std::string> prompt_examples = {
-        "What is OpenVINO?",
-        "How are you?",
-        "What is your name?",
-        "Tell me something about Canada",
-        "What is OpenVINO?",
+        "hello",
+        "Here is the longest novel ever: "
     };
 
     std::vector<ov::genai::GenerationConfig> sampling_params_examples {
@@ -57,7 +54,7 @@ int main(int argc, char* argv[]) try {
         ov::genai::multinomial(),
     };
 
-    std::vector<std::string> prompts(num_prompts);
+    std::vector<std::string> prompts(2);
     std::vector<ov::genai::GenerationConfig> sampling_params(num_prompts);
 
     for (size_t request_id = 0; request_id < num_prompts; ++request_id) {
@@ -79,7 +76,49 @@ int main(int argc, char* argv[]) try {
     scheduler_config.max_num_seqs = 2;
 
     ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config);
-    std::vector<ov::genai::GenerationResult> generation_results = pipe.generate(prompts, sampling_params);
+    ov::genai::GenerationConfig prototype;
+    prototype.max_new_tokens = 20;
+    prototype.num_beam_groups = 3;
+    prototype.num_beams = 15;
+    prototype.diversity_penalty = 1.0;
+    std::vector<ov::genai::GenerationResult> generation_results = pipe.generate({
+        "hello",
+        "Here is the longest novel ever: "
+    }, std::vector<ov::genai::GenerationConfig>(2, prototype));
+
+
+    for (size_t request_id = 0; request_id < generation_results.size(); ++request_id) {
+        const ov::genai::GenerationResult & generation_result = generation_results[request_id];
+        std::cout << "Question: " << prompts[request_id] << std::endl;
+        switch (generation_result.m_status)
+        {
+        case ov::genai::GenerationStatus::FINISHED:
+            print_generation_result(generation_result);
+            break;
+        case ov::genai::GenerationStatus::IGNORED:
+            std::cout << "Request was ignored due to lack of memory." << std::endl;
+            if (generation_result.m_generation_ids.size() > 0) {
+                std::cout << "Partial result:" << std::endl;
+                print_generation_result(generation_result);
+            }
+            break;
+        case ov::genai::GenerationStatus::DROPPED_BY_PIPELINE:
+            std::cout << "Request was aborted." << std::endl;
+            if (generation_result.m_generation_ids.size() > 0) {
+                std::cout << "Partial result:" << std::endl;
+                print_generation_result(generation_result);
+            }
+            break;
+        default:
+            break;
+        }
+        std::cout << std::endl;
+    }
+
+    generation_results = pipe.generate({
+        "hello",
+        "Here is the longest novel ever: "
+    }, std::vector<ov::genai::GenerationConfig>(2, prototype));
 
     for (size_t request_id = 0; request_id < generation_results.size(); ++request_id) {
         const ov::genai::GenerationResult & generation_result = generation_results[request_id];