Accept buffer in LLMPipeline ctor #1262
Changes from 18 commits
Review comment: multinomial_causal_lm, beam_search_causal_lm and chat_sample should also move to …
@@ -0,0 +1,67 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"

#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

std::pair<std::string, ov::Tensor> decrypt_model(const std::string& model_path, const std::string& weights_path) {
    std::ifstream model_file(model_path);
    std::ifstream weights_file(weights_path, std::ios::binary);
    if (!model_file.is_open() || !weights_file.is_open()) {
        throw std::runtime_error("Cannot open model or weights file");
    }

    // User can add file decryption of model_file and weights_file in memory here.

    std::string model_str((std::istreambuf_iterator<char>(model_file)), std::istreambuf_iterator<char>());
    std::vector<char> weights_buffer((std::istreambuf_iterator<char>(weights_file)), std::istreambuf_iterator<char>());

    // Copy the weights into a tensor that owns its memory: wrapping weights_buffer.data() directly
    // would leave the tensor pointing at a local buffer that is destroyed when this function returns.
    ov::Tensor weights_tensor(ov::element::u8, {weights_buffer.size()});
    std::memcpy(weights_tensor.data(), weights_buffer.data(), weights_buffer.size());

    return {model_str, weights_tensor};
}
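As an illustration of the "User can add file decryption ... here" comment above, a hypothetical in-place decryption hook could look like the sketch below. The function name, the XOR cipher, and the key are assumptions made for this example and are not part of the sample; a real deployment would apply whatever cipher the files were actually encrypted with (for example AES with a securely stored key).

// Hypothetical helper (not part of the sample): applies a placeholder XOR "cipher" in place.
// Works for both std::string (the IR text) and std::vector<char> (the weights bytes).
template <typename Buffer>
void decrypt_in_place(Buffer& data) {
    const char key = 0x5A;  // assumed demo key, not from the original sample
    for (auto& byte : data) {
        byte ^= key;
    }
}

It would be invoked on model_str and weights_buffer right after they are read and before weights_tensor is constructed.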
ov::genai::Tokenizer decrypt_tokenizer(const std::string& models_path) {
    std::string tok_model_path = models_path + "/openvino_tokenizer.xml";
    std::string tok_weights_path = models_path + "/openvino_tokenizer.bin";
    auto [tok_model_str, tok_weights_tensor] = decrypt_model(tok_model_path, tok_weights_path);

    std::string detok_model_path = models_path + "/openvino_detokenizer.xml";
    std::string detok_weights_path = models_path + "/openvino_detokenizer.bin";
    auto [detok_model_str, detok_weights_tensor] = decrypt_model(detok_model_path, detok_weights_path);

    return ov::genai::Tokenizer(tok_model_str, tok_weights_tensor, detok_model_str, detok_weights_tensor);
}
int main(int argc, char* argv[]) try {
    if (3 > argc)
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\"");

    std::string device = "CPU";  // GPU, NPU can be used as well
    std::string models_path = argv[1];
    std::string prompt = argv[2];

    auto [model_str, model_weights] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin");
    ov::genai::Tokenizer tokenizer = decrypt_tokenizer(models_path);

    // NPU reads some properties from the config file, but when LLMPipeline is initialized
    // from model_str and weights_tensor, there is no such folder. Therefore, we need to
    // pass these properties manually.
    // This is necessary only for NPU; for other plugins it can be omitted.
    ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
                                         {"type", "llama"},
                                         {"num_key_value_heads", 32}};
Review comment: Maybe we can avoid demonstrating such NPU hacks in samples? I hope that @dmatveev and @TolyaTalamanov can come up with a more generic approach which does not require reading this information from openvino.genai/src/cpp/src/utils/paged_attention_transformations.cpp (lines 23 to 25 in e2fa0d0).
Reply: Removed this from the sample. In order not to forget about that, created a ticket for …
    ov::genai::LLMPipeline pipe(model_str, model_weights, tokenizer, device, model_descr_properties);

    std::string result = pipe.generate(prompt, ov::genai::max_new_tokens(100));
    std::cout << result << std::endl;
} catch (const std::exception& error) {
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
}
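For comparison, when the model files on disk are not encrypted, the pipeline is normally constructed directly from the model directory; the buffer-accepting constructor exercised above is only needed when the IR and weights have to be decrypted into memory first.

// Usual path-based construction, shown for contrast with the buffer-based constructor above.
ov::genai::LLMPipeline pipe(models_path, device);
std::cout << pipe.generate(prompt, ov::genai::max_new_tokens(100)) << std::endl;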
Review comment: Add a test for the new sample.
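A minimal smoke test along these lines could cover it; the binary name encrypted_model_causal_lm, the MODELS_DIR environment variable, and the prompt are assumptions for illustration and are not taken from this PR.

// Hypothetical smoke test sketch: runs the sample binary and only checks that it exits cleanly.
#include <cstdlib>
#include <string>

int main() {
    const char* models_dir = std::getenv("MODELS_DIR");  // assumed env var pointing to an exported model
    const std::string dir = models_dir ? models_dir : "TinyLlama-1.1B-Chat-v1.0";
    const std::string cmd = "./encrypted_model_causal_lm " + dir + " \"Why is the Sun yellow?\"";
    return std::system(cmd.c_str()) == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}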