llm: use stateful model #8
Changes from all commits: 20a7db5, 08e0b3c, 25a2ddb, 077fdbd, f7df4c7, ec3322a
@@ -34,75 +34,35 @@ int main(int argc, char* argv[]) try {
     core.add_extension(USER_OV_EXTENSIONS_PATH);  // USER_OV_EXTENSIONS_PATH is defined in root CMakeLists.txt
     auto [input_ids, mask] = tokenize(core.compile_model(argv[2], "CPU").create_infer_request(), argv[4]);
     ov::InferRequest detokenizer = core.compile_model(argv[3], "CPU").create_infer_request();
-    std::shared_ptr<ov::Model> model = core.read_model(argv[1]);
-    std::map<size_t, ov::PartialShape> shapes = {
-        {0, ov::PartialShape{
-            -1, -1
-        }},
-        {1, ov::PartialShape{
-            -1, -1
-        }},
-        {2, ov::PartialShape{
-            -1, -1
-        }}
-    };
-    std::vector<ov::Output<ov::Node>> inputs = model->inputs();
-    for (size_t idx = 3; idx < inputs.size(); ++idx) {
-        ov::PartialShape shape = inputs.at(idx).get_partial_shape();
-        shape[0] = -1;
-        shapes.emplace(idx, shape);
-    }
-    model->reshape(shapes);
-    ov::InferRequest ireq = core.compile_model(model, "CPU").create_infer_request();
+    ov::InferRequest ireq = core.compile_model(argv[1], "CPU").create_infer_request();
     ireq.set_tensor("input_ids", input_ids);
     ireq.set_tensor("attention_mask", mask);
     ov::Tensor position_ids = ireq.get_tensor("position_ids");
     position_ids.set_shape(input_ids.get_shape());
     std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
-    for (size_t idx = 3; idx < inputs.size(); ++idx) {
-        ov::Shape shape = inputs.at(idx).get_partial_shape().get_min_shape();
-        shape.at(0) = 1;
-        ireq.get_input_tensor(idx).set_shape(shape);
-    }
+    ireq.get_tensor("beam_idx").set_shape({1});
+    ireq.get_tensor("beam_idx").data<int32_t>()[0] = 0;
     Parameters parameters;
     const int64_t* prompt_data = input_ids.data<const int64_t>();
     parameters.prompt = std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()};
     GroupBeamSearcher group_beam_searcher{parameters};
+    std::vector<int64_t> next_tokens;
+    std::vector<int32_t> next_beams;
     for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) {
         ireq.infer();
-        std::vector<TokenToBeam> next_tokens = group_beam_searcher.process(ireq.get_tensor("logits"));
+        std::tie(next_tokens, next_beams) = group_beam_searcher.process(ireq.get_tensor("logits"));
         if (next_tokens.empty()) {
             break;
         }
         size_t batch_size = next_tokens.size();
-        ireq.get_tensor("input_ids").set_shape({batch_size, 1});
+        ireq.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()});
         ov::Tensor attention_mask = ireq.get_tensor("attention_mask");
-        ov::Shape mask_shape = attention_mask.get_shape();
-        mask_shape.at(0) = batch_size;
-        ++mask_shape.at(1);
+        ov::Shape mask_shape{batch_size, attention_mask.get_shape().at(1) + 1};
         attention_mask.set_shape(mask_shape);
         std::fill_n(attention_mask.data<int64_t>(), shape_size(mask_shape), 1);
-        ireq.get_tensor("position_ids").set_shape({batch_size, 1});
-        std::fill_n(ireq.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1);
-        for (size_t tensor_idx = 3; tensor_idx < inputs.size(); ++tensor_idx) {
-            ov::Shape shape = ireq.get_output_tensor(tensor_idx - 2).get_shape();
-            shape.at(0) = batch_size;
-            ireq.get_input_tensor(tensor_idx).set_shape(shape);
-        }
-        for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
-            ireq.get_tensor("input_ids").data<int64_t>()[batch_idx] = next_tokens.at(batch_idx).token_idx;
-            for (size_t tensor_idx = 3; tensor_idx < inputs.size(); ++tensor_idx) {
-                ov::Tensor present = ireq.get_output_tensor(tensor_idx - 2);
-                ov::Shape present_begin = {next_tokens.at(batch_idx).beam_idx, 0, 0, 0};
-                ov::Shape present_end = present.get_shape();
-                present_end.at(0) = next_tokens.at(batch_idx).beam_idx + 1;
-                ov::Tensor past = ireq.get_input_tensor(tensor_idx);
-                ov::Shape past_begin = {batch_idx, 0, 0, 0};
-                ov::Shape past_end = past.get_shape();
-                past_end.at(0) = batch_idx + 1;
-                ov::Tensor{present, present_begin, present_end}.copy_to(ov::Tensor{past, past_begin, past_end});
-            }
-        }
+        position_ids.set_shape({batch_size, 1});
+        std::fill_n(position_ids.data<int64_t>(), batch_size, mask_shape.at(1) - 1);
Review thread on the position_ids / attention_mask update above:

Review comment: Maybe we can also create a Tensor with a ctor which initializes all elements with a given value?

    ireq.set_tensor("position_ids", ov::Tensor(ov::element::i64, {batch_size, 1}, length_count + prompt_length));
    ireq.set_tensor("attention_mask", ov::Tensor(ov::element::i64, {batch_size, length_count + prompt_length + 1}, 1));

instead of:

    ov::Tensor attention_mask = ireq.get_tensor("attention_mask");
    ov::Shape mask_shape{batch_size, attention_mask.get_shape().at(1) + 1};
    attention_mask.set_shape(mask_shape);
    std::fill_n(attention_mask.data<int64_t>(), shape_size(mask_shape), 1);
    position_ids.set_shape({batch_size, 1});
    std::fill_n(position_ids.data<int64_t>(), batch_size, mask_shape.at(1) - 1);

Reply: Ideally, ov::Tensor should have functions similar to torch.Tensor, broadcasting and all the rest. Sure, you can use cases like this one to prioritize which functions to implement, but my perception was that OV tries to keep the Tensor API minimal.

Reply: But we also want to avoid boilerplate code in samples, so we can extend ov::Tensor with more helpers / constructors.

Reply: The proposed method is similar to https://pytorch.org/docs/stable/generated/torch.Tensor.new_full.html#torch.Tensor.new_full
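For reference, a minimal sketch of such a fill-initializing helper as it could be written against the existing ov::Tensor API today; the name full_i64 is made up for illustration, and no such constructor exists in the API under discussion:

    #include <algorithm>
    #include <openvino/runtime/tensor.hpp>

    // Hypothetical helper that loosely mimics torch.Tensor.new_full for i64
    // tensors: allocate, then fill every element with the given value.
    ov::Tensor full_i64(const ov::Shape& shape, int64_t value) {
        ov::Tensor tensor{ov::element::i64, shape};  // owns its memory
        std::fill_n(tensor.data<int64_t>(), tensor.get_size(), value);
        return tensor;
    }

With a helper like that, the two proposed set_tensor calls become one-liners, e.g. ireq.set_tensor("attention_mask", full_i64({batch_size, length_count + prompt_length + 1}, 1));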
ireq.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); | ||
} | ||
for (Group& group : group_beam_searcher.groups) { | ||
if (!group.done) { | ||
|
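The net effect of the hunk: the old stateless sample reshaped the per-layer past_key_values inputs and copied present-to-past KV slices for every beam on every step, while the stateful model keeps the KV cache inside the compiled model as variable state and takes a beam_idx input that tells the runtime how to reorder that state. One practical consequence, shown below with the standard OpenVINO infer-request API (this snippet is not part of the diff): the accumulated state must be cleared explicitly before reusing the same request for an unrelated prompt.

    // The KV cache now lives in the request's variable states; clear it
    // before starting a new generation on the same ov::InferRequest.
    ireq.reset_state();  // resets every variable state at once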
Review comment: You can get a tensor once and use it in all the places below; it would be shorter.

Reply: It's no longer applicable for beam_idx, but I did it for position_ids.
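A self-contained illustration of that "fetch the handle once" pattern, with toy shapes and no model involved (all names here are stand-ins, not code from the sample):

    #include <algorithm>
    #include <openvino/runtime/tensor.hpp>

    int main() {
        // Grab the tensor once and reuse the handle: set_shape() plus a fill
        // on each step replaces repeated get_tensor() lookups inside the loop.
        ov::Tensor position_ids{ov::element::i64, {2, 1}};
        for (int64_t step = 0; step < 4; ++step) {
            const size_t batch_size = 2;  // stand-in for next_tokens.size()
            position_ids.set_shape({batch_size, 1});
            std::fill_n(position_ids.data<int64_t>(), batch_size, step);
        }
    }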