-
Notifications
You must be signed in to change notification settings - Fork 193
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update streaming in LM Encoding & CB #1377
Changes from 3 commits
3daf0de
310461e
be74340
61e55a6
ad44b32
02622c9
dd5ca50
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -126,7 +126,7 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results( | |
get_active_sequence_groups), | ||
active_sequence_groups.end()); | ||
|
||
while (active_sequence_groups.size() > 0) { | ||
ilya-lavrenov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
do { | ||
size_t total_num_tokens = 0; | ||
|
||
for (auto& sequence_group : active_sequence_groups) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW @sbalandi should we use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it looks like yes, we can use |
||
|
@@ -203,11 +203,18 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results( | |
raw_perf_counters.m_batch_sizes.emplace_back(batch_size); | ||
|
||
if (streamer_ptr) { | ||
// stream data from first sequence | ||
int64_t out_token = sequence_groups.at(0).get()->operator[](0)->get_generated_ids().back(); | ||
if (streamer_ptr->put(out_token)) { | ||
break; | ||
// not generated tokens like several prompt phase | ||
if (!generations.at(0).get()->can_read()) { | ||
continue; | ||
} | ||
std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back(); | ||
OPENVINO_ASSERT(1 <= token.size()); | ||
OPENVINO_ASSERT(1 <= token.begin()->second.generated_ids.size()); | ||
for (const auto& gen_token : token.begin()->second.generated_ids) { | ||
if (!streamer_ptr->put(gen_token)) { | ||
break; | ||
} | ||
} | ||
} | ||
|
||
sampler_output = sampler.sample(active_sequence_groups, m_llm.get_tensor("logits")); | ||
|
@@ -216,13 +223,7 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results( | |
active_sequence_groups.end(), | ||
get_active_sequence_groups), | ||
active_sequence_groups.end()); | ||
} | ||
|
||
if (streamer_ptr) { | ||
int64_t out_token = sequence_groups.at(0).get()->operator[](0)->get_generated_ids().back(); | ||
sbalandi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
streamer_ptr->put(out_token); | ||
streamer_ptr->end(); | ||
} | ||
} while (active_sequence_groups.size() > 0); | ||
|
||
size_t next_selected_beam = 0; | ||
for (size_t i = 0; i < sequence_groups.size(); i++) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The continue_generation
assignment is dropped here and hence, we can have abandoned requests with allocated block tables, as drop_requests()
is not called below.