llm: use stateful model #8

Closed · wants to merge 6 commits
9 changes: 5 additions & 4 deletions llm/cpp/README.md
@@ -49,13 +49,14 @@ This pipeline can work with other similar topologies produced by `optimum-intel`
source <OpenVINO dir>/setupvars.sh
python -m pip install --upgrade-strategy eager transformers==4.35.2 "optimum[openvino]>=1.14" ../../thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu
python -m pip uninstall --yes optimum-intel
- python -m pip install git+https://github.com/huggingface/optimum-intel.git@5dac93d6e8d15c96fe061c653d82b7afd54954db
- optimum-cli export openvino -m meta-llama/Llama-2-7b-hf ./Llama-2-7b-hf/
- python ./llm/cpp/convert_tokenizers.py ./Llama-2-7b-hf/
+ python -m pip install git+https://github.com/slyalin/optimum-intel.git@stateful
+ optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-Chat-v0.6 ./TinyLlama-1.1B-Chat-v0.6/
+ python ../../llm_bench/python/convert.py --model_id ./TinyLlama-1.1B-Chat-v0.6/ --make_stateful --bettertransformer --output_dir ./TinyLlama-1.1B-Chat-v0.6/
+ python ./llm/cpp/convert_tokenizers.py ./TinyLlama-1.1B-Chat-v0.6/
```

## Run

Usage: `llm <openvino_model.xml> <tokenizer.xml> <detokenizer.xml> "<prompt>"`

- Example: `./build/llm/cpp/llm ./Llama-2-7b-hf/openvino_model.xml ./tokenizer.xml ./detokenizer.xml "Why is the Sun yellow?"`
+ Example: `./build/llm/cpp/llm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP32/openvino_model.xml ./tokenizer.xml ./detokenizer.xml "Why is the Sun yellow?"`
15 changes: 9 additions & 6 deletions llm/cpp/group_beam_searcher.hpp
@@ -133,7 +133,7 @@ struct Group {
}
};

- struct TokenToBeam {int64_t token_idx; size_t beam_idx;};
+ struct TokenToBeam {int64_t token_idx; int32_t beam_idx;};

struct GroupBeamSearcher {
Parameters parameters;
@@ -147,9 +147,9 @@ struct GroupBeamSearcher {
group.ongoing.front().score = 0.0;
}
}
- std::vector<TokenToBeam> process(const ov::Tensor& logits) {
- std::vector<TokenToBeam> next_tokens;
- next_tokens.reserve(parameters.n_groups * parameters.group_size);
+ std::pair<std::vector<int64_t>, std::vector<int32_t>> process(const ov::Tensor& logits) {
+ std::vector<int64_t> next_tokens;
+ std::vector<int32_t> next_beams;
size_t beam_count = 0;
for (Group& group : groups) {
if (!group.done) {
@@ -163,6 +163,8 @@
}
}
}
+ next_tokens.reserve(beam_count);
+ next_beams.reserve(beam_count);
for (auto group = groups.begin(); group != groups.end(); ++group) {
if (group->done) {
continue;
@@ -230,10 +232,11 @@
group->is_done(parameters);
if (!group->done) {
for (const Beam& beam : group->ongoing) {
- next_tokens.push_back({beam.tokens.back(), beam.global_beam_idx});
+ next_tokens.push_back(beam.tokens.back());
+ next_beams.push_back(int32_t(beam.global_beam_idx));
}
}
}
- return next_tokens;
+ return {next_tokens, next_beams};
}
};
62 changes: 11 additions & 51 deletions llm/cpp/llm.cpp
@@ -34,75 +34,35 @@ int main(int argc, char* argv[]) try {
core.add_extension(USER_OV_EXTENSIONS_PATH); // USER_OV_EXTENSIONS_PATH is defined in root CMakeLists.txt
auto [input_ids, mask] = tokenize(core.compile_model(argv[2], "CPU").create_infer_request(), argv[4]);
ov::InferRequest detokenizer = core.compile_model(argv[3], "CPU").create_infer_request();
- std::shared_ptr<ov::Model> model = core.read_model(argv[1]);
- std::map<size_t, ov::PartialShape> shapes = {
- {0, ov::PartialShape{
- -1, -1
- }},
- {1, ov::PartialShape{
- -1, -1
- }},
- {2, ov::PartialShape{
- -1, -1
- }}
- };
- std::vector<ov::Output<ov::Node>> inputs = model->inputs();
- for (size_t idx = 3; idx < inputs.size(); ++idx) {
- ov::PartialShape shape = inputs.at(idx).get_partial_shape();
- shape[0] = -1;
- shapes.emplace(idx, shape);
- }
- model->reshape(shapes);
- ov::InferRequest ireq = core.compile_model(model, "CPU").create_infer_request();
+ ov::InferRequest ireq = core.compile_model(argv[1], "CPU").create_infer_request();
ireq.set_tensor("input_ids", input_ids);
ireq.set_tensor("attention_mask", mask);
ov::Tensor position_ids = ireq.get_tensor("position_ids");
position_ids.set_shape(input_ids.get_shape());
std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
- for (size_t idx = 3; idx < inputs.size(); ++idx) {
- ov::Shape shape = inputs.at(idx).get_partial_shape().get_min_shape();
- shape.at(0) = 1;
- ireq.get_input_tensor(idx).set_shape(shape);
- }
+ ireq.get_tensor("beam_idx").set_shape({1});
Review comment: You can get a tensor once and use it in all places below; it would be shorter.

Owner Author: It's no longer applicable for beam_idx, but I did it for position_ids.

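For context, the pattern the reply refers to is the one this diff uses for position_ids: the tensor handle is fetched once before the generation loop and only reshaped and refilled inside it. A condensed sketch of that surrounding llm.cpp code (not a verbatim excerpt; `batch_size` and `mask_shape` come from the loop body shown further down):

```cpp
// Fetch the handle once, before the generation loop.
ov::Tensor position_ids = ireq.get_tensor("position_ids");
position_ids.set_shape(input_ids.get_shape());
std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);

// ...inside the per-token loop, the same handle is reused:
position_ids.set_shape({batch_size, 1});
std::fill_n(position_ids.data<int64_t>(), batch_size, mask_shape.at(1) - 1);
```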
ireq.get_tensor("beam_idx").data<int32_t>()[0] = 0;
Parameters parameters;
const int64_t* prompt_data = input_ids.data<const int64_t>();
parameters.prompt = std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()};
GroupBeamSearcher group_beam_searcher{parameters};
+ std::vector<int64_t> next_tokens;
+ std::vector<int32_t> next_beams;
for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) {
ireq.infer();
- std::vector<TokenToBeam> next_tokens = group_beam_searcher.process(ireq.get_tensor("logits"));
+ std::tie(next_tokens, next_beams) = group_beam_searcher.process(ireq.get_tensor("logits"));
if (next_tokens.empty()) {
break;
}
size_t batch_size = next_tokens.size();
ireq.get_tensor("input_ids").set_shape({batch_size, 1});
ireq.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()});
ov::Tensor attention_mask = ireq.get_tensor("attention_mask");
- ov::Shape mask_shape = attention_mask.get_shape();
- mask_shape.at(0) = batch_size;
- ++mask_shape.at(1);
+ ov::Shape mask_shape{batch_size, attention_mask.get_shape().at(1) + 1};
attention_mask.set_shape(mask_shape);
std::fill_n(attention_mask.data<int64_t>(), shape_size(mask_shape), 1);
ireq.get_tensor("position_ids").set_shape({batch_size, 1});
std::fill_n(ireq.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1);
for (size_t tensor_idx = 3; tensor_idx < inputs.size(); ++tensor_idx) {
ov::Shape shape = ireq.get_output_tensor(tensor_idx - 2).get_shape();
shape.at(0) = batch_size;
ireq.get_input_tensor(tensor_idx).set_shape(shape);
}
for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
ireq.get_tensor("input_ids").data<int64_t>()[batch_idx] = next_tokens.at(batch_idx).token_idx;
for (size_t tensor_idx = 3; tensor_idx < inputs.size(); ++tensor_idx) {
ov::Tensor present = ireq.get_output_tensor(tensor_idx - 2);
ov::Shape present_begin = {next_tokens.at(batch_idx).beam_idx, 0, 0, 0};
ov::Shape present_end = present.get_shape();
present_end.at(0) = next_tokens.at(batch_idx).beam_idx + 1;
ov::Tensor past = ireq.get_input_tensor(tensor_idx);
ov::Shape past_begin = {batch_idx, 0, 0, 0};
ov::Shape past_end = past.get_shape();
past_end.at(0) = batch_idx + 1;
ov::Tensor{present, present_begin, present_end}.copy_to(ov::Tensor{past, past_begin, past_end});
}
}
position_ids.set_shape({batch_size, 1});
std::fill_n(position_ids.data<int64_t>(), batch_size, mask_shape.at(1) - 1);
Review comment (ilya-lavrenov, Dec 12, 2023): Maybe we can also create a Tensor with a ctor which initializes all elements with a given value?

    ireq.set_tensor("position_ids", ov::Tensor(ov::element::i64, {batch_size, 1}, length_count + prompt_length));
    ireq.set_tensor("attention_mask", ov::Tensor(ov::element::i64, {batch_size, length_count + prompt_length + 1}, 1));

instead of:

    ov::Tensor attention_mask = ireq.get_tensor("attention_mask");
    ov::Shape mask_shape{batch_size, attention_mask.get_shape().at(1) + 1};
    attention_mask.set_shape(mask_shape);
    std::fill_n(attention_mask.data<int64_t>(), shape_size(mask_shape), 1);
    position_ids.set_shape({batch_size, 1});
    std::fill_n(position_ids.data<int64_t>(), batch_size, mask_shape.at(1) - 1);

@slyalin @Wovchena

Owner Author: Ideally ov::Tensor should have functions similar to torch.Tensor, broadcasting and all the rest. Use cases like this can certainly help prioritize which functions to implement, but my perception was that OpenVINO tries to keep the Tensor API minimal.

Reply: But we also want to avoid boilerplate code in samples, so we can extend ov::Tensor with more helpers / constructors.

Reply: The proposed method is similar to https://pytorch.org/docs/stable/generated/torch.Tensor.new_full.html#torch.Tensor.new_full. Maybe we can create static methods ov::Tensor::new_full and ov::Tensor::new_ones.

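A minimal sketch of the kind of helper discussed in this thread; `new_full_i64` is a hypothetical free function invented here for illustration, not part of the current ov::Tensor API:

```cpp
#include <algorithm>
#include <openvino/openvino.hpp>

// Hypothetical helper in the spirit of torch.Tensor.new_full: allocate a tensor
// of the given shape and fill every element with the same value.
static ov::Tensor new_full_i64(const ov::Shape& shape, int64_t value) {
    ov::Tensor tensor{ov::element::i64, shape};
    std::fill_n(tensor.data<int64_t>(), tensor.get_size(), value);
    return tensor;
}
```

With such a helper, the calls proposed in the comment would read, for example, `ireq.set_tensor("position_ids", new_full_i64({batch_size, 1}, length_count + prompt_length));`.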
ireq.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()});
}
for (Group& group : group_beam_searcher.groups) {
if (!group.done) {