From ac7d39ffe66b04a52df69ad7950b4d7963d7f681 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Thu, 21 Nov 2024 22:08:05 +0100
Subject: [PATCH 1/6] parametrize decode in Tokenizers

---
 src/cpp/include/openvino/genai/tokenizer.hpp  | 45 +++++++++-
 .../src/make_combine_segments_stateful.cpp    | 44 ++++++++++
 .../src/make_combine_segments_stateful.hpp    | 37 +++++++++
 src/cpp/src/tokenizer.cpp                     | 82 +++++++++++--------
 src/python/py_tokenizer.cpp                   | 24 ++++--
 tests/python_tests/test_chat_generate_api.py  | 21 +++++
 6 files changed, 208 insertions(+), 45 deletions(-)
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index bcb8da68a3..e90e9c80de 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -87,23 +87,59 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens vector storing tokens
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
     * @return sequence string
     */
-    std::string decode(std::vector<int64_t> tokens);
-    
+    std::string decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params = {});
+
+    /**
+    * @brief decode sequence of tokens
+    * @param tokens vector storing tokens
+    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @return sequence string
+    */
+    template <typename... Properties>
+    util::EnableIfAllStringAny<std::string, Properties...> decode(std::vector<int64_t>& tokens, Properties&&... properties) {
+        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    }
+
     /**
     * @brief decode tokens. 
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
+    * @return vector of std::string, with size = batch_size
+    */
+    std::vector<std::string> decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {});
+
+    /**
+    * @brief decode sequence of tokens
+    * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
+    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
     * @return vector of std::string, with size = batch_size
     */
-    std::vector<std::string> decode(ov::Tensor tokens);
+    template <typename... Properties>
+    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(ov::Tensor tokens, Properties&&... properties) {
+        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    }
 
     /**
     * @brief batched decoding of tokens. 
     * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
     * @return vector of std::string, with size equal to batch_size
     */
-    std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens);
+    std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens, const ov::AnyMap& detokenization_params = {});
+
+    /**
+    * @brief decode sequence of tokens
+    * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
+    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @return vector of std::string, with size = batch_size
+    */
+    template <typename... Properties>
+    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(std::vector<std::vector<int64_t>> tokens, Properties&&... properties) {
+        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    }
 
     /**
      * @brief Embeds input prompts with special tags for a chat scenario.
@@ -143,6 +179,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 };
 
 static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
+static constexpr ov::Property<bool> skip_special_tokens{"skip_special_tokens"};
 
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/make_combine_segments_stateful.cpp b/src/cpp/src/make_combine_segments_stateful.cpp
index 2285c172dc..26c58b8fca 100644
--- a/src/cpp/src/make_combine_segments_stateful.cpp
+++ b/src/cpp/src/make_combine_segments_stateful.cpp
@@ -4,6 +4,8 @@
 #include "make_combine_segments_stateful.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/select.hpp"
+#include "openvino/op/slice.hpp"
+#include "openvino/op/multiply.hpp"
 #include "openvino/op/read_value.hpp"
 #include "openvino/op/assign.hpp"
 
@@ -44,3 +46,45 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr
     model->add_variables({variable});
     return true;
 }
+
+bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
+
+    std::shared_ptr<ov::Node> vocab_decoder_node;
+    for (auto node: model->get_ordered_ops()) {
+        if (strcmp(node->get_type_info().name, "VocabDecoder") == 0) {
+            vocab_decoder_node = node;
+        }
+    }
+    auto val = vocab_decoder_node->input_value(4);
+    auto val_type = vocab_decoder_node->input_value(4).get_element_type();
+
+    if (!vocab_decoder_node || !vocab_decoder_node->input_value(4).get_element_type().is_integral_number()) {
+        return false;
+    }
+    
+    std::shared_ptr<v0::Constant> skip_tokens_const = std::dynamic_pointer_cast<v0::Constant>(vocab_decoder_node->get_input_node_shared_ptr(4));
+    if (!skip_tokens_const) {
+        return false;
+    }
+
+
+    auto start_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{0});
+    auto int_max_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{std::numeric_limits<int>::max()});
+    auto one_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{1});
+    
+    // By default, INT_MAX will multiply with 1 and all skip_tokens will be selected.
+    op::util::VariableInfo var_info{ov::Shape{1}, ov::element::i32, SKIP_SPECIAL_TOKENS_VAR_ID};
+    auto variable = std::make_shared<op::util::Variable>(var_info);
+    auto read_value = std::make_shared<v6::ReadValue>(one_const, variable);
+    // if flag is set, then slice up to the int_max which means skip all tokens.
+    auto stop = std::make_shared<v1::Multiply>(int_max_const, read_value);
+
+    std::shared_ptr<v8::Slice> slice_node = std::make_shared<v8::Slice>(skip_tokens_const, start_const, stop, one_const);
+    
+    vocab_decoder_node->input(4).replace_source_output(slice_node->output(0));
+
+    auto assign = std::make_shared<v6::Assign>(read_value, variable);
+    model->add_sinks({assign});
+    model->add_variables({variable});
+    return true;
+}
\ No newline at end of file
diff --git a/src/cpp/src/make_combine_segments_stateful.hpp b/src/cpp/src/make_combine_segments_stateful.hpp
index 6365497140..307c6199c8 100644
--- a/src/cpp/src/make_combine_segments_stateful.hpp
+++ b/src/cpp/src/make_combine_segments_stateful.hpp
@@ -38,7 +38,44 @@ class MakeCombineSegmentsSatateful : public ov::pass::ModelPass {
     bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
 };
 
+/** 
+ * @brief This pass modifies detokenizer ov::Model so that skipping of special tokens
+ *  by VocabDecoder will be enabled or disabled depending on stateful value.
+ *                                          
+ *                                  +--------------+
+ *                                  |  DefaultMode |
+ *                                  +--------------+
+ *                                         |
+ *                                         v
+ *                                  +------------+   +-----------+
+ *                                  |  ReadValue |   |  INT_MAX  |
+ *                                  +------------+   +-----------+
+ *                                          \           /
+ *                                           \         /
+ *                                            v       v
+ *   +--------------------+     +---------+  +---------+
+ *   |  Const with tokens |     |  start  |  |   Mul   |
+ *   +--------------------+     +---------+  +---------+
+ *                         \          |          /
+ *                           \        |         /
+ *                             v      v        v
+ *                            +-----------------+
+ *                            |      Slice      |
+ *                            +-----------------+
+ *                                     |
+ *                                     v
+ *                          +----------------------+
+ *                          |     VocabDecoder     |
+ *                          +----------------------+
+**/
+class MakeVocabDecoderSatateful : public ov::pass::ModelPass {
+public:
+    OPENVINO_RTTI("MakeVocabDecoderSatateful", "0");
+    bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
+};
+
 const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens";
+const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
 
 } // namespace genai
 } // namespace ov
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index f52417a94e..78b94915dd 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -10,6 +10,7 @@
 #include <jinja2cpp/generic_list.h>
 #include <jinja2cpp/generic_list_iterator.h>
 
+#include "openvino/pass/visualize_tree.hpp"
 #include "openvino/pass/manager.hpp"
 #include "openvino/runtime/core.hpp"
 #include "openvino/genai/tokenizer.hpp"
@@ -73,7 +74,8 @@ class Tokenizer::TokenizerImpl {
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_detokenizer;
     // To change the adding special tokens mode we use a statefull subgraph, 
     // this flag holds the current state value of the CompiledModel.
-    bool m_add_special_tokens = true;  
+    bool m_add_special_tokens = true;
+    bool m_skip_special_tokens = true;  // detokenizer ReadValue starts at 1, i.e. skipping is on.
     bool m_older_than_24_5 = false;
     
     int64_t m_pad_token_id = -1;
@@ -86,11 +88,16 @@ class Tokenizer::TokenizerImpl {
 
     std::string m_chat_template = {};
 
-    void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, bool add_special_tokens) {
+    void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
+        bool add_special_tokens_flag = true;
+        bool skip_special_tokens_flag = true;  // must match the initial value of m_skip_special_tokens.
+        ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
+        ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
+
         // If user requested add_special_tokens mode different from the current one,
         // need to set state variable.
         // If requested mode matches the stored state set, then don't touch states.
-        if (add_special_tokens == m_add_special_tokens) {
+        if (add_special_tokens_flag == m_add_special_tokens && skip_special_tokens_flag == m_skip_special_tokens) {
             return;
         }
         if (m_older_than_24_5) {
@@ -100,19 +107,23 @@ class Tokenizer::TokenizerImpl {
             return;
         }
         
-        // auto states = m_ireq_queue_tokenizer->get(0).query_state();
+        // add_special_tokens is managed by Select op with a bool input.
         ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {});
-        *add_special_tensor.data<bool>() = add_special_tokens;
+        *add_special_tensor.data<bool>() = add_special_tokens_flag;
+        
+        // skip_special_tokens is managed by multiplication with a number, therefore i32.
+        ov::Tensor skip_special_tensor = ov::Tensor(ov::element::i32, {1});
+        *skip_special_tensor.data<int>() = skip_special_tokens_flag;
 
         for (auto& state: infer_request_guard.get().query_state()) {
-            if (state.get_name().find(ov::genai::ADD_SPECIAL_TOKENS_VAR_ID) == std::string::npos) {
-                // It's not add_special_tokens flag state.
-                continue;
+            if (state.get_name().find(ov::genai::ADD_SPECIAL_TOKENS_VAR_ID) != std::string::npos) {
+                state.set_state(add_special_tensor);
+                m_add_special_tokens = add_special_tokens_flag;  // cache only a state this request really holds.
+            } else if (state.get_name().find(ov::genai::SKIP_SPECIAL_TOKENS_VAR_ID) != std::string::npos) {
+                state.set_state(skip_special_tensor);
+                m_skip_special_tokens = skip_special_tokens_flag;  // same: updated only when actually written.
             }
-            state.set_state(add_special_tensor);
-            break;            
         }
-        m_add_special_tokens = add_special_tokens;
     }
 
     TokenizerImpl() = default;
@@ -135,15 +146,25 @@ class Tokenizer::TokenizerImpl {
 
         auto device = "CPU"; // currently openvino_tokenizer supports only CPU
         auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml");
+        std::shared_ptr<ov::Model> ov_detokenizer;
+        if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) {
+            ov_detokenizer = core.read_model(tokenizer_path / "openvino_detokenizer.xml");
+        }
         m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1;
         
-        ov::pass::Manager manager;
-        manager.register_pass<MakeCombineSegmentsSatateful>();
-        manager.run_passes(ov_tokenizer);
+        ov::pass::Manager manager_tok;
+        manager_tok.register_pass<MakeCombineSegmentsSatateful>();
+        manager_tok.run_passes(ov_tokenizer);
+        
+        ov::pass::Manager manager_detok;
+        manager_detok.register_pass<ov::pass::VisualizeTree>("before.svg");
+        manager_detok.register_pass<MakeVocabDecoderSatateful>();
+        manager_detok.register_pass<ov::pass::VisualizeTree>("after.svg");
+        manager_detok.run_passes(ov_detokenizer);
         
         m_tokenizer = core.compile_model(ov_tokenizer, device, properties);
         if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) {
-            m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", device, properties);
+            m_detokenizer = core.compile_model(ov_detokenizer, device, properties);
         }
 
         
@@ -298,11 +319,8 @@ class Tokenizer::TokenizerImpl {
     }
 
     TokenizedInputs encode(std::string prompt, const ov::AnyMap& tokenization_params = {}) {
-        bool add_special_tokens_flag = true;
-        ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag);
-
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
-        set_state_if_necessary(infer_request_guard, add_special_tokens_flag);
+        set_state_if_necessary(infer_request_guard, tokenization_params);
         size_t batch_size = 1;
         infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt});
         infer_request_guard.get().start_async();
@@ -316,11 +334,8 @@ class Tokenizer::TokenizerImpl {
     TokenizedInputs encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params = {}) {
         TokenizedInputs unpadded;
         {
-            bool add_special_tokens_flag = true;
-            ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag);
-
             CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
-            set_state_if_necessary(infer_request_guard, add_special_tokens_flag);
+            set_state_if_necessary(infer_request_guard, tokenization_params);
             infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
             auto size_ = infer_request_guard.get().get_input_tensor().get_shape();
             infer_request_guard.get().start_async();
@@ -343,10 +358,11 @@ class Tokenizer::TokenizerImpl {
         return {input_ids_, attention_mask_};
     }
 
-    std::string decode(std::vector<int64_t> tokens) {
+    std::string decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params = {}) {
         OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available");
 
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
+        set_state_if_necessary(infer_request_guard, detokenization_params);
         size_t batch_size = 1;
         infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()});
         infer_request_guard.get().start_async();
@@ -354,12 +370,13 @@ class Tokenizer::TokenizerImpl {
         return infer_request_guard.get().get_output_tensor().data<std::string>()[0];
     }
 
-    std::vector<std::string> decode(ov::Tensor tokens) {
+    std::vector<std::string> decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {}) {
         OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available");
         OPENVINO_ASSERT(tokens.get_element_type() == ov::element::i64, "tokens tensor element type should be an i64");
         OPENVINO_ASSERT(tokens.get_shape().size() == 2, "tokens tensor should of rank 2 with shape [batch_size, seq_len]");
 
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
+        set_state_if_necessary(infer_request_guard, detokenization_params);
         infer_request_guard.get().set_input_tensor(tokens);
         infer_request_guard.get().start_async();
         infer_request_guard.get().wait();
@@ -369,7 +386,7 @@ class Tokenizer::TokenizerImpl {
         return std::vector<std::string>(res_data, res_data + res.get_shape()[0]);
     }
 
-    std::vector<std::string> decode(std::vector<std::vector<int64_t>> lines) {
+    std::vector<std::string> decode(std::vector<std::vector<int64_t>> lines, const ov::AnyMap& detokenization_params = {}) {
         OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available");
 
         auto compare_lengths = [](const std::vector<int64_t>& a, const std::vector<int64_t>& b) {
@@ -388,6 +405,7 @@ class Tokenizer::TokenizerImpl {
         }
 
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
+        set_state_if_necessary(infer_request_guard, detokenization_params);
         infer_request_guard.get().set_input_tensor(tokens);
         infer_request_guard.get().start_async();
         infer_request_guard.get().wait();
@@ -517,16 +535,16 @@ TokenizedInputs Tokenizer::encode(std::initializer_list<std::string>& text, cons
     return encode(std::vector<std::string>(text.begin(), text.end()), tokenization_params);
 }
 
-std::string Tokenizer::decode(std::vector<int64_t> tokens) {
-    return m_pimpl->decode(tokens);
+std::string Tokenizer::decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params) {
+    return m_pimpl->decode(tokens, detokenization_params);
 }
 
-std::vector<std::string> Tokenizer::decode(ov::Tensor tokens) {
-    return m_pimpl->decode(tokens);
+std::vector<std::string> Tokenizer::decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params) {
+    return m_pimpl->decode(tokens, detokenization_params);
 }
 
-std::vector<std::string> Tokenizer::decode(std::vector<std::vector<int64_t>> lines) {
-    return m_pimpl->decode(lines);
+std::vector<std::string> Tokenizer::decode(std::vector<std::vector<int64_t>> lines, const ov::AnyMap& detokenization_params) {
+    return m_pimpl->decode(lines, detokenization_params);
 }
 
 int64_t Tokenizer::get_bos_token_id() const {
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index 2ccccff4c0..db4643a65c 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -63,27 +63,33 @@ void init_tokenizer(py::module_& m) {
 
         .def(
             "decode",
-            [](Tokenizer& tok, std::vector<int64_t>& tokens) -> py::str {
-                return pyutils::handle_utf8(tok.decode(tokens));
+            [](Tokenizer& tok, std::vector<int64_t>& tokens, bool skip_special_tokens) -> py::str {
+                ov::AnyMap detokenization_params;
+                detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
+                return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"),
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode a sequence into a string prompt.)"
         )
 
         .def(
             "decode",
-            [](Tokenizer& tok, ov::Tensor& tokens) -> py::typing::List<py::str> {
-                return pyutils::handle_utf8(tok.decode(tokens));
+            [](Tokenizer& tok, ov::Tensor& tokens, bool skip_special_tokens) -> py::typing::List<py::str> {
+                ov::AnyMap detokenization_params;
+                detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
+                return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"),
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode tensor into a list of string prompts.)")
 
         .def(
             "decode",
-            [](Tokenizer& tok, std::vector<std::vector<int64_t>>& tokens) -> py::typing::List<py::str> {
-                return pyutils::handle_utf8(tok.decode(tokens));
+            [](Tokenizer& tok, std::vector<std::vector<int64_t>>& tokens, bool skip_special_tokens) -> py::typing::List<py::str> {
+                ov::AnyMap detokenization_params;
+                detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
+                return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"),
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode a batch of tokens into a list of string prompt.)")
 
         .def("apply_chat_template", [](Tokenizer& tok,
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index 25d0798994..a87a2c7555 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -217,3 +217,24 @@ def test_add_special_tokens(add_special_tokens, prompt):
     res_genai = genai_tokenzier.encode(prompt, add_special_tokens).input_ids.data
     res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
     assert np.all(res_genai == res_hf)
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("add_special_tokens", [True, False])
+@pytest.mark.parametrize("skip_special_tokens", [True, False])
+@pytest.mark.parametrize("prompt", prompts)
+def test_add_special_tokens(add_special_tokens, skip_special_tokens, prompt):
+    import numpy as np
+    model_descr = get_chat_models_list()[0]
+    model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    genai_tokenizer = pipe.get_tokenizer()
+    
+    # Calling encode with add_special_tokens will set state flag.
+    res_genai = genai_tokenizer.encode(prompt, add_special_tokens).input_ids.data
+    res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
+    assert np.all(res_genai == res_hf)
+    
+    # Decode with skip_special_tokens
+    decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)
+    decoded_hf = hf_tokenizer.decode(res_hf[0], skip_special_tokens=skip_special_tokens)
+    assert decoded_genai == decoded_hf

From e46466d94124aa73daa34a91cf94a7e0ce4e1265 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Thu, 21 Nov 2024 22:10:47 +0100
Subject: [PATCH 2/6] rename pass

---
 .github/labeler.yml                                          | 4 ++--
 ...ine_segments_stateful.cpp => make_tokenizer_stateful.cpp} | 2 +-
 ...ine_segments_stateful.hpp => make_tokenizer_stateful.hpp} | 0
 src/cpp/src/tokenizer.cpp                                    | 5 +----
 4 files changed, 4 insertions(+), 7 deletions(-)
 rename src/cpp/src/{make_combine_segments_stateful.cpp => make_tokenizer_stateful.cpp} (98%)
 rename src/cpp/src/{make_combine_segments_stateful.hpp => make_tokenizer_stateful.hpp} (100%)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index c5d0db312c..c162f6aff4 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -8,8 +8,8 @@
 - 'src/cpp/src/tokenizers_path.hpp'
 - 'src/cpp/src/circular_buffer_queue.hpp'
 - 'src/cpp/src/synchronized_queue.hpp'
-- 'src/cpp/src/make_combine_segments_stateful.cpp'
-- 'src/cpp/src/make_combine_segments_stateful.hpp'
+- 'src/cpp/src/make_tokenizer_stateful.cpp'
+- 'src/cpp/src/make_tokenizer_stateful.hpp'
 - 'src/python/py_tokenizer.cpp'
 - 'thirdparty/openvino_tokenizers'
 - 'tests/python_tests/tokenizer_configs.py'
diff --git a/src/cpp/src/make_combine_segments_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp
similarity index 98%
rename from src/cpp/src/make_combine_segments_stateful.cpp
rename to src/cpp/src/make_tokenizer_stateful.cpp
index 26c58b8fca..538a935e56 100644
--- a/src/cpp/src/make_combine_segments_stateful.cpp
+++ b/src/cpp/src/make_tokenizer_stateful.cpp
@@ -1,7 +1,7 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#include "make_combine_segments_stateful.hpp"
+#include "make_tokenizer_stateful.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/select.hpp"
 #include "openvino/op/slice.hpp"
diff --git a/src/cpp/src/make_combine_segments_stateful.hpp b/src/cpp/src/make_tokenizer_stateful.hpp
similarity index 100%
rename from src/cpp/src/make_combine_segments_stateful.hpp
rename to src/cpp/src/make_tokenizer_stateful.hpp
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 78b94915dd..fc6ba75d90 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -10,12 +10,11 @@
 #include <jinja2cpp/generic_list.h>
 #include <jinja2cpp/generic_list_iterator.h>
 
-#include "openvino/pass/visualize_tree.hpp"
 #include "openvino/pass/manager.hpp"
 #include "openvino/runtime/core.hpp"
 #include "openvino/genai/tokenizer.hpp"
 
-#include "make_combine_segments_stateful.hpp"
+#include "make_tokenizer_stateful.hpp"
 #include "tokenizers_path.hpp"
 #include "circular_buffer_queue.hpp"
 #include "json_utils.hpp"
@@ -157,9 +156,7 @@ class Tokenizer::TokenizerImpl {
         manager_tok.run_passes(ov_tokenizer);
         
         ov::pass::Manager manager_detok;
-        manager_detok.register_pass<ov::pass::VisualizeTree>("before.svg");
         manager_detok.register_pass<MakeVocabDecoderSatateful>();
-        manager_detok.register_pass<ov::pass::VisualizeTree>("after.svg");
         manager_detok.run_passes(ov_detokenizer);
         
         m_tokenizer = core.compile_model(ov_tokenizer, device, properties);

From 4529dec255b603d711a479f1a90c4cbec9ae3ebf Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Fri, 22 Nov 2024 10:49:11 +0100
Subject: [PATCH 3/6] fix typos

---
 src/cpp/include/openvino/genai/tokenizer.hpp | 28 ++++++++++----------
 src/cpp/src/make_tokenizer_stateful.cpp      | 17 +++++-------
 src/python/py_tokenizer.cpp                  |  6 ++---
 tests/python_tests/test_chat_generate_api.py |  3 ++-
 4 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index e90e9c80de..8d2d63ea80 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -47,7 +47,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief encode a single prompt
     * @param prompt std::string with input prompt
-    * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false}
+    * @param tokenization_params AnyMap with tokenization parameters, e.g. {"add_special_tokens", false}
     * @return pair of [input_ids, attention_mask]
     */
     TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {});
@@ -55,7 +55,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief encode batch of prompts. Left padding will be applied by default
     * @param prompts vector storing batch of prompts
-    * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false}
+    * @param tokenization_params AnyMap with tokenization parameters, e.g. {"add_special_tokens", false}
     * @return pair of [input_ids, attention_mask]
     */
     TokenizedInputs encode(std::vector<std::string>& prompt, const ov::AnyMap& tokenization_params = {});
@@ -87,7 +87,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens vector storing tokens
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
+    * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return sequence string
     */
     std::string decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params = {});
@@ -95,18 +95,18 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens vector storing tokens
-    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @param detokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
     * @return sequence string
     */
     template <typename... Properties>
-    util::EnableIfAllStringAny<std::string, Properties...> decode(std::vector<int64_t>& tokens, Properties&&... properties) {
-        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    util::EnableIfAllStringAny<std::string, Properties...> decode(std::vector<int64_t>& tokens, Properties&&... detokenization_params) {
+        return decode(tokens, AnyMap{std::forward<Properties>(detokenization_params)...});
     }
 
     /**
     * @brief decode tokens. 
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
+    * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size = batch_size
     */
     std::vector<std::string> decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {});
@@ -114,18 +114,18 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
-    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @param detokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
     * @return vector of std::string, with size = batch_size
     */
     template <typename... Properties>
-    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(ov::Tensor tokens, Properties&&... properties) {
-        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(ov::Tensor tokens, Properties&&... detokenization_params) {
+        return decode(tokens, AnyMap{std::forward<Properties>(detokenization_params)...});
     }
 
     /**
     * @brief batched decoding of tokens. 
     * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size equal to batch_size
     */
     std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens, const ov::AnyMap& detokenization_params = {});
@@ -133,12 +133,12 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
-    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @param detokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
     * @return vector of std::string, with size = batch_size
     */
     template <typename... Properties>
-    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(std::vector<std::vector<int64_t>> tokens, Properties&&... properties) {
-        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(std::vector<std::vector<int64_t>> tokens, Properties&&... detokenization_params) {
+        return decode(tokens, AnyMap{std::forward<Properties>(detokenization_params)...});
     }
 
     /**
diff --git a/src/cpp/src/make_tokenizer_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp
index 538a935e56..3551e713c9 100644
--- a/src/cpp/src/make_tokenizer_stateful.cpp
+++ b/src/cpp/src/make_tokenizer_stateful.cpp
@@ -48,25 +48,20 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr
 }
 
 bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
-
     std::shared_ptr<ov::Node> vocab_decoder_node;
     for (auto node: model->get_ordered_ops()) {
-        if (strcmp(node->get_type_info().name, "VocabDecoder") == 0) {
+        if (strcmp(node->get_type_info().name, "VocabDecoder") == 0)
             vocab_decoder_node = node;
-        }
     }
-    auto val = vocab_decoder_node->input_value(4);
-    auto val_type = vocab_decoder_node->input_value(4).get_element_type();
 
-    if (!vocab_decoder_node || !vocab_decoder_node->input_value(4).get_element_type().is_integral_number()) {
+    if (!vocab_decoder_node || vocab_decoder_node->get_input_size() < 5)
+        return false;
+    if (!vocab_decoder_node->input_value(4).get_element_type().is_integral_number())
         return false;
-    }
     
     std::shared_ptr<v0::Constant> skip_tokens_const = std::dynamic_pointer_cast<v0::Constant>(vocab_decoder_node->get_input_node_shared_ptr(4));
-    if (!skip_tokens_const) {
+    if (!skip_tokens_const)
         return false;
-    }
-
 
     auto start_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{0});
     auto int_max_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{std::numeric_limits<int>::max()});
@@ -87,4 +82,4 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov
     model->add_sinks({assign});
     model->add_variables({variable});
     return true;
-}
\ No newline at end of file
+}
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index db4643a65c..dae2ffe775 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -68,7 +68,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = true,
+            py::arg("tokens"), py::arg("skip_special_tokens") = false,
             R"(Decode a sequence into a string prompt.)"
         )
 
@@ -79,7 +79,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = true,
+            py::arg("tokens"), py::arg("skip_special_tokens") = false,
             R"(Decode tensor into a list of string prompts.)")
 
         .def(
@@ -89,7 +89,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = true,
+            py::arg("tokens"), py::arg("skip_special_tokens") = false,
             R"(Decode a batch of tokens into a list of string prompt.)")
 
         .def("apply_chat_template", [](Tokenizer& tok,
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index a87a2c7555..efd1d87416 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -219,6 +219,7 @@ def test_add_special_tokens(add_special_tokens, prompt):
     assert np.all(res_genai == res_hf)
 
 @pytest.mark.precommit
+@pytest.mark.xfail(reason="Need to turn them back on when openvino_tokenizers will be updated.")
 @pytest.mark.nightly
 @pytest.mark.parametrize("add_special_tokens", [True, False])
 @pytest.mark.parametrize("skip_special_tokens", [True, False])
@@ -235,6 +236,6 @@ def test_add_special_tokens(add_special_tokens, skip_special_tokens, prompt):
     assert np.all(res_genai == res_hf)
     
     # Decode with skip_special_tokens
-    decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)
+    decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)[0]
     decoded_hf = hf_tokenizer.decode(res_hf[0], skip_special_tokens=skip_special_tokens)
     assert decoded_genai == decoded_hf

From 21037497e6958c7df020131d77984a953a4beb08 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Mon, 25 Nov 2024 12:09:04 +0100
Subject: [PATCH 4/6] align with the openvino_tokenizers

---
 src/cpp/include/openvino/genai/tokenizer.hpp |  6 +++---
 src/cpp/src/make_tokenizer_stateful.cpp      | 13 +++++++++----
 src/cpp/src/tokenizer.cpp                    |  5 ++---
 src/python/py_tokenizer.cpp                  |  6 +++---
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 8d2d63ea80..36f63d2b5e 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -87,7 +87,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens vector storing tokens
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
+    * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return sequence string
     */
     std::string decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params = {});
@@ -106,7 +106,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode tokens. 
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
+    * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size = batch_size
     */
     std::vector<std::string> decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {});
@@ -125,7 +125,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief batched decoding of tokens. 
     * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
+    * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size equal to batch_size
     */
     std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens, const ov::AnyMap& detokenization_params = {});
diff --git a/src/cpp/src/make_tokenizer_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp
index 3551e713c9..4685b0e715 100644
--- a/src/cpp/src/make_tokenizer_stateful.cpp
+++ b/src/cpp/src/make_tokenizer_stateful.cpp
@@ -60,7 +60,8 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov
         return false;
     
     std::shared_ptr<v0::Constant> skip_tokens_const = std::dynamic_pointer_cast<v0::Constant>(vocab_decoder_node->get_input_node_shared_ptr(4));
-    if (!skip_tokens_const)
+    std::shared_ptr<v8::Slice> skip_tokens_slice = std::dynamic_pointer_cast<v8::Slice>(vocab_decoder_node->get_input_node_shared_ptr(4));
+    if (!skip_tokens_const && !skip_tokens_slice)
         return false;
 
     auto start_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{0});
@@ -74,10 +75,14 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov
     // if flag is set, then slice up to the int_max which means skip all tokens.
     auto stop = std::make_shared<v1::Multiply>(int_max_const, read_value);
 
-    std::shared_ptr<v8::Slice> slice_node = std::make_shared<v8::Slice>(skip_tokens_const, start_const, stop, one_const);
+    // If input 4 is already fed by a Slice, just replace that Slice's stop input.
+    if (skip_tokens_slice) {
+        skip_tokens_slice->input(2).replace_source_output(stop);
+    } else {
+        std::shared_ptr<v8::Slice> slice_node = std::make_shared<v8::Slice>(skip_tokens_const, start_const, stop, one_const);
+        vocab_decoder_node->input(4).replace_source_output(slice_node->output(0));
+    }
     
-    vocab_decoder_node->input(4).replace_source_output(slice_node->output(0));
-
     auto assign = std::make_shared<v6::Assign>(read_value, variable);
     model->add_sinks({assign});
     model->add_variables({variable});
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index fc6ba75d90..d0a472a40f 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -74,7 +74,7 @@ class Tokenizer::TokenizerImpl {
     // To change the adding special tokens mode we use a statefull subgraph, 
     // this flag holds the current state value of the CompiledModel.
     bool m_add_special_tokens = true;
-    bool m_skip_special_tokens = false;
+    bool m_skip_special_tokens = true;
     bool m_older_than_24_5 = false;
     
     int64_t m_pad_token_id = -1;
@@ -89,7 +89,7 @@ class Tokenizer::TokenizerImpl {
 
     void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
         bool add_special_tokens_flag = true;
-        bool skip_special_tokens_flag = false;
+        bool skip_special_tokens_flag = true;
         ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
         ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
 
@@ -164,7 +164,6 @@ class Tokenizer::TokenizerImpl {
             m_detokenizer = core.compile_model(ov_detokenizer, device, properties);
         }
 
-        
         const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests);
         m_ireq_queue_tokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
             INFER_REQUEST_QUEUE_SIZE,
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index dae2ffe775..db4643a65c 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -68,7 +68,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = false,
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode a sequence into a string prompt.)"
         )
 
@@ -79,7 +79,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = false,
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode tensor into a list of string prompts.)")
 
         .def(
@@ -89,7 +89,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = false,
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode a batch of tokens into a list of string prompt.)")
 
         .def("apply_chat_template", [](Tokenizer& tok,

From d26233b172d60063e50257058513a560e8e591b1 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Mon, 25 Nov 2024 12:56:37 +0100
Subject: [PATCH 5/6] update signature

---
 src/python/openvino_genai/py_openvino_genai.pyi | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index df290a9744..5e4d2dd7b2 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -1303,17 +1303,17 @@ class Tokenizer:
         Embeds input prompts with special tags for a chat scenario.
         """
     @typing.overload
-    def decode(self, tokens: list[int]) -> str:
+    def decode(self, tokens: list[int], skip_special_tokens: bool = True) -> str:
         """
         Decode a sequence into a string prompt.
         """
     @typing.overload
-    def decode(self, tokens: openvino._pyopenvino.Tensor) -> list[str]:
+    def decode(self, tokens: openvino._pyopenvino.Tensor, skip_special_tokens: bool = True) -> list[str]:
         """
         Decode tensor into a list of string prompts.
         """
     @typing.overload
-    def decode(self, tokens: list[list[int]]) -> list[str]:
+    def decode(self, tokens: list[list[int]], skip_special_tokens: bool = True) -> list[str]:
         """
         Decode a batch of tokens into a list of string prompt.
         """

From 111bb5bb2afe5b6cc4b01ea935ed7af38c6075de Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Tue, 26 Nov 2024 10:45:37 +0100
Subject: [PATCH 6/6] add barrier for AnyMap key names, apply review comments

---
 src/cpp/src/tokenizer.cpp | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index d0a472a40f..41f9a6abd4 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -55,6 +55,14 @@ ov::genai::TokenizedInputs pad_left(ov::Tensor& input_ids, ov::Tensor& attention
     return {input_ids, attention_mask};
 }
 
+void check_arguments(const ov::AnyMap& parameters, std::set<std::string> allowed_argnames) {
+    for (const auto& [key, value] : parameters) {
+        if (allowed_argnames.find(key) == allowed_argnames.end()) {
+            OPENVINO_THROW("unacceptable parameter key: " + key);
+        }
+    }
+}
+
 constexpr char bos_token_key_name[] = "bos_token";
 constexpr char eos_token_key_name[] = "eos_token";
 constexpr char pad_token_key_name[] = "pad_token";
@@ -88,8 +96,8 @@ class Tokenizer::TokenizerImpl {
     std::string m_chat_template = {};
 
     void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
-        bool add_special_tokens_flag = true;
-        bool skip_special_tokens_flag = true;
+        bool add_special_tokens_flag = m_add_special_tokens;
+        bool skip_special_tokens_flag = m_skip_special_tokens;
         ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
         ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
 
@@ -145,7 +153,7 @@ class Tokenizer::TokenizerImpl {
 
         auto device = "CPU"; // currently openvino_tokenizer supports only CPU
         auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml");
-        std::shared_ptr<ov::Model> ov_detokenizer;
+        std::shared_ptr<ov::Model> ov_detokenizer = nullptr;
         if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) {
             ov_detokenizer = core.read_model(tokenizer_path / "openvino_detokenizer.xml");
         }
@@ -155,12 +163,11 @@ class Tokenizer::TokenizerImpl {
         manager_tok.register_pass<MakeCombineSegmentsSatateful>();
         manager_tok.run_passes(ov_tokenizer);
         
-        ov::pass::Manager manager_detok;
-        manager_detok.register_pass<MakeVocabDecoderSatateful>();
-        manager_detok.run_passes(ov_detokenizer);
-        
         m_tokenizer = core.compile_model(ov_tokenizer, device, properties);
-        if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) {
+        if (ov_detokenizer) {
+            ov::pass::Manager manager_detok;
+            manager_detok.register_pass<MakeVocabDecoderSatateful>();
+            manager_detok.run_passes(ov_detokenizer);
             m_detokenizer = core.compile_model(ov_detokenizer, device, properties);
         }
 
@@ -516,30 +523,37 @@ Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyM
 }
 
 TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) {
+    check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
     return m_pimpl->encode(std::move(prompt), tokenization_params);
 }
 
 TokenizedInputs Tokenizer::encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params) {
+    check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
     return m_pimpl->encode(prompts, tokenization_params);
 }
 
 TokenizedInputs Tokenizer::encode(std::vector<std::string>&& prompts, const ov::AnyMap& tokenization_params) {
+    check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
     return m_pimpl->encode(prompts, tokenization_params);
 }
 
 TokenizedInputs Tokenizer::encode(std::initializer_list<std::string>& text, const ov::AnyMap& tokenization_params) {
+    check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
     return encode(std::vector<std::string>(text.begin(), text.end()), tokenization_params);
 }
 
 std::string Tokenizer::decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params) {
+    check_arguments(detokenization_params, {ov::genai::skip_special_tokens.name()});
     return m_pimpl->decode(tokens, detokenization_params);
 }
 
 std::vector<std::string> Tokenizer::decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params) {
+    check_arguments(detokenization_params, {ov::genai::skip_special_tokens.name()});
     return m_pimpl->decode(tokens, detokenization_params);
 }
 
 std::vector<std::string> Tokenizer::decode(std::vector<std::vector<int64_t>> lines, const ov::AnyMap& detokenization_params) {
+    check_arguments(detokenization_params, {ov::genai::skip_special_tokens.name()});
     return m_pimpl->decode(lines, detokenization_params);
 }