Skip to content

Commit

Permalink
Build w/o FastTokenizers (openvinotoolkit#305)
Browse files Browse the repository at this point in the history
  • Loading branch information
ilya-lavrenov authored Oct 29, 2024
1 parent 7213eac commit 1ddb12e
Show file tree
Hide file tree
Showing 8 changed files with 253 additions and 245 deletions.
10 changes: 6 additions & 4 deletions src/charsmap_normalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
using namespace ov;

namespace {
std::shared_ptr<sentencepiece::NormalizerSpec> make_identity_spec() {
auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec("identity");
return std::make_shared<sentencepiece::NormalizerSpec>(spec);
}

std::shared_ptr<sentencepiece::NormalizerSpec> make_identity_spec() {
auto spec = sentencepiece::SentencePieceTrainer::GetNormalizerSpec("identity");
return std::make_shared<sentencepiece::NormalizerSpec>(spec);
}

} // namespace


Expand Down
6 changes: 3 additions & 3 deletions src/ov_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ OPENVINO_CREATE_EXTENSIONS(
std::make_shared<ov::OpExtension<ByteFallback>>(),
std::make_shared<ov::OpExtension<SpecialTokensSplit>>(),
std::make_shared<ov::OpExtension<CharsMapNormalization>>(),
std::make_shared<ov::OpExtension<TemplateExtension::SentencepieceTokenizer>>(),
std::make_shared<ov::OpExtension<TemplateExtension::SentencepieceDetokenizer>>(),
std::make_shared<ov::OpExtension<TemplateExtension::SentencepieceStreamDetokenizer>>(),
std::make_shared<ov::OpExtension<SentencepieceTokenizer>>(),
std::make_shared<ov::OpExtension<SentencepieceDetokenizer>>(),
std::make_shared<ov::OpExtension<SentencepieceStreamDetokenizer>>(),
OPENVINO_TOKENIZERS_FAST_TOKENIZER_BASED_EXTENSIONS
OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS
OPENVINO_TOKENIZERS_TENSORFLOW_CONVERSION_EXTENSIONS_FAST_TOKENIZER_BASED
Expand Down
2 changes: 1 addition & 1 deletion src/regex_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ const std::map<std::string, RegexSplit::SplitMode> split_modes_map = {
{"mergedwithnext", RegexSplit::SplitMode::MERGED_WITH_NEXT}
};

}
} // namespace

void RegexSplit::compile_pattern_if_necessary(std::string split_pattern) const {
m_split_mode = split_modes_map.at(m_behaviour);
Expand Down
5 changes: 4 additions & 1 deletion src/sentence_piece.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#include "utils.hpp"

using sentencepiece::SentencePieceProcessor;
using namespace TemplateExtension;
using namespace ov;
using namespace ov::frontend;
using namespace ov::opset13;
Expand Down Expand Up @@ -55,6 +54,8 @@ int PieceToByte(absl::string_view piece) {
} // namespace
} // sentencepiece

namespace {

std::string form_extra_options(bool add_bos, bool add_eos, bool reverse) {
std::string extra_options = "";
if (add_bos) {
Expand Down Expand Up @@ -88,6 +89,8 @@ void init_sp_model_in_eval(const TensorVector& inputs, std::shared_ptr<SentenceP
CHECK_OK(sp->LoadFromSerializedProto(model_proto));
}

} // namespace

SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, int32_t nbest_size, float alpha,
bool add_bos, bool add_eos, bool reverse) : m_sp(std::make_shared<SentencePieceProcessor>()),
m_nbest_size(nbest_size), m_alpha(alpha), m_add_bos(add_bos), m_add_eos(add_eos),
Expand Down
134 changes: 67 additions & 67 deletions src/sentence_piece.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,96 +10,96 @@
#include "re2/re2.h"

namespace sentencepiece {
class SentencePieceProcessor;
int PieceToByte(absl::string_view piece);
}

namespace TemplateExtension {
class SentencepieceTokenizer : public ov::op::Op {
public:
OPENVINO_OP("SentencepieceTokenizer");
class SentencePieceProcessor;
int PieceToByte(absl::string_view piece);

SentencepieceTokenizer() = default;
SentencepieceTokenizer(const ov::OutputVector& args, int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse);
SentencepieceTokenizer(
const ov::OutputVector& args,
const std::shared_ptr<sentencepiece::SentencePieceProcessor>& sp,
const std::shared_ptr<re2::RE2>& special_tokens_re,
const std::shared_ptr<absl::flat_hash_map<std::string, int32_t>>& special_tokens_map,
int32_t nbest_size,
float alpha,
bool add_bos,
bool add_eos,
bool reverse
);
} // sentencepiece

bool visit_attributes(ov::AttributeVisitor& visitor) override;
class SentencepieceTokenizer : public ov::op::Op {
public:
OPENVINO_OP("SentencepieceTokenizer");

void validate_and_infer_types() override;
SentencepieceTokenizer() = default;
SentencepieceTokenizer(const ov::OutputVector& args, int32_t nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse);
SentencepieceTokenizer(
const ov::OutputVector& args,
const std::shared_ptr<sentencepiece::SentencePieceProcessor>& sp,
const std::shared_ptr<re2::RE2>& special_tokens_re,
const std::shared_ptr<absl::flat_hash_map<std::string, int32_t>>& special_tokens_map,
int32_t nbest_size,
float alpha,
bool add_bos,
bool add_eos,
bool reverse
);

std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
bool visit_attributes(ov::AttributeVisitor& visitor) override;

bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
void validate_and_infer_types() override;

bool has_evaluate() const override;
std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

private:
mutable std::shared_ptr<sentencepiece::SentencePieceProcessor> m_sp;
mutable std::shared_ptr<re2::RE2> m_special_tokens_re;
mutable std::shared_ptr<absl::flat_hash_map<std::string, int32_t>> m_special_tokens_map;
mutable std::mutex m_mutex;
int32_t m_nbest_size;
float m_alpha;
bool m_add_bos;
bool m_add_eos;
bool m_reverse;
};
bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;

bool has_evaluate() const override;

class SentencepieceDetokenizer : public ov::op::Op {
public:
OPENVINO_OP("SentencepieceDetokenizer");
private:
mutable std::shared_ptr<sentencepiece::SentencePieceProcessor> m_sp;
mutable std::shared_ptr<re2::RE2> m_special_tokens_re;
mutable std::shared_ptr<absl::flat_hash_map<std::string, int32_t>> m_special_tokens_map;
mutable std::mutex m_mutex;
int32_t m_nbest_size;
float m_alpha;
bool m_add_bos;
bool m_add_eos;
bool m_reverse;
};

SentencepieceDetokenizer() = default;
SentencepieceDetokenizer(const ov::OutputVector& args);
SentencepieceDetokenizer(const ov::OutputVector& args,
const std::shared_ptr<sentencepiece::SentencePieceProcessor>& sp);

bool visit_attributes(ov::AttributeVisitor& visitor) override;
class SentencepieceDetokenizer : public ov::op::Op {
public:
OPENVINO_OP("SentencepieceDetokenizer");

void validate_and_infer_types() override;
SentencepieceDetokenizer() = default;
SentencepieceDetokenizer(const ov::OutputVector& args);
SentencepieceDetokenizer(const ov::OutputVector& args,
const std::shared_ptr<sentencepiece::SentencePieceProcessor>& sp);

std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
bool visit_attributes(ov::AttributeVisitor& visitor) override;

bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
void validate_and_infer_types() override;

bool has_evaluate() const override;
std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

private:
mutable std::shared_ptr<sentencepiece::SentencePieceProcessor> m_sp;
};
bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;

bool has_evaluate() const override;

class SentencepieceStreamDetokenizer : public ov::op::Op {
public:
OPENVINO_OP("SentencepieceStreamDetokenizer");
private:
mutable std::shared_ptr<sentencepiece::SentencePieceProcessor> m_sp;
};

SentencepieceStreamDetokenizer() = default;
SentencepieceStreamDetokenizer(const ov::OutputVector& args);
SentencepieceStreamDetokenizer(const ov::OutputVector& args,
const std::shared_ptr<sentencepiece::SentencePieceProcessor>& sp);

bool visit_attributes(ov::AttributeVisitor& visitor) override;
class SentencepieceStreamDetokenizer : public ov::op::Op {
public:
OPENVINO_OP("SentencepieceStreamDetokenizer");

void validate_and_infer_types() override;
SentencepieceStreamDetokenizer() = default;
SentencepieceStreamDetokenizer(const ov::OutputVector& args);
SentencepieceStreamDetokenizer(const ov::OutputVector& args,
const std::shared_ptr<sentencepiece::SentencePieceProcessor>& sp);

std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
bool visit_attributes(ov::AttributeVisitor& visitor) override;

bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
void validate_and_infer_types() override;

bool has_evaluate() const override;
std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

private:
mutable std::shared_ptr<sentencepiece::SentencePieceProcessor> m_sp;
};
} // namespace TemplateExtension
bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;

bool has_evaluate() const override;

private:
mutable std::shared_ptr<sentencepiece::SentencePieceProcessor> m_sp;
};
Loading

0 comments on commit 1ddb12e

Please sign in to comment.