Skip to content

Commit

Permalink
支持huggingface tokenizers中的ByteLevel处理
Browse files Browse the repository at this point in the history
  • Loading branch information
cgli authored and TylunasLi committed Feb 12, 2024
1 parent 6498f58 commit ad7bf0d
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 0 deletions.
6 changes: 6 additions & 0 deletions include/fastllm.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <iostream>
#include <functional>
#include <memory>
#include <codecvt>
#include "devices/cpu/cputhreadpool.h"

#ifdef USE_SENTENCEPIECE
Expand Down Expand Up @@ -363,10 +364,15 @@ namespace fastllm {

bool add_dummy_prefix = true; // 是否在首位添加空格
bool remove_extra_whitespaces = true; // 是否将多个空格合并为一个
bool byte_as_char = false; // 是否将byte变为展示字符

std::unordered_map <int, std::string> tokenToStringDict;
std::unordered_map <int, float> tokenToScoreDict;
std::unordered_map <std::string, int> stringToTokenDict;

std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::unordered_map <wchar_t, wchar_t> byteCharDict;
std::unordered_map <wchar_t, wchar_t> charByteDict;
#ifdef USE_SENTENCEPIECE
std::unique_ptr<sentencepiece::SentencePieceProcessor> spProcessor;
#endif
Expand Down
35 changes: 35 additions & 0 deletions src/fastllm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,18 @@ namespace fastllm {

Tokenizer::Tokenizer() {
    root = new TrieNode();
    // Build the bidirectional byte <-> display-character maps used by the
    // HuggingFace tokenizers "ByteLevel" scheme (GPT-2 style): every byte
    // that is not a printable character is represented by the code point
    // 0x100 + n, where n counts the non-printable bytes in ascending order.
    int n = 0;
    wchar_t special_token = L'\x0';
    // Bytes 0x00..0x20 (everything before '!').
    for (; special_token < L'!'; special_token++, n++) {
        byteCharDict[L'\x100' + n] = special_token;
        charByteDict[special_token] = L'\x100' + n;
    }
    // Bytes 0x7F..0xA0 (DEL plus the C1 control range and NBSP).
    for (special_token = L'\x7F'; special_token < L'\xA1'; special_token++, n++) {
        byteCharDict[L'\x100' + n] = special_token;
        charByteDict[special_token] = L'\x100' + n;
    }
    // Byte 0xAD (soft hyphen) is the last remapped byte. Both directions
    // must use the SAME offset; the original code incremented n between the
    // two assignments, making the forward and reverse maps disagree for 0xAD.
    byteCharDict[L'\x100' + n] = L'\xAD';
    charByteDict[L'\xAD'] = L'\x100' + n;
    n++;
}

Tokenizer::~Tokenizer() {
Expand All @@ -799,8 +811,13 @@ namespace fastllm {
q.push_back(it.second);
}
}
for (TrieNode * node : q)
delete node;
q.clear();
root = new TrieNode();
tokenToStringDict.clear();
tokenToScoreDict.clear();
stringToTokenDict.clear();
}

void Tokenizer::Insert(const std::string &s, int tokenId, float score) {
Expand Down Expand Up @@ -851,6 +868,15 @@ namespace fastllm {
}

std::string Tokenizer::Normalize(const std::string &ori) {
if (this->byte_as_char) {
std::wstring ws = converter.from_bytes(ori);
for (int i=0; i < ws.length(); i++) {
if (charByteDict.find(ws[i]) != charByteDict.end()) {
ws[i] = charByteDict[ws[i]];
}
}
return converter.to_bytes(ws);
}
std::string blank = "";
blank += 226, blank += 150, blank += 129;
std::string s = this->add_dummy_prefix ? blank : "";
Expand Down Expand Up @@ -1232,6 +1258,15 @@ namespace fastllm {
ret.replace(pos, blank.length(), " ");
else break;
}
if (this->byte_as_char) {
std::wstring wret = converter.from_bytes(ret);
for (int i=0; i < wret.length(); i++) {
if (byteCharDict.find(wret[i]) != byteCharDict.end()) {
wret[i] = byteCharDict[wret[i]];
}
}
ret = converter.to_bytes(wret);
}
int pos = ret.find("<|blank_");
if (pos != -1) {
int space_num = atoi(ret.substr(8, ret.size() - 10).c_str());
Expand Down
6 changes: 6 additions & 0 deletions src/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ namespace fastllm {
std::istringstream iss(value);
iss >> std::boolalpha >> this->weight.tokenizer.remove_extra_whitespaces;
}
if (this->weight.dicts.find("tokenizer_byte_as_char") != this->weight.dicts.end()) {
std::string value = this->weight.dicts["tokenizer_byte_as_char"];
transform(value.begin(), value.end(), value.begin(), ::tolower);
std::istringstream iss(value);
iss >> std::boolalpha >> this->weight.tokenizer.byte_as_char;
}

this->deviceMap = GetDeviceMap();
}
Expand Down
6 changes: 6 additions & 0 deletions tools/fastllm_pytools/hf_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import ctypes;
import numpy as np
import torch
from transformers import PreTrainedTokenizerFast
from tokenizers.decoders import ByteLevel

fastllm_data_type_dict = {
"int4": 8,
Expand Down Expand Up @@ -74,6 +76,10 @@ def create(model,
modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces
except:
pass
elif isinstance(tokenizer, PreTrainedTokenizerFast):
if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \
and isinstance(tokenizer._tokenizer.decoder, ByteLevel):
modelInfo["tokenizer_byte_as_char"] = True

peft_config = {}
active_adapter = ""
Expand Down
6 changes: 6 additions & 0 deletions tools/fastllm_pytools/torch2flm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import struct
import numpy as np
import torch
from transformers import PreTrainedTokenizerFast
from tokenizers.decoders import ByteLevel

def writeString(fo, s):
fo.write(struct.pack('i', len(s)))
Expand Down Expand Up @@ -131,6 +133,10 @@ def tofile(exportPath,
modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces
except:
pass
elif isinstance(tokenizer, PreTrainedTokenizerFast):
if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \
and isinstance(tokenizer._tokenizer.decoder, ByteLevel):
modelInfo["tokenizer_byte_as_char"] = True

if hasattr(model, "peft_config"):
adapter_size = len(model.peft_config)
Expand Down

0 comments on commit ad7bf0d

Please sign in to comment.