Align the whitespace-conversion logic across the python Tokenizer, Huggingface Tokenizers, and SentencePiece for the same model
cgli committed Mar 28, 2024
1 parent 9affcbb commit 57b03e7
Showing 2 changed files with 49 additions and 26 deletions.
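
For context, the divergence this commit aligns can be shown with a short check. A minimal sketch, assuming a local Llama-style SentencePiece checkpoint; "path/to/model" is a placeholder:

# Sketch: compare the slow (python/SentencePiece) and fast (Tokenizers)
# backends on the same input; "path/to/model" is a placeholder path.
from transformers import AutoTokenizer

slow = AutoTokenizer.from_pretrained("path/to/model", use_fast=False)
fast = AutoTokenizer.from_pretrained("path/to/model", use_fast=True)

text = "hello world"
# If add_dummy_prefix (SentencePiece) and the Prepend normalizer
# (Huggingface Tokenizers) disagree, the leading-space token differs.
print(slow.encode(text, add_special_tokens=False))
print(fast.encode(text, add_special_tokens=False))

The exporters below record tokenizer_add_dummy_prefix and tokenizer_remove_extra_whitespaces in modelInfo so the fastllm runtime can replay the same normalization.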
38 changes: 25 additions & 13 deletions tools/fastllm_pytools/hf_model.py
@@ -29,7 +29,7 @@ def create(model,
group = -1):
if (dtype not in fastllm_data_type_dict):
print("dtype should be one of ", list(fastllm_data_type_dict.keys()))
exit(0)
exit(0)

# 0.1 model info
modelInfo = model.config.__dict__
@@ -87,6 +87,8 @@ def create(model,
rope_scaling = modelInfo.pop("rope_scaling")
modelInfo["rope_scaling.type"] = rope_scaling["type"]
modelInfo["rope_scaling.factor"] = rope_scaling["factor"]

merges = {}
if tokenizer:
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
if len(tokenizer.all_special_tokens) > 0:
@@ -105,16 +107,34 @@ def create(model,
sp_model_data = f.read()
sp_model_proto = model_pb2.ModelProto.FromString(sp_model_data)
modelInfo["tokenizer_add_dummy_prefix"] = sp_model_proto.normalizer_spec.add_dummy_prefix
modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces
if sp_model_proto.normalizer_spec.remove_extra_whitespaces:
modelInfo["tokenizer_remove_extra_whitespaces"] = True
except:
pass
elif isinstance(tokenizer, PreTrainedTokenizerFast):
modelInfo["tokenizer_add_dummy_prefix"] = False
tokenizer_file_name = tokenizer.vocab_file if hasattr(tokenizer, "vocab_file") else tokenizer.vocab_files_names['tokenizer_file']
tokenizer_file = tokenizer.name_or_path + tokenizer_file_name
if os.path.exists(tokenizer_file):
with open(tokenizer_file, "r", encoding='utf-8') as f:
tokenizer_data = json.load(f)
if "normalizers" in tokenizer_data["normalizer"]:
for normalizer in tokenizer_data["normalizer"]["normalizers"]:
if normalizer["type"] == "Prepend" and \
(normalizer["prepend"] == '▁' or normalizer["prepend"] == ' '):
modelInfo["tokenizer_add_dummy_prefix"] = True
if "merges" in tokenizer_data["model"]:
bpe_merges = tokenizer_data["model"]["merges"]
bpe_merges = [pair.replace(" ", "") for pair in bpe_merges]
merges = builtins.dict(zip(bpe_merges, range(0, -len(bpe_merges), -1)))
if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \
and isinstance(tokenizer._tokenizer.decoder, ByteLevel):
modelInfo["tokenizer_byte_as_char"] = True
else:
if hasattr(tokenizer, "byte_encoder") and hasattr(tokenizer, "byte_decoder"):
modelInfo["tokenizer_byte_as_char"] = True
if not hasattr(tokenizer, "add_prefix_space") or not getattr(tokenizer, "add_prefix_space", True):
modelInfo["tokenizer_add_dummy_prefix"] = False

peft_config = {}
active_adapter = ""
@@ -124,6 +144,7 @@ def create(model,
# in transformers >= 4.33.0, active_adapter is a function in model, ignore it now
active_adapter = model.active_adapter

weight_type_dict = {}
model = model.cpu();
dict = model.state_dict()

@@ -132,6 +153,7 @@ def create(model,
lm_head = dict['lm_head.weight'].to(torch.float32)
dict['lm_head.weight'] = torch.nn.functional.normalize(lm_head).to(torch.float16)
model.load_state_dict(dict)
weight_type_dict['lm_head.weight'] = "linear"

model_type = modelInfo["model_type"];
model_handle = llm.fastllm_lib.create_empty_llm_model(model_type.encode());
@@ -158,17 +180,8 @@ def create(model,
llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, tokenizer.sp_model.id_to_piece(i).encode(),
i, ctypes.c_float(tokenizer.sp_model.get_score(i)));
else:
merges = {}
if hasattr(tokenizer, "bpe_ranks"):
merges = {("".join(bpe_tokens), token_index) for bpe_tokens, token_index in sorted(tokenizer.bpe_ranks.items(), key=lambda kv: kv[1])}
elif isinstance(tokenizer, PreTrainedTokenizerFast):
tokenizer_file_name = tokenizer.vocab_file if hasattr(tokenizer, "vocab_file") else tokenizer.vocab_files_names['tokenizer_file']
tokenizer_file = tokenizer.name_or_path + tokenizer_file_name
if os.path.exists(tokenizer_file):
with open(tokenizer_file, "r", encoding='utf-8') as f:
bpe_merges = json.load(f)["model"]["merges"]
bpe_merges = [pair.replace(" ", "") for pair in bpe_merges]
merges = builtins.dict(zip(bpe_merges, range(0, -len(bpe_merges), -1)))
vocab = tokenizer.get_vocab()
for v in vocab.keys():
score = merges[v] if v in merges else 1.0
@@ -188,7 +201,6 @@ def create(model,
special_tokens_str.encode(),
(ctypes.c_int * len(special_tokens_ids))(*special_tokens_ids));

weight_type_dict = {}
module_dict = {}
weight_bits = {}
for key, m in model.named_modules():
@@ -199,7 +211,7 @@ def create(model,
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
weight_type_dict[key + ".weight"] = "embedding"

# 2. weight
tot = 0
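
The tokenizer.json handling added above, restated as a standalone sketch (detect_dummy_prefix and load_merge_scores are hypothetical names; this assumes the "merges" entries are "a b" strings, as the exporter does):

import json

def detect_dummy_prefix(tokenizer_json_path: str) -> bool:
    # A "Prepend" normalizer inserting '▁' or ' ' is the fast-tokenizer
    # counterpart of SentencePiece's add_dummy_prefix.
    with open(tokenizer_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    normalizer = data.get("normalizer") or {}
    for n in normalizer.get("normalizers", []):
        if n.get("type") == "Prepend" and n.get("prepend") in ("▁", " "):
            return True
    return False

def load_merge_scores(tokenizer_json_path: str) -> dict:
    with open(tokenizer_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    pairs = [pair.replace(" ", "") for pair in data["model"].get("merges", [])]
    # Scores run 0, -1, -2, ... so an earlier (higher-priority) merge
    # always gets a higher score, matching the exporter's ranking.
    return dict(zip(pairs, range(0, -len(pairs), -1)))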
37 changes: 24 additions & 13 deletions tools/fastllm_pytools/torch2flm.py
@@ -169,6 +169,7 @@ def tofile(exportPath,
modelInfo["rope_scaling.type"] = rope_scaling["type"]
modelInfo["rope_scaling.factor"] = rope_scaling["factor"]

merges = {}
if tokenizer:
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
if len(tokenizer.all_special_tokens) > 0:
@@ -187,23 +188,41 @@ def tofile(exportPath,
sp_model_data = f.read()
sp_model_proto = model_pb2.ModelProto.FromString(sp_model_data)
modelInfo["tokenizer_add_dummy_prefix"] = sp_model_proto.normalizer_spec.add_dummy_prefix
modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces
if sp_model_proto.normalizer_spec.remove_extra_whitespaces:
modelInfo["tokenizer_remove_extra_whitespaces"] = True
except:
pass
elif isinstance(tokenizer, PreTrainedTokenizerFast):
modelInfo["tokenizer_add_dummy_prefix"] = False
tokenizer_file_name = tokenizer.vocab_file if hasattr(tokenizer, "vocab_file") else tokenizer.vocab_files_names['tokenizer_file']
tokenizer_file = tokenizer.name_or_path + tokenizer_file_name
if os.path.exists(tokenizer_file):
with open(tokenizer_file, "r", encoding='utf-8') as f:
tokenizer_data = json.load(f)
if "normalizers" in tokenizer_data["normalizer"]:
for normalizer in tokenizer_data["normalizer"]["normalizers"]:
if normalizer["type"] == "Prepend" and \
(normalizer["prepend"] == '▁' or normalizer["prepend"] == ' '):
modelInfo["tokenizer_add_dummy_prefix"] = True
if "merges" in tokenizer_data["model"]:
bpe_merges = tokenizer_data["model"]["merges"]
bpe_merges = [pair.replace(" ", "") for pair in bpe_merges]
merges = builtins.dict(zip(bpe_merges, range(0, -len(bpe_merges), -1)))
if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \
and isinstance(tokenizer._tokenizer.decoder, ByteLevel):
modelInfo["tokenizer_byte_as_char"] = True
else:
if hasattr(tokenizer, "byte_encoder") and hasattr(tokenizer, "byte_decoder"):
modelInfo["tokenizer_byte_as_char"] = True
if not hasattr(tokenizer, "add_prefix_space") or not getattr(tokenizer, "add_prefix_space", True):
modelInfo["tokenizer_add_dummy_prefix"] = False

if hasattr(model, "peft_config"):
adapter_size = len(model.peft_config)
modelInfo["peft_size"] = adapter_size

fo.write(struct.pack('i', len(modelInfo)))
for it in modelInfo.keys():
for it in sorted(modelInfo.keys()):
writeKeyValue(fo, str(it), str(modelInfo[it]))

if hasattr(model, "peft_config"):
@@ -214,6 +233,8 @@ def tofile(exportPath,
for it in adapter_dict.keys():
writeKeyValue(fo, str(it), str(adapter_dict[it]))

weight_type_dict = {}
model = model.cpu();
dict = model.state_dict()

# 1. vocab
@@ -234,17 +255,8 @@ def tofile(exportPath,
fo.write(struct.pack('i', i))
fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
else:
merges = {}
if hasattr(tokenizer, "bpe_ranks"):
merges = {("".join(bpe_tokens), token_index) for bpe_tokens, token_index in sorted(tokenizer.bpe_ranks.items(), key=lambda kv: kv[1])}
elif isinstance(tokenizer, PreTrainedTokenizerFast):
tokenizer_file_name = tokenizer.vocab_file if hasattr(tokenizer, "vocab_file") else tokenizer.vocab_files_names['tokenizer_file']
tokenizer_file = tokenizer.name_or_path + tokenizer_file_name
if os.path.exists(tokenizer_file):
with open(tokenizer_file, "r", encoding='utf-8') as f:
bpe_merges = json.load(f)["model"]["merges"]
bpe_merges = [pair.replace(" ", "") for pair in bpe_merges]
merges = builtins.dict(zip(bpe_merges, range(0, -len(bpe_merges), -1)))
vocab = tokenizer.get_vocab()
fo.write(struct.pack('i', len(vocab)))
for v in vocab.keys():
@@ -267,14 +279,13 @@ def tofile(exportPath,
else:
fo.write(struct.pack('i', 0))

weight_type_dict = {}
module_dict = {}
for key, m in model.named_modules():
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
weight_type_dict[key + ".weight"] = "embedding"

# 2. weight
fo.write(struct.pack('i', len(dict)))
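
Both files read the SentencePiece normalizer flags straight from the serialized model proto; a hedged sketch of that path (read_normalizer_spec is a hypothetical name, and sentencepiece must be installed with protobuf support):

from sentencepiece import sentencepiece_model_pb2 as model_pb2

def read_normalizer_spec(sp_model_path: str):
    # add_dummy_prefix prepends '▁' to the input; remove_extra_whitespaces
    # collapses runs of spaces before encoding.
    proto = model_pb2.ModelProto()
    with open(sp_model_path, "rb") as f:
        proto.ParseFromString(f.read())
    spec = proto.normalizer_spec
    return spec.add_dummy_prefix, spec.remove_extra_whitespaces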
