This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

[LLM Runtime] Support loading models from HF directly. (#93)
Zhenzhong1 authored Jan 25, 2024
1 parent 66cb9f5 commit bb80273
Showing 4 changed files with 29 additions and 12 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -41,7 +41,6 @@ streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
>**Note**: For llama2/ mistral/ neural_chat/ codellama/ magicoder/ chatglmv1/v2/ baichuan models, we can only support the local path to model for now.
GGUF format HF model
```python
from transformers import AutoTokenizer, TextStreamer
@@ -62,7 +61,9 @@ model = AutoModelForCausalLM.from_pretrained(model_name, model_file = model_file
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
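The collapsed lines above hide part of this GGUF example, including the exact `AutoModelForCausalLM` import. Below is a minimal self-contained sketch of the same pattern; the import path (via ITREX), the GGUF repository, the file name, and the tokenizer id are illustrative assumptions, not the README's original values.

```python
from transformers import AutoTokenizer, TextStreamer
# Assumed import: the real import line sits in the collapsed part of the diff.
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

# Illustrative ids: a GGUF weight repo on the HF Hub plus the matching tokenizer.
model_name = "TheBloke/Llama-2-7B-Chat-GGUF"
model_file = "llama-2-7b-chat.Q4_0.gguf"
tokenizer_name = "meta-llama/Llama-2-7b-chat-hf"

prompt = "Once upon a time, there existed a little girl,"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

# model_file selects one specific GGUF file inside the Hub repository.
model = AutoModelForCausalLM.from_pretrained(model_name, model_file=model_file)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```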

You can also use [Transformer-based API](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/weightonlyquant.md#llm-runtime-example-code) in [ITREX(intel extension for transformers)](https://github.com/intel/intel-extension-for-transformers), but you need to install Intel Extension for Transformers.
Please refer to [this link](./docs/supported_models.md) to check the supported models.

If you want to use the [Transformer-based API](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/weightonlyquant.md#llm-runtime-example-code) in [ITREX (Intel Extension for Transformers)](https://github.com/intel/intel-extension-for-transformers), please refer to the [ITREX Installation Page](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/installation.md).

### 2. llama.cpp-like usage:

8 changes: 4 additions & 4 deletions neural_speed/convert/convert_baichuan.py
@@ -165,7 +165,8 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1))

vocab = load_vocab_for_baichuan(Path(dir_model))
tokenizer_path = Path(tokenizer.vocab_file).parent
vocab = load_vocab_for_baichuan(Path(tokenizer_path))
counter = 0
for text, score in vocab.all_tokens():
fout.write(struct.pack("i", len(text)))
@@ -230,9 +231,6 @@ def main(args_in: Optional[List[str]] = None) -> None:
dir_model = args.model.as_posix()
fname_out = args.outfile.as_posix()

with open(dir_model + '/config.json', "r", encoding="utf-8") as f:
hparams = json.load(f)

# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
@@ -244,6 +242,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True)

hparams = config.to_dict()

baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams)
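Both changes in this file follow one idea: when the model comes from the Hugging Face Hub instead of a local checkout, `config.json` and the vocab file live in the HF cache, so they are reached through the loaded `AutoConfig`/`AutoTokenizer` objects rather than through `dir_model`. A minimal sketch of that pattern, with an illustrative model id:

```python
from pathlib import Path
from transformers import AutoConfig, AutoTokenizer

model_id = "baichuan-inc/Baichuan2-13B-Chat"  # illustrative HF id or local path

config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Replaces reading dir_model + '/config.json' by hand: the loaded config
# already exposes the hyperparameters as a plain dict.
hparams = config.to_dict()

# The slow tokenizer records where its vocab file lives; its parent directory
# is the (local or cached) folder that load_vocab_for_baichuan reads from.
tokenizer_path = Path(tokenizer.vocab_file).parent
```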


15 changes: 9 additions & 6 deletions neural_speed/convert/convert_chatglm.py
@@ -19,7 +19,7 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
from sentencepiece import SentencePieceProcessor # type: ignore
import gguf

@@ -368,7 +368,9 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1))

vocab = load_vocab_for_glm2(Path(dir_model))
tokenizer_path = Path(tokenizer.vocab_file).parent
vocab = load_vocab_for_glm2(Path(tokenizer_path))

counter = 0
for text, score in vocab.all_tokens():
fout.write(struct.pack("i", len(text)))
@@ -464,7 +466,8 @@ def chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1))

vocab = load_vocab_for_glm1(Path(dir_model))
tokenizer_path = Path(tokenizer.vocab_file).parent
vocab = load_vocab_for_glm1(Path(tokenizer_path))
counter = 0
for text, score in vocab.all_tokens():
fout.write(struct.pack("i", len(text)))
@@ -534,19 +537,19 @@ def main(args_in: Optional[List[str]] = None) -> None:
dir_model = args.model.as_posix()
fname_out = args.outfile.as_posix()

with open(dir_model + '/config.json', "r", encoding="utf-8") as f:
hparams = json.load(f)

# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
ftype = 0
if args.outtype == "f16":
ftype = 1

config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)

hparams = config.to_dict()

if hasattr(model.config, "multi_query_attention"):
if args.format == "GGUF":
chatglm2_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams)
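The ChatGLM converters resolve the vocab directory the same way and then hand it to `load_vocab_for_glm1`/`load_vocab_for_glm2`, which read the SentencePiece model stored next to the tokenizer files. A rough sketch of such a loader, where the helper name and the `tokenizer.model` file name are assumptions based on the usual ChatGLM layout:

```python
from pathlib import Path
from sentencepiece import SentencePieceProcessor

def load_sentencepiece_vocab(tokenizer_dir: Path) -> SentencePieceProcessor:
    """Open the SentencePiece model found in a local or HF-cache directory."""
    sp = SentencePieceProcessor()
    sp.Load(str(tokenizer_dir / "tokenizer.model"))  # assumed file name
    return sp

# Usage sketch: tokenizer_dir would be Path(tokenizer.vocab_file).parent,
# exactly as resolved in the hunks above.
```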
13 changes: 13 additions & 0 deletions neural_speed/convert/convert_llama.py
@@ -35,6 +35,7 @@
Union)
import numpy as np
from sentencepiece import SentencePieceProcessor # type: ignore
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import gguf

if TYPE_CHECKING:
@@ -1432,6 +1433,18 @@ def main(args_in: Optional[List[str]] = None) -> None:
OutputFile.write_vocab_only(outfile, vocab)
print(f"Wrote {outfile}")
else:
if Path(args.model).is_dir():
print("Loadding the model from the local path.")
model_plus = load_some_model(args.model)
else:
print("Loadding the model from HF.")
model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
cache_path = Path(tokenizer.vocab_file).parent

model_plus = load_some_model(cache_path)
args.model = cache_path

model_plus = load_some_model(args.model)
if args.dump:
do_dump_model(model_plus)
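Condensed, the new branch in `main` behaves like the hypothetical helper below: a local directory is converted as before, while a bare HF model id is first downloaded into the local HF cache, whose snapshot directory is recovered from the tokenizer's vocab file. The helper name is illustrative and not part of the commit.

```python
from pathlib import Path
from transformers import AutoModel, AutoTokenizer

def resolve_model_dir(model_arg: str) -> Path:
    """Return a local directory for either a local path or an HF model id."""
    if Path(model_arg).is_dir():
        # Already a local checkout: convert it in place.
        return Path(model_arg)
    # HF model id: downloading model and tokenizer populates the HF cache;
    # the tokenizer's vocab file points into the cached snapshot directory.
    AutoModel.from_pretrained(model_arg, low_cpu_mem_usage=True, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_arg, trust_remote_code=True)
    return Path(tokenizer.vocab_file).parent

# Usage sketch: model_plus = load_some_model(resolve_model_dir(args.model))
```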

