From 521fbab0ac5365a177393e4c06db3645392c601a Mon Sep 17 00:00:00 2001
From: Zhenzhong1
Date: Fri, 23 Feb 2024 00:15:39 -0800
Subject: [PATCH] refine doc

---
 docs/gptq_and_awq.md          | 44 ++++++++++++++++++++++++++++++++++++
 docs/supported_models.md      | 12 +++++++++-
 scripts/python_api_example.py |  1 +
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 docs/gptq_and_awq.md

diff --git a/docs/gptq_and_awq.md b/docs/gptq_and_awq.md
new file mode 100644
index 000000000..f3eeecf2c
--- /dev/null
+++ b/docs/gptq_and_awq.md
@@ -0,0 +1,44 @@
+GPTQ & AWQ
+=======
+
+Neural Speed supports multiple weight-only quantization algorithms, such as GPTQ and AWQ.
+
+For more algorithm details, please check [GPTQ](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978).
+
+Validated GPTQ & AWQ models directly from HuggingFace:
+* [Llama-2-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ) & [Llama-2-13B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ)
+* [CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) & [CodeLlama-13B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GPTQ)
+* [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ)
+* [Llama-2-7B-AWQ](https://huggingface.co/TheBloke/Llama-2-7B-AWQ) & [Llama-2-13B-chat-AWQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ)
+* [CodeLlama-7B-AWQ](https://huggingface.co/TheBloke/CodeLlama-7B-AWQ) & [CodeLlama-13B-AWQ](https://huggingface.co/TheBloke/CodeLlama-13B-AWQ)
+
+More validated GPTQ & AWQ models are listed in [supported_models](./supported_models.md).
+
+## Examples
+
+How to run GPTQ or AWQ models in Neural Speed:
+```python
+import sys
+from transformers import AutoTokenizer, TextStreamer
+from neural_speed import Model
+
+if len(sys.argv) != 2:
+    print("Usage: python python_api_example.py model_path")
+    sys.exit(1)
+model_name = sys.argv[1]
+
+prompt = "Once upon a time, a little girl"
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+
+model = Model()
+# Inference with GPTQ models.
+model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_gptq=True)
+# Inference with AWQ models.
+# model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_awq=True)
+
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)
+```
+
+Note: we provide a [script](../scripts/python_api_example.py) to run these models.
diff --git a/docs/supported_models.md b/docs/supported_models.md
index 59272dd8b..cab1d1f38 100644
--- a/docs/supported_models.md
+++ b/docs/supported_models.md
@@ -43,6 +43,16 @@ Neural Speed supports the following models:
       <td>✅</td>
       <td>Latest</td>
     </tr>
+    <tr>
+      <td>CodeLlama-7b</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>Latest</td>
+    </tr>
     <tr>
       <td>Solar-10.7B</td>
       <td>✅</td>
@@ -56,7 +66,7 @@
     <tr>
       <td>GPT-J-6B</td>
       <td>✅</td>
-      <td> </td>
+      <td>✅</td>
       <td>✅</td>
       <td>✅</td>
       <td>✅</td>
diff --git a/scripts/python_api_example.py b/scripts/python_api_example.py
index bf4e573af..ce3927cb5 100644
--- a/scripts/python_api_example.py
+++ b/scripts/python_api_example.py
@@ -28,5 +28,6 @@
 streamer = TextStreamer(tokenizer)
 
 model = Model()
+# To run GPTQ or AWQ models, set use_gptq=True or use_awq=True in model.init().
 model.init(model_name, weight_dtype="int4", compute_dtype="int8")
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)
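
To smoke-test the updated script end to end, it can be pointed at one of the validated GPTQ repos listed in docs/gptq_and_awq.md. A minimal sketch of the invocation, assuming the script accepts a HuggingFace repo id (as its `AutoTokenizer.from_pretrained` call suggests) or a local path to an already-downloaded checkpoint; the model id below is one example from that list:

```bash
# Run the example script against a validated GPTQ model; the id below is
# taken from docs/gptq_and_awq.md and is resolved/downloaded at runtime.
python scripts/python_api_example.py TheBloke/Llama-2-7B-Chat-GPTQ
```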