From 521fbab0ac5365a177393e4c06db3645392c601a Mon Sep 17 00:00:00 2001
From: Zhenzhong1
Date: Fri, 23 Feb 2024 00:15:39 -0800
Subject: [PATCH] refine doc

---
 docs/gptq_and_awq.md          | 44 ++++++++++++++++++++++++++++++++++++
 docs/supported_models.md      | 12 +++++++++-
 scripts/python_api_example.py |  1 +
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 docs/gptq_and_awq.md

diff --git a/docs/gptq_and_awq.md b/docs/gptq_and_awq.md
new file mode 100644
index 000000000..f3eeecf2c
--- /dev/null
+++ b/docs/gptq_and_awq.md
@@ -0,0 +1,44 @@
+GPTQ & AWQ
+=======
+
+Neural Speed supports multiple weight-only quantization algorithms, such as GPTQ and AWQ.
+
+For more algorithm details, please check [GPTQ](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978).
+
+Validated GPTQ & AWQ models directly from HuggingFace:
+* [Llama-2-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ) & [Llama-2-13B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ)
+* [CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) & [CodeLlama-13B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GPTQ)
+* [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ)
+* [Llama-2-7B-AWQ](https://huggingface.co/TheBloke/Llama-2-7B-AWQ) & [Llama-2-13B-chat-AWQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ)
+* [CodeLlama-7B-AWQ](https://huggingface.co/TheBloke/CodeLlama-7B-AWQ) & [CodeLlama-13B-AWQ](https://huggingface.co/TheBloke/CodeLlama-13B-AWQ)
+
+More validated GPTQ & AWQ models are listed in [supported_models](./supported_models.md).
+
+## Examples
+
+How to run GPTQ or AWQ models in Neural Speed:
+```python
+import sys
+from transformers import AutoTokenizer, TextStreamer
+from neural_speed import Model
+
+if len(sys.argv) != 2:
+    print("Usage: python python_api_example.py model_path")
+    sys.exit(1)
+model_name = sys.argv[1]
+
+prompt = "Once upon a time, a little girl"
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+
+model = Model()
+# Inference with GPTQ models.
+model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_gptq=True)
+# Inference with AWQ models.
+# model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_awq=True)
+
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)
+```
+
+Note: we provide a [script](../scripts/python_api_example.py) to run these models.
diff --git a/docs/supported_models.md b/docs/supported_models.md
index 59272dd8b..cab1d1f38 100644
--- a/docs/supported_models.md
+++ b/docs/supported_models.md
@@ -43,6 +43,16 @@ Neural Speed supports the following models:
       <td>✅</td>
       <td>Latest</td>
     </tr>
+    <tr>
+      <td>CodeLlama-7b</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>Latest</td>
+    </tr>
     <tr>
       <td>Solar-10.7B</td>
       <td>✅</td>
@@ -56,7 +66,7 @@
     <tr>
       <td>GPT-J-6B</td>
       <td>✅</td>
-      <td> </td>
+      <td>✅</td>
       <td>✅</td>
       <td>✅</td>
       <td>✅</td>
diff --git a/scripts/python_api_example.py b/scripts/python_api_example.py
index bf4e573af..ce3927cb5 100644
--- a/scripts/python_api_example.py
+++ b/scripts/python_api_example.py
@@ -28,5 +28,6 @@
 streamer = TextStreamer(tokenizer)
 
 model = Model()
+# To run GPTQ or AWQ models, set use_gptq=True or use_awq=True in model.init().
 model.init(model_name, weight_dtype="int4", compute_dtype="int8")
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)
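
To smoke-test the updated script end to end, it can be pointed at one of the validated GPTQ repos listed in docs/gptq_and_awq.md. A minimal sketch of the invocation, assuming the script accepts a HuggingFace repo id (as its `AutoTokenizer.from_pretrained` call suggests) or a local path to an already-downloaded checkpoint; the model id below is one example from that list:

```bash
# Run the example script against a validated GPTQ model; the id below is
# taken from docs/gptq_and_awq.md and is resolved/downloaded at runtime.
python scripts/python_api_example.py TheBloke/Llama-2-7B-Chat-GPTQ
```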