Skip to content

Commit

Permalink
Fix IpexLLM docs/help text ("embedding model" → "LLM"), add --save-lowbit-dir argument, and mark GPU-only quantization types
Browse files Browse the repository at this point in the history
  • Loading branch information
ivy-lv11 committed May 23, 2024
1 parent 576d7e9 commit a1e34d7
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 16 deletions.
2 changes: 1 addition & 1 deletion docs/docs/examples/llm/ipex_llm_gpu.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
"\n",
"## `IpexLLM`\n",
"\n",
"Setting `device_map=\"xpu\"` when initializing `IpexLLM` will put the embedding model on Intel GPU and benefit from IPEX-LLM optimizations:\n",
"Setting `device_map=\"xpu\"` when initializing `IpexLLM` will put the LLM on Intel GPU and benefit from IPEX-LLM optimizations. Use the proper prompt format for zephyr-7b-alpha following the [model card](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha).\n",
"\n",
"```python\n",
"# Transform a string into zephyr-specific input\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def messages_to_prompt(messages):
type=str,
default="cpu",
choices=["cpu", "xpu"],
help="The device (Intel CPU or Intel GPU) the embedding model runs on",
help="The device (Intel CPU or Intel GPU) the LLM model runs on",
)
parser.add_argument(
"--query",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def messages_to_prompt(messages):
type=str,
default="cpu",
choices=["cpu", "xpu"],
help="The device (Intel CPU or Intel GPU) the embedding model runs on",
help="The device (Intel CPU or Intel GPU) the LLM model runs on",
)
parser.add_argument(
"--query",
Expand All @@ -51,11 +51,19 @@ def messages_to_prompt(messages):
default="What is IPEX-LLM?",
help="The sentence you prefer for query the LLM",
)
parser.add_argument(
"--save-lowbit-dir",
"-s",
type=str,
default="./lowbit",
help="The directory to save the low bit model",
)

args = parser.parse_args()
model_name = args.model_name
device = args.device
query = args.query
saved_lowbit_model_path = args.save_lowbit_dir

llm = IpexLLM.from_model_id(
model_name=model_name,
Expand All @@ -68,8 +76,6 @@ def messages_to_prompt(messages):
device_map=device,
)

saved_lowbit_model_path = "./zephyr-7b-alpha-low-bit" # path to save low-bit model

llm._model.save_low_bit(saved_lowbit_model_path)
del llm

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ def completion_to_prompt(completion):
"sym_int5",
"asym_int5",
"sym_int8",
"fp4",
"fp8",
"fp16",
"bf16",
"fp8_e4m3",
"fp8_e5m2",
"nf3",
"nf4",
"fp4", # only available on GPU
"fp8", # only available on GPU
"fp16", # only available on GPU
"bf16", # only available on GPU
"fp8_e4m3", # only available on GPU
"fp8_e5m2", # only available on GPU
"nf3", # only available on GPU
"nf4", # only available on GPU
],
help="The quantization type the model will convert to.",
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def __init__(
)
if device_map not in ["cpu", "xpu"]:
raise ValueError(
"IpexLLMEmbedding currently only supports device to be 'cpu' or 'xpu', "
"IpexLLM currently only supports device to be 'cpu' or 'xpu', "
f"but you have: {device_map}."
)
if "xpu" in device_map:
Expand Down Expand Up @@ -292,7 +292,7 @@ def from_model_id(
load_in_low_bit: Optional[str] = None,
model: Optional[Any] = None,
tokenizer: Optional[Any] = None,
device_map: Optional[str] = "cpu",
device_map: Literal["cpu", "xpu"] = "cpu",
stopping_ids: Optional[List[int]] = None,
tokenizer_kwargs: Optional[dict] = None,
tokenizer_outputs_to_remove: Optional[list] = None,
Expand Down Expand Up @@ -338,7 +338,7 @@ def from_model_id_low_bit(
model_name: str = DEFAULT_HUGGINGFACE_MODEL,
model: Optional[Any] = None,
tokenizer: Optional[Any] = None,
device_map: Optional[str] = "cpu",
device_map: Literal["cpu", "xpu"] = "cpu",
stopping_ids: Optional[List[int]] = None,
tokenizer_kwargs: Optional[dict] = None,
tokenizer_outputs_to_remove: Optional[list] = None,
Expand Down

0 comments on commit a1e34d7

Please sign in to comment.