diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index 068c2b2d4c9..31e055b5bea 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -124,17 +124,27 @@ python  llama2.py
 
 Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Llama2 model (i.e. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'meta-llama/Llama-2-7b-chat-hf'`.
+- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `What is AI?`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
+- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
+- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `768`.
+
 
 #### Sample Output
 #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
 
 ```log
 Inference time: xxxx s
--------------------- Prompt --------------------
-Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun
+-------------------- Input --------------------
+<s><s> [INST] <<SYS>>
+
+<</SYS>>
+
+What is AI? [/INST]
 -------------------- Output --------------------
-<s> Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun and exciting experiences.
+<s><s> [INST] <<SYS>>
+
+<</SYS>>
 
-One day, she decided to go on a journey to find a magical land that was said to be full of wonders
+What is AI? [/INST]  AI (Artificial Intelligence) is a field of computer science and engineering that focuses on the development of intelligent machines that can perform tasks
 ```
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
index a5945aa4501..d23a6405677 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
@@ -26,6 +26,18 @@
 
 logger = logging.get_logger(__name__)
 
+def get_prompt(message: str, chat_history: list[tuple[str, str]],
+               system_prompt: str) -> str:
+    texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
+    # The first user input is _not_ stripped
+    do_strip = False
+    for user_input, response in chat_history:
+        user_input = user_input.strip() if do_strip else user_input
+        do_strip = True
+        texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
+    message = message.strip() if do_strip else message
+    texts.append(f'{message} [/INST]')
+    return ''.join(texts)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -38,9 +50,11 @@
         help="The huggingface repo id for the Llama2 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument('--prompt', type=str, default="What is AI?",
+                        help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-output-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=128)
+    parser.add_argument("--max-prompt-len", type=int, default=768)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=2)
     parser.add_argument("--inter-pp", type=int, default=2)
@@ -64,20 +78,15 @@
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
-    prompts = [
-        "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun",
-        "Once upon a time, there existed",
-        "Once upon a time, there existed a little girl who liked to have adventures.",
-    ]
+    DEFAULT_SYSTEM_PROMPT = """\
+    """
 
     print("-" * 80)
     print("done")
     with torch.inference_mode():
         print("finish to load")
         for i in range(5):
-            import random
-            idx = random.randint(0, 2)
-            prompt = prompts[idx]
+            prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
             _input_ids = tokenizer.encode(prompt, return_tensors="pt")
             print("input length:", len(_input_ids[0]))
             st = time.time()