diff --git a/examples/ipex/text-generation/README.md b/examples/ipex/text-generation/README.md
deleted file mode 100644
index ccc26e78aa..0000000000
--- a/examples/ipex/text-generation/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-
-
-## Language generation
-
-Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-generation/run_generation.py).
-
-The original generation task only supported the PyTorch eager and graph model. By calling the `IPEXModelForCausalLM` class, we can now apply ipex optimizations to the eager and graph model for generation tasks.
-
-
-Example usage:
-### Use bf16 and JIT model
-```bash
-python run_generation.py \
-    --model_name_or_path=gpt2 \
-    --bf16 \
-    --jit
-```
diff --git a/examples/ipex/text-generation/ipex_text-generation.ipynb b/examples/ipex/text-generation/ipex_text-generation.ipynb
new file mode 100644
index 0000000000..af6f72a09b
--- /dev/null
+++ b/examples/ipex/text-generation/ipex_text-generation.ipynb
@@ -0,0 +1,96 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# IPEX model for text-generation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The IPEX model replaces linear layers and some other ops. Please note that `IPEXModel` runs inference on a graph-mode model to accelerate generation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "from optimum.intel.ipex import IPEXModelForCausalLM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Framework not specified. Using pt to export the model.\n",
+      "Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.\n",
+      "/home/jiqingfe/frameworks.ai.pytorch.ipex-cpu/intel_extension_for_pytorch/frontend.py:462: UserWarning: Conv BatchNorm folding failed during the optimize process.\n",
+      "  warnings.warn(\n",
+      "/home/jiqingfe/frameworks.ai.pytorch.ipex-cpu/intel_extension_for_pytorch/frontend.py:469: UserWarning: Linear BatchNorm folding failed during the optimize process.\n",
+      "  warnings.warn(\n",
+      "/home/jiqingfe/miniconda3/envs/ipex/lib/python3.10/site-packages/transformers/modeling_utils.py:4193: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n",
+      "  warnings.warn(\n",
+      "/home/jiqingfe/miniconda3/envs/ipex/lib/python3.10/site-packages/transformers/models/gpt2/modeling_gpt2.py:801: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  if batch_size <= 0:\n",
+      "Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. 
An error will be raised in a future version of Optimum if `library_name` is not provided.\n", + "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n", + "access to the `model_dtype` attribute is deprecated and will be removed after v1.18.0, please use `_dtype` instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet? Yes, you can.\n", + "\n", + "Yes, I can write Haikus in one tweet. I have no idea how to do that, but I'm sure\n" + ] + } + ], + "source": [ + "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", + "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n", + "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n", + "generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True)\n", + "\n", + "generated_ids = model.generate(**model_inputs, **generation_kwargs)\n", + "output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n", + "print(output)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ipex", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/ipex/text-generation/run_generation.py b/examples/ipex/text-generation/run_generation.py deleted file mode 100755 index be7060408f..0000000000 --- a/examples/ipex/text-generation/run_generation.py +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# Copyright (c) 2024, INTEL CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet) -""" - - -import argparse -import logging - -import torch -from accelerate import PartialState -from accelerate.utils import set_seed -from transformers import AutoTokenizer - -from optimum.intel.ipex import IPEXModelForCausalLM - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - -MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop - - -def adjust_length_to_model(length, max_sequence_length): - if length < 0 and max_sequence_length > 0: - length = max_sequence_length - elif 0 < max_sequence_length < length: - length = max_sequence_length # No generation bigger than model size - elif length < 0: - length = MAX_LENGTH # avoid infinite loop - return length - - -def sparse_model_config(model_config): - embedding_size = None - if hasattr(model_config, "hidden_size"): - embedding_size = model_config.hidden_size - elif hasattr(model_config, "n_embed"): - embedding_size = model_config.n_embed - elif hasattr(model_config, "n_embd"): - embedding_size = model_config.n_embd - - num_head = None - if hasattr(model_config, "num_attention_heads"): - num_head = model_config.num_attention_heads - elif hasattr(model_config, "n_head"): - num_head = model_config.n_head - - if embedding_size is None or num_head is None or num_head == 0: - raise ValueError("Check the model config") - - num_embedding_size_per_head = int(embedding_size / num_head) - if hasattr(model_config, "n_layer"): - num_layer = model_config.n_layer - elif hasattr(model_config, "num_hidden_layers"): - num_layer = model_config.num_hidden_layers - else: - raise ValueError("Number of hidden layers couldn't be determined from the model config") - - return num_layer, num_head, num_embedding_size_per_head - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - help="Path to pre-trained model or shortcut name", - ) - - parser.add_argument("--prompt", type=str, default="") - parser.add_argument("--length", type=int, default=20) - parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="temperature of 1.0 has no effect, lower tend toward greedy sampling", - ) - parser.add_argument( - "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" - ) - parser.add_argument("--k", type=int, default=0) - parser.add_argument("--p", type=float, default=0.9) - - parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.") - parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.") - - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument( - "--use_cpu", - action="store_true", - help="Whether or not to use cpu. 
If set to False, " "we will use gpu/npu or mps device if available", - ) - parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.") - parser.add_argument( - "--fp16", - action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", - ) - parser.add_argument( - "--bf16", - action="store_true", - help="Whether to use bfloat 16-bit precision (through INTEL AMX or AVX_512) instead of 32-bit", - ) - parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference") - args = parser.parse_args() - - if args.fp16 and args.bf16: - raise ValueError("You can only choose one of {fp16, bf16}") - - torch_dtype = torch.float32 - if args.fp16: - torch_dtype = torch.float16 - if args.bf16: - torch_dtype = torch.bfloat16 - - # Initialize the distributed state. - distributed_state = PartialState(cpu=args.use_cpu) - - logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16 or args.bf16}") - - if args.seed is not None: - set_seed(args.seed) - - # Initialize the model and tokenizer - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - model = IPEXModelForCausalLM.from_pretrained(args.model_name_or_path, export=args.jit, torch_dtype=torch_dtype) - - # Set the model to the right device - model.to(distributed_state.device) - - max_seq_length = getattr(model.config, "max_position_embeddings", 0) - args.length = adjust_length_to_model(args.length, max_sequence_length=max_seq_length) - logger.info(args) - - prompt_text = args.prompt if args.prompt else input("Model prompt >>> ") - - prefix = args.prefix if args.prefix else args.padding_text - encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=False, return_tensors="pt") - encoded_prompt = encoded_prompt.to(distributed_state.device) - - if encoded_prompt.size()[-1] == 0: - input_ids = None - else: - input_ids = encoded_prompt - - output_sequences = model.generate( - input_ids=input_ids, - max_length=args.length + len(encoded_prompt[0]), - temperature=args.temperature, - top_k=args.k, - top_p=args.p, - repetition_penalty=args.repetition_penalty, - do_sample=True, - num_return_sequences=args.num_return_sequences, - ) - - # Remove the batch dimension when returning multiple sequences - if len(output_sequences.shape) > 2: - output_sequences.squeeze_() - - generated_sequences = [] - - for generated_sequence_idx, generated_sequence in enumerate(output_sequences): - print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===") - generated_sequence = generated_sequence.tolist() - - # Decode text - text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) - - # Remove all text after the stop token - text = text[: text.find(args.stop_token) if args.stop_token else None] - - # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing - total_sequence = ( - prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :] - ) - - generated_sequences.append(total_sequence) - print(total_sequence) - - return generated_sequences - - -if __name__ == "__main__": - main()
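Taken together, this diff replaces the `run_generation.py` CLI example (and its README) with a notebook that uses `IPEXModelForCausalLM` directly. As a reading aid only, and not part of the diff itself, here is a minimal script-style sketch of the flow the notebook now demonstrates, assembled from the notebook cell above plus the `pad_token` fallback that the deleted script used; it assumes `optimum-intel` is installed with IPEX support.

```python
import torch
from transformers import AutoTokenizer

from optimum.intel.ipex import IPEXModelForCausalLM

# Export gpt2 to an IPEX-optimized, graph-mode model in bfloat16
# (the notebook's replacement for `--bf16 --jit` in the removed script).
model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # same fallback as the deleted run_generation.py

prompt = "Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"
model_inputs = tokenizer([prompt], return_tensors="pt")

# Generation settings copied from the notebook cell.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32,
    do_sample=False,
    num_beams=4,
    num_beam_groups=1,
    no_repeat_ngram_size=2,
    use_cache=True,
)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```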