diff --git a/examples/ipex/text-generation/README.md b/examples/ipex/text-generation/README.md
deleted file mode 100644
index ccc26e78aa..0000000000
--- a/examples/ipex/text-generation/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-
-
-## Language generation
-
-Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-generation/run_generation.py).
-
-The original generation task only supported the PyTorch eager and graph model. By calling the `IPEXModelForCausalLM` class, we can now apply ipex optimizations to the eager and graph model for generation tasks.
-
-
-Example usage:
-### Use bf16 and JIT model
-```bash
-python run_generation.py \
-    --model_name_or_path=gpt2 \
-    --bf16 \
-    --jit
-```
diff --git a/examples/ipex/text-generation/ipex_text-generation.ipynb b/examples/ipex/text-generation/ipex_text-generation.ipynb
new file mode 100644
index 0000000000..af6f72a09b
--- /dev/null
+++ b/examples/ipex/text-generation/ipex_text-generation.ipynb
@@ -0,0 +1,96 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# IPEX model for text-generation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The IPEX model replaces linear layers and some other ops. Please note that `IPEXModel` runs inference on a graph-mode model to accelerate generation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "from optimum.intel.ipex import IPEXModelForCausalLM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Framework not specified. Using pt to export the model.\n",
+      "Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.\n",
+      "/home/jiqingfe/frameworks.ai.pytorch.ipex-cpu/intel_extension_for_pytorch/frontend.py:462: UserWarning: Conv BatchNorm folding failed during the optimize process.\n",
+      "  warnings.warn(\n",
+      "/home/jiqingfe/frameworks.ai.pytorch.ipex-cpu/intel_extension_for_pytorch/frontend.py:469: UserWarning: Linear BatchNorm folding failed during the optimize process.\n",
+      "  warnings.warn(\n",
+      "/home/jiqingfe/miniconda3/envs/ipex/lib/python3.10/site-packages/transformers/modeling_utils.py:4193: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n",
+      "  warnings.warn(\n",
+      "/home/jiqingfe/miniconda3/envs/ipex/lib/python3.10/site-packages/transformers/models/gpt2/modeling_gpt2.py:801: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  if batch_size <= 0:\n",
+      "Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. 
An error will be raised in a future version of Optimum if `library_name` is not provided.\n", + "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n", + "access to the `model_dtype` attribute is deprecated and will be removed after v1.18.0, please use `_dtype` instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet? Yes, you can.\n", + "\n", + "Yes, I can write Haikus in one tweet. I have no idea how to do that, but I'm sure\n" + ] + } + ], + "source": [ + "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", + "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n", + "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n", + "generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True)\n", + "\n", + "generated_ids = model.generate(**model_inputs, **generation_kwargs)\n", + "output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n", + "print(output)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ipex", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/ipex/text-generation/run_generation.py b/examples/ipex/text-generation/run_generation.py deleted file mode 100755 index be7060408f..0000000000 --- a/examples/ipex/text-generation/run_generation.py +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# Copyright (c) 2024, INTEL CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet) -""" - - -import argparse -import logging - -import torch -from accelerate import PartialState -from accelerate.utils import set_seed -from transformers import AutoTokenizer - -from optimum.intel.ipex import IPEXModelForCausalLM - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - -MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop - - -def adjust_length_to_model(length, max_sequence_length): - if length < 0 and max_sequence_length > 0: - length = max_sequence_length - elif 0 < max_sequence_length < length: - length = max_sequence_length # No generation bigger than model size - elif length < 0: - length = MAX_LENGTH # avoid infinite loop - return length - - -def sparse_model_config(model_config): - embedding_size = None - if hasattr(model_config, "hidden_size"): - embedding_size = model_config.hidden_size - elif hasattr(model_config, "n_embed"): - embedding_size = model_config.n_embed - elif hasattr(model_config, "n_embd"): - embedding_size = model_config.n_embd - - num_head = None - if hasattr(model_config, "num_attention_heads"): - num_head = model_config.num_attention_heads - elif hasattr(model_config, "n_head"): - num_head = model_config.n_head - - if embedding_size is None or num_head is None or num_head == 0: - raise ValueError("Check the model config") - - num_embedding_size_per_head = int(embedding_size / num_head) - if hasattr(model_config, "n_layer"): - num_layer = model_config.n_layer - elif hasattr(model_config, "num_hidden_layers"): - num_layer = model_config.num_hidden_layers - else: - raise ValueError("Number of hidden layers couldn't be determined from the model config") - - return num_layer, num_head, num_embedding_size_per_head - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - help="Path to pre-trained model or shortcut name", - ) - - parser.add_argument("--prompt", type=str, default="") - parser.add_argument("--length", type=int, default=20) - parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="temperature of 1.0 has no effect, lower tend toward greedy sampling", - ) - parser.add_argument( - "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" - ) - parser.add_argument("--k", type=int, default=0) - parser.add_argument("--p", type=float, default=0.9) - - parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.") - parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.") - - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument( - "--use_cpu", - action="store_true", - help="Whether or not to use cpu. 
If set to False, " "we will use gpu/npu or mps device if available", - ) - parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.") - parser.add_argument( - "--fp16", - action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", - ) - parser.add_argument( - "--bf16", - action="store_true", - help="Whether to use bfloat 16-bit precision (through INTEL AMX or AVX_512) instead of 32-bit", - ) - parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference") - args = parser.parse_args() - - if args.fp16 and args.bf16: - raise ValueError("You can only choose one of {fp16, bf16}") - - torch_dtype = torch.float32 - if args.fp16: - torch_dtype = torch.float16 - if args.bf16: - torch_dtype = torch.bfloat16 - - # Initialize the distributed state. - distributed_state = PartialState(cpu=args.use_cpu) - - logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16 or args.bf16}") - - if args.seed is not None: - set_seed(args.seed) - - # Initialize the model and tokenizer - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - model = IPEXModelForCausalLM.from_pretrained(args.model_name_or_path, export=args.jit, torch_dtype=torch_dtype) - - # Set the model to the right device - model.to(distributed_state.device) - - max_seq_length = getattr(model.config, "max_position_embeddings", 0) - args.length = adjust_length_to_model(args.length, max_sequence_length=max_seq_length) - logger.info(args) - - prompt_text = args.prompt if args.prompt else input("Model prompt >>> ") - - prefix = args.prefix if args.prefix else args.padding_text - encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=False, return_tensors="pt") - encoded_prompt = encoded_prompt.to(distributed_state.device) - - if encoded_prompt.size()[-1] == 0: - input_ids = None - else: - input_ids = encoded_prompt - - output_sequences = model.generate( - input_ids=input_ids, - max_length=args.length + len(encoded_prompt[0]), - temperature=args.temperature, - top_k=args.k, - top_p=args.p, - repetition_penalty=args.repetition_penalty, - do_sample=True, - num_return_sequences=args.num_return_sequences, - ) - - # Remove the batch dimension when returning multiple sequences - if len(output_sequences.shape) > 2: - output_sequences.squeeze_() - - generated_sequences = [] - - for generated_sequence_idx, generated_sequence in enumerate(output_sequences): - print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===") - generated_sequence = generated_sequence.tolist() - - # Decode text - text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) - - # Remove all text after the stop token - text = text[: text.find(args.stop_token) if args.stop_token else None] - - # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing - total_sequence = ( - prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :] - ) - - generated_sequences.append(total_sequence) - print(total_sequence) - - return generated_sequences - - -if __name__ == "__main__": - main()
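Taken together, this diff replaces the `run_generation.py` CLI example (and its README) with a notebook that uses `IPEXModelForCausalLM` directly. As a reading aid only, and not part of the diff itself, here is a minimal script-style sketch of the flow the notebook now demonstrates, assembled from the notebook cell above plus the `pad_token` fallback that the deleted script used; it assumes `optimum-intel` is installed with IPEX support.

```python
import torch
from transformers import AutoTokenizer

from optimum.intel.ipex import IPEXModelForCausalLM

# Export gpt2 to an IPEX-optimized, graph-mode model in bfloat16
# (the notebook's replacement for `--bf16 --jit` in the removed script).
model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # same fallback as the deleted run_generation.py

prompt = "Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"
model_inputs = tokenizer([prompt], return_tensors="pt")

# Generation settings copied from the notebook cell.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32,
    do_sample=False,
    num_beams=4,
    num_beam_groups=1,
    no_repeat_ngram_size=2,
    use_cache=True,
)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```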