Merge branch 'huggingface:main' into main
Quentin-Anthony authored Oct 4, 2024
2 parents cdbd690 + 614660f commit 6fabb6a
Showing 68 changed files with 717 additions and 345 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/self-pr-slow-ci.yml
@@ -94,7 +94,7 @@ jobs:

   - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
     working-directory: /transformers
-    run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+    run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . && python3 -m pip install --upgrade torch torchaudio torchvision

   - name: NVIDIA-SMI
     run: |
2 changes: 1 addition & 1 deletion docs/source/en/_config.py
@@ -11,4 +11,4 @@
"{processor_class}": "FakeProcessorClass",
"{model_class}": "FakeModelClass",
"{object_class}": "FakeObjectClass",
}
127 changes: 126 additions & 1 deletion docs/source/en/chat_templating.md
@@ -962,4 +962,129 @@ tokenizer.chat_template = open("template.jinja").read()

As an added bonus, when you write a long, multi-line template in a separate file, line numbers in that file will
exactly correspond to line numbers in template parsing or execution errors. This will make it much easier to
identify the source of issues.

### Writing templates for tools

Although chat templates do not enforce a specific API for tools (or for anything, really), we recommend
template authors try to stick to a standard API where possible. The whole point of chat templates is to allow code
to be transferable across models, so deviating from the standard tools API means users will have to write
custom code to use tools with your model. Sometimes it's unavoidable, but often with clever templating you can
make the standard API work!

Below, we'll list the elements of the standard API, and give tips on writing templates that will work well with it.

#### Tool definitions

Your template should expect that the variable `tools` will either be null (if no tools are passed) or a list
of JSON schema dicts. Our chat template methods allow users to pass tools as either JSON schemas or Python functions, but when
functions are passed, we automatically generate JSON schema for them and pass that to your template. As a result, the
`tools` variable that your template receives will always be a list of JSON schemas. Here is
a sample tool JSON schema:

```json
{
  "type": "function",
  "function": {
    "name": "multiply",
    "description": "A function that multiplies two numbers",
    "parameters": {
      "type": "object",
      "properties": {
        "a": {
          "type": "number",
          "description": "The first number to multiply"
        },
        "b": {
          "type": "number",
          "description": "The second number to multiply"
        }
      },
      "required": ["a", "b"]
    }
  }
}
```
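
Rather than writing schemas like this by hand, you can let `transformers` generate them from Python functions. The sketch below is a minimal example, assuming a `transformers` release recent enough to ship the `get_json_schema` utility; it reconstructs the schema above from type hints and a docstring:

```python
from transformers.utils import get_json_schema

def multiply(a: float, b: float):
    """
    A function that multiplies two numbers

    Args:
        a: The first number to multiply
        b: The second number to multiply
    """
    return a * b

# Produces a JSON schema dict equivalent to the example above
print(get_json_schema(multiply))
```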

And here is some example code for handling tools in your chat template. Remember, this is just an example for a
specific format - your model will probably need different formatting!

```text
{%- if tools %}
    {%- for tool in tools %}
        {{- '<tool>' + tool['function']['name'] + '\n' }}
        {%- for argument in tool['function']['parameters']['properties'] %}
            {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }}
        {%- endfor %}
        {{- '\n</tool>' }}
    {%- endfor %}
{%- endif %}
```
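
When iterating on a snippet like this, it can help to render it locally before attaching it to a tokenizer. This is a minimal sketch using plain `jinja2` (transformers renders templates through a sandboxed Jinja2 environment internally, but the syntax of a snippet like this is identical):

```python
import jinja2

# The same schema as the example above
tool_schema = {
    "type": "function",
    "function": {
        "name": "multiply",
        "description": "A function that multiplies two numbers",
        "parameters": {
            "type": "object",
            "properties": {
                "a": {"type": "number", "description": "The first number to multiply"},
                "b": {"type": "number", "description": "The second number to multiply"},
            },
            "required": ["a", "b"],
        },
    },
}

# A raw string, so that '\n' reaches Jinja as an escape sequence, exactly as written above
template = jinja2.Environment().from_string(r"""
{%- if tools %}
    {%- for tool in tools %}
        {{- '<tool>' + tool['function']['name'] + '\n' }}
        {%- for argument in tool['function']['parameters']['properties'] %}
            {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }}
        {%- endfor %}
        {{- '\n</tool>' }}
    {%- endfor %}
{%- endif %}
""")
print(template.render(tools=[tool_schema]))
```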

The specific tokens and tool descriptions your template renders should of course be chosen to match the ones your model
was trained with. There is no requirement that your **model** understands JSON schema input, only that your template can translate
JSON schema into your model's format. For example, [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024)
was trained with tools defined using Python function headers, but the Command-R tool template accepts JSON schema,
converts types internally and renders the input tools as Python headers. You can do a lot with templates!

#### Tool calls

Tool calls, if present, will be a list attached to a message with the "assistant" role. Note that `tool_calls` is
always a list, even though most tool-calling models only support single tool calls at a time, which means
the list will usually only have a single element. Here is a sample message dict containing a tool call:

```json
{
  "role": "assistant",
  "tool_calls": [
    {
      "type": "function",
      "function": {
        "name": "multiply",
        "arguments": {
          "a": 5,
          "b": 6
        }
      }
    }
  ]
}
```

And a common pattern for handling them would be something like this:

```text
{%- if message['role'] == 'assistant' and 'tool_calls' in message %}
    {%- for tool_call in message['tool_calls'] %}
        {{- '<tool_call>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n</tool_call>' }}
    {%- endfor %}
{%- endif %}
```
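
Note that `arguments` reaches the template as a dict rather than a pre-serialized string; the `tojson` filter is what turns it into JSON text. As a sketch (again using plain `jinja2` rather than transformers' internal rendering path), you can verify what this pattern emits:

```python
import jinja2

message = {
    "role": "assistant",
    "tool_calls": [
        {"type": "function", "function": {"name": "multiply", "arguments": {"a": 5, "b": 6}}}
    ],
}

template = jinja2.Environment().from_string(r"""
{%- if message['role'] == 'assistant' and 'tool_calls' in message %}
    {%- for tool_call in message['tool_calls'] %}
        {{- '<tool_call>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n</tool_call>' }}
    {%- endfor %}
{%- endif %}
""")

# Prints:
# <tool_call>multiply
# {"a": 5, "b": 6}
# </tool_call>
print(template.render(message=message))
```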

Again, you should render the tool call with the formatting and special tokens that your model expects.

#### Tool responses

Tool responses have a simple format: they are a message dict with the "tool" role, a "name" key giving the name
of the called function, and a "content" key containing the result of the tool call. Here is a sample tool response:

```json
{
  "role": "tool",
  "name": "multiply",
  "content": "30"
}
```

You don't need to use all of the keys in the tool response. For example, if your model doesn't expect the function
name to be included in the tool response, then rendering it can be as simple as:

```text
{%- if message['role'] == 'tool' %}
    {{- "<tool_result>" + message['content'] + "</tool_result>" }}
{%- endif %}
```
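
Putting the pieces together, here is a sketch of a full tool-use exchange from the user's side. The checkpoint name is a placeholder, not a real model; substitute any chat model whose template implements the conventions above:

```python
from transformers import AutoTokenizer

# Placeholder checkpoint id; use any model whose chat template supports tools
tokenizer = AutoTokenizer.from_pretrained("your-org/your-tool-model")

messages = [
    {"role": "user", "content": "What is 5 times 6?"},
    {
        "role": "assistant",
        "tool_calls": [
            {"type": "function", "function": {"name": "multiply", "arguments": {"a": 5, "b": 6}}}
        ],
    },
    {"role": "tool", "name": "multiply", "content": "30"},
]

# Renders the conversation, including the tool call and its result,
# in whatever format the model's template defines
print(tokenizer.apply_chat_template(messages, tokenize=False))
```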

Again, remember that the actual formatting and special tokens are model-specific - you should take a lot of care
to ensure that tokens, whitespace and everything else exactly match the format your model was trained with!
19 changes: 10 additions & 9 deletions docs/source/en/perf_train_cpu_many.md
@@ -138,16 +138,16 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an
## Usage with Kubernetes

The same distributed training job from the previous section can be deployed to a Kubernetes cluster using the
-[Kubeflow PyTorchJob training operator](https://www.kubeflow.org/docs/components/training/pytorch/).
+[Kubeflow PyTorchJob training operator](https://www.kubeflow.org/docs/components/training/user-guides/pytorch).

### Setup

This example assumes that you have:
-* Access to a Kubernetes cluster with [Kubeflow installed](https://www.kubeflow.org/docs/started/installing-kubeflow/)
-* [`kubectl`](https://kubernetes.io/docs/tasks/tools/) installed and configured to access the Kubernetes cluster
-* A [Persistent Volume Claim (PVC)](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) that can be used
+* Access to a Kubernetes cluster with [Kubeflow installed](https://www.kubeflow.org/docs/started/installing-kubeflow)
+* [`kubectl`](https://kubernetes.io/docs/tasks/tools) installed and configured to access the Kubernetes cluster
+* A [Persistent Volume Claim (PVC)](https://kubernetes.io/docs/concepts/storage/persistent-volumes) that can be used
  to store datasets and model files. There are multiple options for setting up the PVC including using an NFS
- [storage class](https://kubernetes.io/docs/concepts/storage/storage-classes/) or a cloud storage bucket.
+ [storage class](https://kubernetes.io/docs/concepts/storage/storage-classes) or a cloud storage bucket.
* A Docker container that includes your model training script and all the dependencies needed to run the script. For
distributed CPU training jobs, this typically includes PyTorch, Transformers, Intel Extension for PyTorch, Intel
oneCCL Bindings for PyTorch, and OpenSSH to communicate between the containers.
@@ -176,7 +176,7 @@ PyTorchJob to the cluster.

### PyTorchJob Specification File

-The [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/) is used to run the distributed
+The [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/user-guides/pytorch) is used to run the distributed
training job on the cluster. The yaml file for the PyTorchJob defines parameters such as:
* The name of the PyTorchJob
* The number of replicas (workers)
@@ -273,12 +273,13 @@ To run this example, update the yaml based on your training script and the nodes
<Tip>
-The CPU resource limits/requests in the yaml are defined in [cpu units](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu)
+The CPU resource limits/requests in the yaml are defined in
+[cpu units](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu)
where 1 CPU unit is equivalent to 1 physical CPU core or 1 virtual core (depending on whether the node is a physical
host or a VM). The amount of CPU and memory limits/requests defined in the yaml should be less than the amount of
available CPU/memory capacity on a single machine. It is usually a good idea to not use the entire machine's capacity in
order to leave some resources for the kubelet and OS. In order to get ["guaranteed"](https://kubernetes.io/docs/concepts/workloads/pods/pod-qos/#guaranteed)
-[quality of service](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/) for the worker pods,
+[quality of service](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod) for the worker pods,
set the same CPU and memory amounts for both the resource limits and requests.
</Tip>
@@ -318,4 +319,4 @@ with the job, the PyTorchJob resource can be deleted from the cluster using `kub

This guide covered running distributed PyTorch training jobs using multiple CPUs on bare metal and on a Kubernetes
cluster. Both cases utilize Intel Extension for PyTorch and Intel oneCCL Bindings for PyTorch for optimal training
performance, and can be used as a template to run your own workload on multiple nodes.
41 changes: 41 additions & 0 deletions docs/source/en/quantization/awq.md
@@ -230,3 +230,44 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
Note this feature is supported on AMD GPUs.

</Tip>


## CPU support

Recent versions of `autoawq` support CPU inference with IPEX op optimizations. To get started, install Intel Extension for PyTorch and the latest version of `autoawq` by running:

```bash
pip install intel-extension-for-pytorch
pip install git+https://github.com/casper-hansen/AutoAWQ.git
```

Get started by passing an `AwqConfig()` with `version="ipex"`.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig

quantization_config = AwqConfig(version="ipex")

model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
    quantization_config=quantization_config,
    device_map="cpu",
)

input_ids = torch.randint(0, 100, (1, 128), dtype=torch.long, device="cpu")
output = model(input_ids)
print(output.logits)

tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ")
input_ids = tokenizer.encode("How to make a cake", return_tensors="pt")
pad_token_id = tokenizer.eos_token_id
output = model.generate(input_ids, do_sample=True, max_length=50, pad_token_id=pad_token_id)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

<Tip warning={true}>

Note this feature is supported on Intel CPUs.

</Tip>
2 changes: 1 addition & 1 deletion examples/modular-transformers/modeling_dummy.py
@@ -882,7 +882,7 @@ def forward(

    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError(
-           "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+           "You must specify exactly one of input_ids or inputs_embeds"
        )

    if self.gradient_checkpointing and self.training and use_cache:
2 changes: 1 addition & 1 deletion examples/modular-transformers/modeling_my_new_model2.py
@@ -759,7 +759,7 @@ def forward(

    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError(
-           "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+           "You must specify exactly one of input_ids or inputs_embeds"
        )

    if self.gradient_checkpointing and self.training and use_cache:
2 changes: 1 addition & 1 deletion i18n/README_ru.md
@@ -77,7 +77,7 @@ limitations under the License.

## Онлайн демонстрация

-Большинство наших моделей можно протестировать непосредственно на их страницах с [сайта](https://huggingface.co/models). Мы также предлагаем [привтаный хостинг моделей, контроль версий и API для выводов](https://huggingface.co/pricing) для публичных и частных моделей.
+Большинство наших моделей можно протестировать непосредственно на их страницах с [сайта](https://huggingface.co/models). Мы также предлагаем [приватный хостинг моделей, контроль версий и API для выводов](https://huggingface.co/pricing) для публичных и частных моделей.

Вот несколько примеров:

2 changes: 2 additions & 0 deletions src/transformers/integrations/__init__.py
@@ -21,6 +21,7 @@
    "awq": [
        "fuse_awq_modules",
        "post_init_awq_exllama_modules",
+       "post_init_awq_ipex_modules",
        "replace_quantization_scales",
        "replace_with_awq_linear",
    ],
@@ -115,6 +116,7 @@
    from .awq import (
        fuse_awq_modules,
        post_init_awq_exllama_modules,
+       post_init_awq_ipex_modules,
        replace_quantization_scales,
        replace_with_awq_linear,
    )
29 changes: 26 additions & 3 deletions src/transformers/integrations/awq.py
@@ -145,6 +145,10 @@ def replace_with_awq_linear(
            target_cls = WQLinear_ExllamaV2
        else:
            raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}")
+   elif quantization_config.version == AWQLinearVersion.IPEX:
+       from awq.modules.linear.gemm_ipex import WQLinear_IPEX
+
+       target_cls = WQLinear_IPEX
    else:
        raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}")
else:
@@ -266,8 +270,11 @@ def fuse_awq_modules(model, quantization_config):
    # Replace layer norms
    _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm)

-   # Replace MLP layers
-   _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP)
+   # Replace MLP layers if awq version is not ipex.
+   if quantization_config.version != "ipex":
+       _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP)
+   else:
+       logger.info("The IPEX version AWQ does not support fuse mlp for now.")

    # Replace attention layers
    attention_has_been_fused = _fuse_awq_attention_layers(
@@ -372,7 +379,7 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na
        The `QuantAttentionFused` class as it only supports that class
        for now.
    """
-   from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
+   from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV, WQLinear_IPEX

    module_has_been_fused = False

@@ -389,6 +396,9 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na
    elif isinstance(q_proj, WQLinear_GEMM):
        linear_target_cls = WQLinear_GEMM
        cat_dim = 1
+   elif isinstance(q_proj, WQLinear_IPEX):
+       linear_target_cls = WQLinear_IPEX
+       cat_dim = 1
    else:
        raise ValueError(f"Unsupported q_proj type: {type(q_proj)}")

@@ -466,3 +476,16 @@ def post_init_awq_exllama_modules(model, exllama_config):
        raise ValueError(f"Unrecognized Exllama version: {exllama_config['version']}")

    return model


def post_init_awq_ipex_modules(model):
    """
    Runs post init for IPEX layers which performs:
    - Weights packing, reordering and repacking
    """

    from awq.modules.linear.gemm_ipex import ipex_post_init

    model = ipex_post_init(model)

    return model
