Merge branch 'main' of github.com:vllm-project/llm-compressor into kv-cache
horheynm committed Dec 12, 2024
2 parents aecf688 + 540d4b2 commit 3f97a1a
Showing 160 changed files with 3,707 additions and 1,524 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build-and-publish-release-images.yaml
@@ -30,13 +30,13 @@ jobs:

      - name: Checkout code
        if: ${{ startsWith(github.ref, 'refs/tags/v') }}
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Get Tag
        id: extract_tag
-        run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF_NAME#*/})"
+        run: echo "tag=${GITHUB_REF_NAME#*/}" >> $GITHUB_OUTPUT

      - name: Current Version Name
        if: ${{ startsWith(github.ref, 'refs/tags/v') }}
2 changes: 1 addition & 1 deletion .github/workflows/linkcheck.yml
@@ -15,7 +15,7 @@ jobs:
  markdown-link-check:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - uses: umbrelladocs/action-linkspector@v1
        with:
          github_token: ${{ secrets.github_token }}
9 changes: 3 additions & 6 deletions .github/workflows/quality-check.yaml
@@ -1,5 +1,5 @@
name: Quality Checks
-on: 
+on:
  push:
    branches:
      - main
@@ -12,13 +12,10 @@ jobs:
  quality-check:
    runs-on: ubuntu-22.04
    steps:
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: '3.9'
-      - uses: actions/checkout@v2
-      - uses: actions/checkout@v2
-        with:
-          ref: ${{needs.test-setup.outputs.branch}}
+      - uses: actions/checkout@v4
      - name: "⚙️ Install dependencies"
        run: pip3 install .[dev]
      - name: "🧹 Running quality checks"
6 changes: 3 additions & 3 deletions .github/workflows/set-comment.yaml
@@ -1,5 +1,5 @@
name: PR Reminder Comment Bot
-on: 
+on:
  pull_request:
    branches:
      - main
@@ -10,7 +10,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Remind to add ready label
-        uses: actions/github-script@v6
+        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.createComment({
@@ -20,4 +20,4 @@ jobs:
            body: '👋 Hi! Thank you for contributing to llm-compressor. Please add the ready label when the PR is ready for review.'
          })
        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
140 changes: 58 additions & 82 deletions .github/workflows/test-check.yaml
@@ -1,151 +1,127 @@
name: Test Checks
-on: 
+on:
  push:
    branches:
      - main
      - 'release/*'
  pull_request:
    branches:
      - main
      - 'release/*'
    types: [opened, synchronize]

env:
  CADENCE: "commit"
  CLEARML_WEB_HOST: ${{ secrets.CLEARML_WEB_HOST }}
  CLEARML_API_HOST: ${{ secrets.CLEARML_API_HOST }}
  CLEARML_API_ACCESS_KEY: ${{ secrets.CLEARML_API_ACCESS_KEY }}
-  CLEARML_FILES_HOST: ${{ secrets.CLEARML_FILES_HOST }} 
-  CLEARML_API_SECRET_KEY: ${{ secrets.CLEARML_API_SECRET_KEY }} 
+  CLEARML_FILES_HOST: ${{ secrets.CLEARML_FILES_HOST }}
+  CLEARML_API_SECRET_KEY: ${{ secrets.CLEARML_API_SECRET_KEY }}

jobs:
-  test-setup:
-    runs-on: ubuntu-22.04
-    outputs:
-      branch: ${{ steps.get-branch.outputs.branch }}
-      base: ${{ steps.base-check.outputs.output }}
-      pytorch: ${{ steps.pytorch-check.outputs.output }}
-      transformers: ${{ steps.transformers-check.outputs.output }}
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
-      # TODO: for @DanH what is this supposed to be doing?
-      # The way it was being used before was only testing code on main,
-      # not on the current PR. git branch --show current does not work
-      - name: Get current branch
-        id: get-branch
-        run: >
-          (git branch --show-current | grep -E "release/")
-          && echo "::set-output name=branch::$(git branch --show-current)"
-          || echo "::set-output name=branch::main"
-
  base-tests:
    runs-on: ubuntu-22.04
-    needs: test-setup
    steps:
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
-      - uses: actions/checkout@v2
-      - uses: actions/checkout@v2
+          python-version: '3.12'
+      - uses: actions/checkout@v4
+      - name: "⚙️ Install dependencies"
+        run: pip3 install -U pip setuptools && pip3 install .[dev]
+      - uses: actions/checkout@v4
        with:
          repository: "neuralmagic/compressed-tensors"
          path: "compressed-tensors"
-          ref: ${{needs.test-setup.outputs.branch}}
      - name: "⚙️ Install compressed-tensors dependencies"
-        run: pip3 install -U pip && pip3 install setuptools compressed-tensors/
+        run: |
+          pip3 uninstall -y compressed-tensors compressed-tensors-nightly
+          pip3 install ./compressed-tensors/
      - name: "Clean compressed-tensors directory"
        run: rm -r compressed-tensors/
-      - name: "⚙️ Install dependencies"
-        run: pip3 install .[dev]
      - name: "🔬 Running base tests"
        run: make test

  pytorch-tests:
    runs-on: ubuntu-22.04
-    needs: test-setup
    steps:
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-      - uses: actions/checkout@v2
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
+      - name: "⚙️ Install dependencies"
+        run: pip3 install -U pip setuptools && pip3 install .[dev]
+      - uses: actions/checkout@v4
        with:
          repository: "neuralmagic/compressed-tensors"
          path: "compressed-tensors"
-          ref: ${{needs.test-setup.outputs.branch}}
      - name: "⚙️ Install compressed-tensors dependencies"
-        run: pip3 install -U pip && pip3 install setuptools compressed-tensors/
+        run: |
+          pip3 uninstall -y compressed-tensors compressed-tensors-nightly
+          pip3 install ./compressed-tensors/
      - name: "Clean compressed-tensors directory"
        run: rm -r compressed-tensors/
-      - name: "⚙️ Install dependencies"
-        run: pip3 install .[dev]
      - name: "🔬 Running pytorch tests"
        run: |
-          pytest tests/llmcompressor/pytorch -v
+          pytest -v tests/llmcompressor/pytorch
  compat-pytorch-1_9-pytorch-tests:
    runs-on: ubuntu-22.04
-    needs: test-setup
    steps:
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
        with:
-          python-version: '3.9'
-      - uses: actions/checkout@v2
-      - uses: actions/checkout@v2
+          python-version: '3.10'
+      - uses: actions/checkout@v4
+      - name: "⚙️ Install dependencies"
+        run: pip3 install -U pip setuptools && pip3 install .[dev]
+      - uses: actions/checkout@v4
        with:
          repository: "neuralmagic/compressed-tensors"
          path: "compressed-tensors"
-          ref: ${{needs.test-setup.outputs.branch}}
      - name: "⚙️ Install compressed-tensors dependencies"
-        run: pip3 install -U pip && pip3 install setuptools compressed-tensors/
+        run: |
+          pip3 uninstall -y compressed-tensors compressed-tensors-nightly
+          pip3 install ./compressed-tensors/
      - name: "Clean compressed-tensors directory"
        run: rm -r compressed-tensors/
-      - name: "⚙️ Install dependencies"
-        run: pip3 install .[dev]
      - name: "🔬 Running pytorch tests"
        run: |
-          pytest tests/llmcompressor/pytorch -v
+          pytest -v tests/llmcompressor/pytorch
  transformers-tests:
-    runs-on: ubuntu-22.04
-    needs: test-setup
+    runs-on: gcp-k8s-vllm-l4-solo
    steps:
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
-      - uses: actions/checkout@v2
-      - uses: actions/checkout@v2
+          python-version: '3.9'
+      - uses: actions/checkout@v4
+      - name: "⚙️ Install dependencies"
+        run: pip3 install -U pip setuptools && pip3 install .[dev]
+      - uses: actions/checkout@v4
        with:
          repository: "neuralmagic/compressed-tensors"
          path: "compressed-tensors"
-          ref: ${{needs.test-setup.outputs.branch}}
      - name: "⚙️ Install compressed-tensors dependencies"
-        run: pip3 install -U pip && pip3 install setuptools compressed-tensors/
+        id: install
+        run: |
+          pip3 uninstall -y compressed-tensors compressed-tensors-nightly
+          pip3 install ./compressed-tensors/
      - name: "Clean compressed-tensors directory"
        run: rm -r compressed-tensors/
-      - name: "⚙️ Install dependencies"
-        id: install
-        run: pip3 install .[dev]
      - name: "🔬 Running transformers tests"
-        if: always() && steps.install.outcome == 'success'
+        if: (success() || failure()) && steps.install.outcome == 'success'
        run: |
-          pytest tests/llmcompressor/transformers/compression -v
+          pytest -v tests/llmcompressor/transformers/compression
      - name: Run Finetune Tests
-        if: always() && steps.install.outcome == 'success'
+        if: (success() || failure()) && steps.install.outcome == 'success'
        run: |
-          pytest -v tests/llmcompressor/transformers/finetune -m unit
+          pytest -v tests/llmcompressor/transformers/finetune
      - name: Running GPTQ Tests
-        if: always() && steps.install.outcome == 'success'
+        if: (success() || failure()) && steps.install.outcome == 'success'
        run: |
-          pytest tests/llmcompressor/transformers/gptq -v
+          pytest -v tests/llmcompressor/transformers/gptq
      - name: Running ONESHOT Tests
-        if: always() && steps.install.outcome == 'success'
+        if: (success() || failure()) && steps.install.outcome == 'success'
        run: |
-          pytest tests/llmcompressor/transformers/oneshot -v
+          pytest -v tests/llmcompressor/transformers/oneshot
      - name: Running Sparsification Tests
-        if: always() && steps.install.outcome == 'success'
+        if: (success() || failure()) && steps.install.outcome == 'success'
        run: |
          pytest tests/llmcompressor/transformers/sparsification -v
-          ptyest tests/llmcompressor/transformers/test_clear_ml.py -v
+          pytest tests/llmcompressor/transformers/test_clear_ml.py -v
      - name: Running OBCQ Tests
-        if: always() && steps.install.outcome == 'success'
+        if: (success() || failure()) && steps.install.outcome == 'success'
        run: |
-          pytest -v tests/llmcompressor/transformers/obcq -v
+          pytest -v tests/llmcompressor/transformers/obcq
3 changes: 2 additions & 1 deletion README.md
@@ -9,7 +9,7 @@
**✨ Read the announcement blog [here](https://neuralmagic.com/blog/llm-compressor-is-here-faster-inference-with-vllm/)! ✨**

<p align="center">
-  <img alt="LLM Compressor Flow" src="https://github.com/user-attachments/assets/91c1f391-8c9a-4b20-80c2-20ffb9ad78b4" width="80%" />
+  <img alt="LLM Compressor Flow" src="https://github.com/user-attachments/assets/adf07594-6487-48ae-af62-d9555046d51b" width="80%" />
</p>

### Supported Formats
@@ -57,6 +57,7 @@ Quantization is applied by selecting an algorithm and calling the `oneshot` API.
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot
+from transformers import AutoModelForCausalLM

# Select quantization algorithm. In this case, we:
# * apply SmoothQuant to make the activations easier to quantize
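For context, a minimal sketch of how the imports in this hunk come together in the `oneshot` flow described above; the model ID, dataset, and modifier arguments below are illustrative assumptions, not contents of this diff:

```python
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot
from transformers import AutoModelForCausalLM

# Illustrative model; any Hugging Face causal LM works here
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)

# SmoothQuant eases activation quantization; GPTQ then quantizes to W8A8
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

# One-shot calibration pass over a small dataset (names are assumptions)
oneshot(
    model=model,
    dataset="open_platypus",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```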
13 changes: 13 additions & 0 deletions examples/automodelforcausallm/README.md
@@ -0,0 +1,13 @@
+# Loading models using `AutoModelForCausalLM`
+
+Models quantized through `llm-compressor` can be loaded directly through
+`AutoModelForCausalLM`. Note: this requires `transformers>=v4.45.0` and
+`compressed-tensors>v0.6.0`.
+
+```python
+from transformers import AutoModelForCausalLM
+
+MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
+
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
+```
11 changes: 11 additions & 0 deletions examples/automodelforcausallm/run_automodelforcausallm.py
@@ -0,0 +1,11 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
+
+# Use the AutoModelForCausalLM to run the model
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
23 changes: 9 additions & 14 deletions examples/big_models_with_accelerate/README.md
@@ -14,13 +14,13 @@
To enable `accelerate` features with `llmcompressor`, simply insert `device_map` in `from_pretrained` during model load.

```python
-from llmcompressor.transformers import SparseAutoModelForCausalLM
+from transformers import AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# device_map="auto" triggers usage of accelerate
# if > 1 GPU, the model will be sharded across the GPUs
# if not enough GPU memory to fit the model, parameters are offloaded to the CPU
-model = SparseAutoModelForCausalLM.from_pretrained(
+model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto")
```

@@ -29,17 +29,17 @@ will work properly out of the box for basic quantization with `QuantizationModifier`
even for CPU offloaded models.

To enable CPU offloading for second-order quantization methods such as GPTQ, we need to
-allocate additional memory upfront when computing the device map. Note that this
-device map will only compatible with `GPTQModifier(sequential_update=True, ...)`
+allocate additional memory upfront when computing the device map. Not doing so risks
+potentially going out-of-memory.

```python
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
-from llmcompressor.transformers import SparseAutoModelForCausalLM,
+from transformers import AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# Load model, reserving memory in the device map for sequential GPTQ (adjust num_gpus as needed)
device_map = calculate_offload_device_map(MODEL_ID, reserve_for_hessians=True, num_gpus=1)
-model = SparseAutoModelForCausalLM.from_pretrained(
+model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=device_map,
    torch_dtype="auto",
@@ -48,12 +48,7 @@ model = SparseAutoModelForCausalLM.from_pretrained(

### Practical Advice

-When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme as methods that require many forward passes though the model will be slowed down.
-
-General rules of thumb:
-- CPU offloading is best used with data-free quantization methods (e.g. PTQ with `FP8_DYNAMIC`)
-- Multi-GPU is fast enough to be used with calibration data-based methods with `sequential_update=False`
-- It is possible to use Multi-GPU with `sequential_update=True` to save GPU memory, but the runtime will be slower
+When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme, as methods that require many forward passes through the model will be slowed down. If more GPU memory is not available, consider reducing the precision of the loaded model to a lower-width dtype such as `torch.bfloat16`.
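As a rough illustration of that last suggestion, a minimal sketch (the model ID is reused from the examples above; `bfloat16` support on the target GPU is an assumption):

```python
import torch
from transformers import AutoModelForCausalLM

# Loading weights in bfloat16 roughly halves memory versus float32,
# which may free enough GPU memory to avoid CPU offloading entirely
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-70B-Instruct",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
```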

## Examples

@@ -66,7 +61,7 @@ We will show working examples for each use case:
Install `llmcompressor`:

```bash
-pip install llmcompressor==0.1.0
+pip install llmcompressor
```

### CPU Offloading: `FP8` Quantization with `PTQ`
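The body of this example is collapsed in this view. As a hint of what it covers, here is a minimal sketch of data-free `FP8_DYNAMIC` PTQ with the `oneshot` API; the recipe, model ID, and output path are assumptions based on the surrounding README, not the collapsed file contents:

```python
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot
from transformers import AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# device_map="auto" shards across GPUs and offloads the rest to CPU
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)

# FP8_DYNAMIC is data-free, so no calibration passes are needed,
# which suits CPU-offloaded models with slow forward passes
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

oneshot(model=model, recipe=recipe, output_dir="Meta-Llama-3-70B-Instruct-FP8-Dynamic")
```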
@@ -99,4 +94,4 @@ The resulting model `./Meta-Llama-3-70B-Instruct-INT8-Dynamic` is quantized and

## Questions or Feature Request?

-Please open up an issue on `vllm-project/llm-compressor` 
+Please open up an issue on `vllm-project/llm-compressor`