diff --git a/.github/workflows/build-and-publish-release-images.yaml b/.github/workflows/build-and-publish-release-images.yaml index 416c0f735..032c3f065 100644 --- a/.github/workflows/build-and-publish-release-images.yaml +++ b/.github/workflows/build-and-publish-release-images.yaml @@ -30,13 +30,13 @@ jobs: - name: Checkout code if: ${{ startsWith(github.ref, 'refs/tags/v') }} - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 1 - name: Get Tag id: extract_tag - run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF_NAME#*/})" + run: echo "tag=${GITHUB_REF_NAME#*/}" >> $GITHUB_OUTPUT - name: Current Version Name if: ${{ startsWith(github.ref, 'refs/tags/v') }} diff --git a/.github/workflows/linkcheck.yml b/.github/workflows/linkcheck.yml index 7573b91ff..5b5319b4e 100644 --- a/.github/workflows/linkcheck.yml +++ b/.github/workflows/linkcheck.yml @@ -15,7 +15,7 @@ jobs: markdown-link-check: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: umbrelladocs/action-linkspector@v1 with: github_token: ${{ secrets.github_token }} diff --git a/.github/workflows/quality-check.yaml b/.github/workflows/quality-check.yaml index 21be8a509..2d60791c3 100644 --- a/.github/workflows/quality-check.yaml +++ b/.github/workflows/quality-check.yaml @@ -1,5 +1,5 @@ name: Quality Checks -on: +on: push: branches: - main @@ -12,13 +12,10 @@ jobs: quality-check: runs-on: ubuntu-22.04 steps: - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: '3.9' - - uses: actions/checkout@v2 - - uses: actions/checkout@v2 - with: - ref: ${{needs.test-setup.outputs.branch}} + - uses: actions/checkout@v4 - name: "โ๏ธ Install dependencies" run: pip3 install .[dev] - name: "๐งน Running quality checks" diff --git a/.github/workflows/set-comment.yaml b/.github/workflows/set-comment.yaml index 47edb6fec..16483ed54 100644 --- a/.github/workflows/set-comment.yaml +++ b/.github/workflows/set-comment.yaml @@ 
-1,5 +1,5 @@ name: PR Reminder Comment Bot -on: +on: pull_request: branches: - main @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Remind to add ready label - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | github.rest.issues.createComment({ @@ -20,4 +20,4 @@ jobs: body: '๐ Hi! Thank you for contributing to llm-compressor. Please add the ready label when the PR is ready for review.' }) env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml index 36e73f4f8..47744e8da 100644 --- a/.github/workflows/test-check.yaml +++ b/.github/workflows/test-check.yaml @@ -1,151 +1,127 @@ name: Test Checks -on: +on: push: - branches: - - main - - 'release/*' - pull_request: - branches: - - main - - 'release/*' - types: [opened, synchronize] env: CADENCE: "commit" CLEARML_WEB_HOST: ${{ secrets.CLEARML_WEB_HOST }} CLEARML_API_HOST: ${{ secrets.CLEARML_API_HOST }} CLEARML_API_ACCESS_KEY: ${{ secrets.CLEARML_API_ACCESS_KEY }} - CLEARML_FILES_HOST: ${{ secrets.CLEARML_FILES_HOST }} - CLEARML_API_SECRET_KEY: ${{ secrets.CLEARML_API_SECRET_KEY }} + CLEARML_FILES_HOST: ${{ secrets.CLEARML_FILES_HOST }} + CLEARML_API_SECRET_KEY: ${{ secrets.CLEARML_API_SECRET_KEY }} jobs: - test-setup: - runs-on: ubuntu-22.04 - outputs: - branch: ${{ steps.get-branch.outputs.branch }} - base: ${{ steps.base-check.outputs.output }} - pytorch: ${{ steps.pytorch-check.outputs.output }} - transformers: ${{ steps.transformers-check.outputs.output }} - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - # TODO: for @DanH what is this supposed to be doing? - # The way it was being used before was only testing code on main, - # not on the current PR. 
git branch --show current does not work - - name: Get current branch - id: get-branch - run: > - (git branch --show-current | grep -E "release/") - && echo "::set-output name=branch::$(git branch --show-current)" - || echo "::set-output name=branch::main" + base-tests: runs-on: ubuntu-22.04 - needs: test-setup steps: - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: - python-version: '3.11' - - uses: actions/checkout@v2 - - uses: actions/checkout@v2 + python-version: '3.12' + - uses: actions/checkout@v4 + - name: "โ๏ธ Install dependencies" + run: pip3 install -U pip setuptools && pip3 install .[dev] + - uses: actions/checkout@v4 with: repository: "neuralmagic/compressed-tensors" path: "compressed-tensors" - ref: ${{needs.test-setup.outputs.branch}} - name: "โ๏ธ Install compressed-tensors dependencies" - run: pip3 install -U pip && pip3 install setuptools compressed-tensors/ + run: | + pip3 uninstall -y compressed-tensors compressed-tensors-nightly + pip3 install ./compressed-tensors/ - name: "Clean compressed-tensors directory" run: rm -r compressed-tensors/ - - name: "โ๏ธ Install dependencies" - run: pip3 install .[dev] - name: "๐ฌ Running base tests" run: make test + pytorch-tests: runs-on: ubuntu-22.04 - needs: test-setup steps: - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: '3.11' - - uses: actions/checkout@v2 - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 + - name: "โ๏ธ Install dependencies" + run: pip3 install -U pip setuptools && pip3 install .[dev] + - uses: actions/checkout@v4 with: repository: "neuralmagic/compressed-tensors" path: "compressed-tensors" - ref: ${{needs.test-setup.outputs.branch}} - name: "โ๏ธ Install compressed-tensors dependencies" - run: pip3 install -U pip && pip3 install setuptools compressed-tensors/ + run: | + pip3 uninstall -y compressed-tensors compressed-tensors-nightly + pip3 install ./compressed-tensors/ - name: "Clean compressed-tensors directory" 
run: rm -r compressed-tensors/ - - name: "โ๏ธ Install dependencies" - run: pip3 install .[dev] - name: "๐ฌ Running pytorch tests" run: | - pytest tests/llmcompressor/pytorch -v + pytest -v tests/llmcompressor/pytorch + compat-pytorch-1_9-pytorch-tests: runs-on: ubuntu-22.04 - needs: test-setup steps: - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: - python-version: '3.9' - - uses: actions/checkout@v2 - - uses: actions/checkout@v2 + python-version: '3.10' + - uses: actions/checkout@v4 + - name: "โ๏ธ Install dependencies" + run: pip3 install -U pip setuptools && pip3 install .[dev] + - uses: actions/checkout@v4 with: repository: "neuralmagic/compressed-tensors" path: "compressed-tensors" - ref: ${{needs.test-setup.outputs.branch}} - name: "โ๏ธ Install compressed-tensors dependencies" - run: pip3 install -U pip && pip3 install setuptools compressed-tensors/ + run: | + pip3 uninstall -y compressed-tensors compressed-tensors-nightly + pip3 install ./compressed-tensors/ - name: "Clean compressed-tensors directory" run: rm -r compressed-tensors/ - - name: "โ๏ธ Install dependencies" - run: pip3 install .[dev] - name: "๐ฌ Running pytorch tests" run: | - pytest tests/llmcompressor/pytorch -v + pytest -v tests/llmcompressor/pytorch + transformers-tests: - runs-on: ubuntu-22.04 - needs: test-setup + runs-on: gcp-k8s-vllm-l4-solo steps: - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: - python-version: '3.11' - - uses: actions/checkout@v2 - - uses: actions/checkout@v2 + python-version: '3.9' + - uses: actions/checkout@v4 + - name: "โ๏ธ Install dependencies" + run: pip3 install -U pip setuptools && pip3 install .[dev] + - uses: actions/checkout@v4 with: repository: "neuralmagic/compressed-tensors" path: "compressed-tensors" - ref: ${{needs.test-setup.outputs.branch}} - name: "โ๏ธ Install compressed-tensors dependencies" - run: pip3 install -U pip && pip3 install setuptools compressed-tensors/ + id: install + run: | + pip3 
uninstall -y compressed-tensors compressed-tensors-nightly + pip3 install ./compressed-tensors/ - name: "Clean compressed-tensors directory" run: rm -r compressed-tensors/ - - name: "โ๏ธ Install dependencies" - id: install - run: pip3 install .[dev] - name: "๐ฌ Running transformers tests" - if: always() && steps.install.outcome == 'success' + if: (success() || failure()) && steps.install.outcome == 'success' run: | - pytest tests/llmcompressor/transformers/compression -v + pytest -v tests/llmcompressor/transformers/compression - name: Run Finetune Tests - if: always() && steps.install.outcome == 'success' + if: (success() || failure()) && steps.install.outcome == 'success' run: | - pytest -v tests/llmcompressor/transformers/finetune -m unit + pytest -v tests/llmcompressor/transformers/finetune - name: Running GPTQ Tests - if: always() && steps.install.outcome == 'success' + if: (success() || failure()) && steps.install.outcome == 'success' run: | - pytest tests/llmcompressor/transformers/gptq -v + pytest -v tests/llmcompressor/transformers/gptq - name: Running ONESHOT Tests - if: always() && steps.install.outcome == 'success' + if: (success() || failure()) && steps.install.outcome == 'success' run: | - pytest tests/llmcompressor/transformers/oneshot -v + pytest -v tests/llmcompressor/transformers/oneshot - name: Running Sparsification Tests - if: always() && steps.install.outcome == 'success' + if: (success() || failure()) && steps.install.outcome == 'success' run: | pytest tests/llmcompressor/transformers/sparsification -v - ptyest tests/llmcompressor/transformers/test_clear_ml.py -v + pytest tests/llmcompressor/transformers/test_clear_ml.py -v - name: Running OBCQ Tests - if: always() && steps.install.outcome == 'success' + if: (success() || failure()) && steps.install.outcome == 'success' run: | - pytest -v tests/llmcompressor/transformers/obcq -v \ No newline at end of file + pytest -v tests/llmcompressor/transformers/obcq diff --git a/README.md b/README.md 
index e4bcbdbd1..fd7f2f3e3 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ **โจ Read the announcement blog [here](https://neuralmagic.com/blog/llm-compressor-is-here-faster-inference-with-vllm/)! โจ**
- +
### Supported Formats @@ -57,6 +57,7 @@ Quantization is applied by selecting an algorithm and calling the `oneshot` API. from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier from llmcompressor.transformers import oneshot +from transformers import AutoModelForCausalLM # Select quantization algorithm. In this case, we: # * apply SmoothQuant to make the activations easier to quantize diff --git a/examples/automodelforcausallm/README.md b/examples/automodelforcausallm/README.md new file mode 100644 index 000000000..e40cb5c2a --- /dev/null +++ b/examples/automodelforcausallm/README.md @@ -0,0 +1,13 @@ +# Loading models using `AutoModelForCausalLM` + +Models quantized through `llm-compressor` can be loaded directly through +`AutoModelForCausalLM`. Note: this requires `transformers>=v4.45.0` and +`compressed-tensors>v0.6.0`. + +```python +from transformers import AutoModelForCausalLM + +MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer" + +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto") +``` diff --git a/examples/automodelforcausallm/run_automodelforcausallm.py b/examples/automodelforcausallm/run_automodelforcausallm.py new file mode 100644 index 000000000..791b4d3d5 --- /dev/null +++ b/examples/automodelforcausallm/run_automodelforcausallm.py @@ -0,0 +1,11 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer + +MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer" + +# Use the AutoModelForCausalLM to run the model +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) diff --git a/examples/big_models_with_accelerate/README.md b/examples/big_models_with_accelerate/README.md index c29d0ed39..c0e53b884 
100644 --- a/examples/big_models_with_accelerate/README.md +++ b/examples/big_models_with_accelerate/README.md @@ -14,13 +14,13 @@ To enable `accelerate` features with `llmcompressor`, simple insert `device_map` in `from_pretrained` during model load. ```python -from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" # device_map="auto" triggers usage of accelerate # if > 1 GPU, the model will be sharded across the GPUs # if not enough GPU memory to fit the model, parameters are offloaded to the CPU -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto") ``` @@ -29,17 +29,17 @@ will work properly out of the box for basic quantization with `QuantizationModif even for CPU offloaded models. To enable CPU offloading for second-order quantization methods such as GPTQ, we need to -allocate additional memory upfront when computing the device map. Note that this -device map will only compatible with `GPTQModifier(sequential_update=True, ...)` +allocate additional memory upfront when computing the device map. Not doing so risks +potentially going out-of-memory. 
```python from llmcompressor.transformers.compression.helpers import calculate_offload_device_map -from llmcompressor.transformers import SparseAutoModelForCausalLM, +from transformers import AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" # Load model, reserving memory in the device map for sequential GPTQ (adjust num_gpus as needed) device_map = calculate_offload_device_map(MODEL_ID, reserve_for_hessians=True, num_gpus=1) -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map=device_map, torch_dtype="auto", @@ -48,12 +48,7 @@ model = SparseAutoModelForCausalLM.from_pretrained( ### Practical Advice -When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme as methods that require many forward passes though the model will be slowed down. - -General rules of thumb: -- CPU offloading is best used with data-free quantization methods (e.g. PTQ with `FP8_DYNAMIC`) -- Multi-GPU is fast enough to be used with calibration data-based methods with `sequential_update=False` -- It is possible to use Multi-GPU with `sequential_update=True` to save GPU memory, but the runtime will be slower +When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme as methods that require many forward passes though the model will be slowed down. If more gpu memory is not available, consider reducing the precision of the loaded model to a lower-width dtype such as `torch.bfloat16`. 
## Examples @@ -66,7 +61,7 @@ We will show working examples for each use case: Install `llmcompressor`: ```bash -pip install llmcompressor==0.1.0 +pip install llmcompressor ``` ### CPU Offloading: `FP8` Quantization with `PTQ` @@ -99,4 +94,4 @@ The resulting model `./Meta-Llama-3-70B-Instruct-INT8-Dynamic` is quantized and ## Questions or Feature Request? -Please open up an issue on `vllm-project/llm-compressor` \ No newline at end of file +Please open up an issue on `vllm-project/llm-compressor` diff --git a/examples/big_models_with_accelerate/cpu_offloading_fp8.py b/examples/big_models_with_accelerate/cpu_offloading_fp8.py index de9275136..b5135af5c 100644 --- a/examples/big_models_with_accelerate/cpu_offloading_fp8.py +++ b/examples/big_models_with_accelerate/cpu_offloading_fp8.py @@ -1,14 +1,14 @@ -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" # Load model # Note: device_map="auto" will offload to CPU if not enough space on GPU. 
-model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True ) diff --git a/examples/big_models_with_accelerate/multi_gpu_int8_sequential_update.py b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py similarity index 81% rename from examples/big_models_with_accelerate/multi_gpu_int8_sequential_update.py rename to examples/big_models_with_accelerate/mult_gpus_int8_device_map.py index a9befa0e8..6a3123e1d 100644 --- a/examples/big_models_with_accelerate/multi_gpu_int8_sequential_update.py +++ b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py @@ -1,20 +1,22 @@ import torch from datasets import load_dataset -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot from llmcompressor.transformers.compression.helpers import calculate_offload_device_map MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407" # adjust based off number of desired GPUs +# reserve_for_hessians=True reserves memory which is required by +# GPTQModifier and SparseGPTModifier device_map = calculate_offload_device_map( - MODEL_ID, reserve_for_hessians=True, num_gpus=2, torch_dtype=torch.bfloat16 + MODEL_ID, num_gpus=2, reserve_for_hessians=True, torch_dtype=torch.bfloat16 ) -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16 ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -60,7 +62,9 @@ def tokenize(sample): recipe = [ SmoothQuantModifier(smoothing_strength=0.8), GPTQModifier( - targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=True + 
targets="Linear", + scheme="W8A8", + ignore=["lm_head"], ), ] diff --git a/examples/big_models_with_accelerate/multi_gpu_int8.py b/examples/big_models_with_accelerate/multi_gpu_int8.py index 4daf8c63e..50d0bea08 100644 --- a/examples/big_models_with_accelerate/multi_gpu_int8.py +++ b/examples/big_models_with_accelerate/multi_gpu_int8.py @@ -1,14 +1,14 @@ from datasets import load_dataset -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic" # 1) Load model (device_map="auto" with shard the model over multiple GPUs!). -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", @@ -58,14 +58,15 @@ def tokenize(sample): # 3) Configure algorithms. In this case, we: # * quantize the weights to int8 with GPTQ (static per channel) # * quantize the activations to int8 (dynamic per token) -# * run non-sequentially (for seq update, see multi_gpu_int8_sequential_update.py) recipe = [ GPTQModifier( - targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=False + targets="Linear", scheme="W8A8", ignore=["lm_head"], dampening_frac=0.1 ), ] # 4) Apply algorithms and save in `compressed-tensors` format. 
+# if you encounter GPU out-of-memory issues, consider using an explicit +# device map (see multi_gpus_int8_device_map.py) oneshot( model=model, tokenizer=tokenizer, diff --git a/examples/compressed_inference/fp8_compressed_inference.py b/examples/compressed_inference/fp8_compressed_inference.py index 202ea6125..f0d0381d2 100644 --- a/examples/compressed_inference/fp8_compressed_inference.py +++ b/examples/compressed_inference/fp8_compressed_inference.py @@ -1,13 +1,7 @@ -from transformers import AutoTokenizer - -from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoTokenizer """ -This example covers how to load a quantized model in compressed mode. By default, -SparseAutoModelForCausalLM will decompress the whole model on load resulting in no -memory savings from quantization. By setting the `run_compressed` kwarg to True, the -model will remain compressed in memory on load, saving memory during inference at the -cost of increased runtime +This example covers how to load a quantized model using AutoModelForCausalLM. During inference, each layer will be decompressed as needed before the forward pass. 
This saves memory as only a single layer is ever uncompressed at a time, but increases @@ -25,9 +19,10 @@ "def fibonacci(n):", ] -# set run_compressed=True to enable running in compressed mode -compressed_model = SparseAutoModelForCausalLM.from_pretrained( - MODEL_STUB, torch_dtype="auto", device_map="cuda:0", run_compressed=True +compressed_model = AutoModelForCausalLM.from_pretrained( + MODEL_STUB, + torch_dtype="auto", + device_map="cuda:0", ) # tokenize the sample data diff --git a/examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml similarity index 96% rename from examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml rename to examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml index b31504a5a..166e41a66 100644 --- a/examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml +++ b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml @@ -23,7 +23,6 @@ quantization_stage: run_type: oneshot quantization_modifiers: GPTQModifier: - sequential_update: true ignore: ["lm_head"] config_groups: group_0: diff --git a/examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml similarity index 96% rename from examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml rename to examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml index cc42da3f0..2ad00b457 100644 --- a/examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml +++ b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml @@ -23,7 +23,6 @@ quantization_stage: run_type: oneshot quantization_modifiers: GPTQModifier: - sequential_update: true ignore: ["lm_head"] config_groups: group_0: diff --git a/examples/quantization_24_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md similarity index 79% rename from 
examples/quantization_24_sparse_w4a16/README.md rename to examples/quantization_2of4_sparse_w4a16/README.md index 6e006d9db..7b00cf682 100644 --- a/examples/quantization_24_sparse_w4a16/README.md +++ b/examples/quantization_2of4_sparse_w4a16/README.md @@ -29,7 +29,7 @@ This example uses LLMCompressor and Compressed-Tensors to create a 2:4 sparse an The model is calibrated and trained with the ultachat200k dataset. At least 75GB of GPU memory is required to run this example. -Follow the steps below, or to run the example as `python examples/quantization_24_sparse_w4a16/llama7b_sparse_w4a16.py` +Follow the steps below, or to run the example as `python examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py` ## Step 1: Select a model, dataset, and recipe In this step, we select which model to use as a baseline for sparsification, a dataset to @@ -40,34 +40,34 @@ Models can reference a local directory, or a model in the huggingface hub. Datasets can be from a local compatible directory or the huggingface hub. Recipes are YAML files that describe how a model should be optimized during or after training. -The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml). +The recipe used for this flow is located in [2of4_w4a16_recipe.yaml](./2of4_w4a16_recipe.yaml). It contains instructions to prune the model to 2:4 sparsity, run one epoch of recovery finetuning, and quantize to 4 bits in one show using GPTQ. 
```python import torch -from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoModelForCausalLM model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( model_stub, torch_dtype=torch.bfloat16, device_map="auto" ) dataset = "ultrachat-200k" splits = {"calibration": "train_gen[:5%]", "train": "train_gen"} -recipe = "2:4_w4a16_recipe.yaml" +recipe = "2of4_w4a16_recipe.yaml" ``` ## Step 2: Run sparsification using `apply` The `apply` function applies the given recipe to our model and dataset. The hardcoded kwargs may be altered based on each model's needs. -After running, the sparsified model will be saved to `output_llama7b_2:4_w4a16_channel`. +After running, the sparsified model will be saved to `output_llama7b_2of4_w4a16_channel`. ```python from llmcompressor.transformers import apply -output_dir = "output_llama7b_2:4_w4a16_channel" +output_dir = "output_llama7b_2of4_w4a16_channel" apply( model=model, @@ -86,6 +86,7 @@ apply( lr_scheduler_type="cosine", warmup_ratio=0.1, ) + ``` @@ -96,14 +97,14 @@ run the following: ```python import torch -from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoModelForCausalLM -compressed_output_dir = "output_llama7b_2:4_w4a16_channel_compressed" -model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16) +compressed_output_dir = "output_llama7b_2of4_w4a16_channel_compressed" +model = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16) model.save_pretrained(compressed_output_dir, save_compressed=True) ``` ### Custom Quantization The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are `tensor`, `group` and `channel`. -The above recipe (`2:4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group. 
-To use quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. A group size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml`. +The above recipe (`2of4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group. +To use quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. A group size quantization example is shown in `2of4_w4a16_group-128_recipe.yaml`. diff --git a/examples/quantization_24_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py similarity index 79% rename from examples/quantization_24_sparse_w4a16/llama7b_sparse_w4a16.py rename to examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index fba6db74f..7c1aee71d 100644 --- a/examples/quantization_24_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -1,13 +1,15 @@ import torch +from loguru import logger +from transformers import AutoModelForCausalLM -from llmcompressor.transformers import SparseAutoModelForCausalLM, apply +from llmcompressor.transformers import apply # define a recipe to handle sparsity, finetuning and quantization -recipe = "2:4_w4a16_recipe.yaml" +recipe = "2of4_w4a16_recipe.yaml" # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( model_stub, torch_dtype=torch.bfloat16, device_map="auto" ) @@ -15,7 +17,7 @@ dataset = "ultrachat-200k" # save location of quantized model -output_dir = "output_llama7b_2:4_w4a16_channel" +output_dir = "output_llama7b_2of4_w4a16_channel" # set 
dataset config parameters splits = {"calibration": "train_gen[:5%]", "train": "train_gen"} @@ -51,3 +53,7 @@ lr_scheduler_type=lr_scheduler_type, warmup_ratio=warmup_ratio, ) +logger.info( + "Note: vLLM requires the dtype=torch.float16 when running the ", + "compressed marlin-24 model", +) diff --git a/examples/quantization_kv_cache/README.md b/examples/quantization_kv_cache/README.md index 11d78ab18..906990c21 100644 --- a/examples/quantization_kv_cache/README.md +++ b/examples/quantization_kv_cache/README.md @@ -33,14 +33,13 @@ Let's walk through the main steps of the quantization process: ### 1. Load Model -Load the model using `SparseAutoModelForCausalLM`: +Load the model using `AutoModelForCausalLM`: ```python -from llmcompressor.transformers import SparseAutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index a3e2a8e95..6c08d4acc 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -1,11 +1,12 @@ from datasets import load_dataset -from transformers import AutoTokenizer +from loguru import logger +from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot # Select model and load it. 
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", @@ -81,6 +82,11 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +logger.info( + "Running sample generation. ", + "Note: Inference with the quantized kv_cache is not supported. ", + "Please use vLLM for inference with the quantized kv_cache.", +) # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") diff --git a/examples/quantization_w4a16/README.md b/examples/quantization_w4a16/README.md index 4ecac4ed0..718975331 100644 --- a/examples/quantization_w4a16/README.md +++ b/examples/quantization_w4a16/README.md @@ -34,14 +34,13 @@ Now, we will step though the code in the example. There are four steps: ### 1) Load Model -Load the model using `SparseAutoModelForCausalLM`, which is a wrapper around `AutoModel` for handling quantized saving and loading. Note that `SparseAutoModel` is compatible with `accelerate` so you can load your model onto multiple GPUs if needed. +Load the model using `AutoModelForCausalLM` for handling quantized saving and loading. 
```python -from llmcompressor.transformers import SparseAutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 939991ab6..c08165299 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -1,13 +1,13 @@ from datasets import load_dataset -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", diff --git a/examples/quantization_w8a8_fp8/README.md b/examples/quantization_w8a8_fp8/README.md index ef373caa1..091946623 100644 --- a/examples/quantization_w8a8_fp8/README.md +++ b/examples/quantization_w8a8_fp8/README.md @@ -9,7 +9,7 @@ To get started, install: ```bash -pip install llmcompressor==0.1.0 +pip install llmcompressor ``` ## Quickstart @@ -31,15 +31,14 @@ Now, we will step though the code in the example. There are three steps: ### 1) Load Model -Load the model using `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM` for saving and loading quantized models. 
+Load the model using `AutoModelForCausalLM` ```python -from llmcompressor.transformers import SparseAutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 20700da53..5b0662ec3 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -1,12 +1,12 @@ -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot MODEL_ID = "google/gemma-2-27b-it" # 1) Load model. -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -21,7 +21,12 @@ # 3) Apply quantization and save in compressed-tensors format. OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -oneshot(model=model, recipe=recipe, output_dir=OUTPUT_DIR, tokenizer=tokenizer) +oneshot( + model=model, + recipe=recipe, + tokenizer=tokenizer, + output_dir=OUTPUT_DIR, +) # Confirm generations of the quantized model look sane. 
print("========== SAMPLE GENERATION ==============") diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index 50bd1c4ff..d6ea7b363 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -1,13 +1,14 @@ from transformers import AutoProcessor, MllamaForConditionalGeneration from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot, wrap_hf_model_class +from llmcompressor.transformers import oneshot MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" # Load model. -model_class = wrap_hf_model_class(MllamaForConditionalGeneration) -model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") +model = MllamaForConditionalGeneration.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) processor = AutoProcessor.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. @@ -22,7 +23,11 @@ # Apply quantization and save to disk in compressed-tensors format. SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR) +oneshot( + model=model, + recipe=recipe, + output_dir=SAVE_DIR, +) processor.save_pretrained(SAVE_DIR) # Confirm generations of the quantized model look sane. 
diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index 582537915..6dc870b32 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -1,12 +1,12 @@ -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" # Load model. -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index 96dae0cee..6b3a721a1 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -1,13 +1,14 @@ from transformers import AutoProcessor, LlavaForConditionalGeneration from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot, wrap_hf_model_class +from llmcompressor.transformers import oneshot MODEL_ID = "llava-hf/llava-1.5-7b-hf" # Load model. -model_class = wrap_hf_model_class(LlavaForConditionalGeneration) -model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") +model = LlavaForConditionalGeneration.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) processor = AutoProcessor.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. 
diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index 32c345424..ab7e4f682 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -1,13 +1,14 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot, wrap_hf_model_class +from llmcompressor.transformers import oneshot MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" # Load model. -model_class = wrap_hf_model_class(Qwen2VLForConditionalGeneration) -model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") +model = Qwen2VLForConditionalGeneration.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) processor = AutoProcessor.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w8a8_int8/README.md b/examples/quantization_w8a8_int8/README.md index 8511721c0..a7e15c330 100644 --- a/examples/quantization_w8a8_int8/README.md +++ b/examples/quantization_w8a8_int8/README.md @@ -9,7 +9,7 @@ To get started, install: ```bash -pip install llmcompressor==0.1.0 +pip install llmcompressor ``` ## Quickstart @@ -32,14 +32,13 @@ Now, we will step though the code in the example. There are four steps: ### 1) Load Model -Load the model using `SparseAutoModelForCausalLM`, which is a wrapper around `AutoModel` for handling quantized saving and loading. Note that `SparseAutoModel` is compatible with `accelerate` so you can load your model onto multiple GPUs if needed. +Load the model using `AutoModelForCausalLM` for handling quantized saving and loading. 
```python -from llmcompressor.transformers import SparseAutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index 976ac5473..4747057eb 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -1,12 +1,12 @@ from datasets import load_dataset -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot # 1) Select model and load it. MODEL_ID = "google/gemma-2-2b-it" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", @@ -55,7 +55,6 @@ def tokenize(sample): # 3) Select quantization algorithms. In this case, we: # * quantize the weights to int8 with GPTQ (static per channel) # * quantize the activations to int8 (dynamic per token) -# Note: set sequential_update: true in the recipe to reduce memory recipe = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]) # 4) Apply quantization and save to disk compressed. 
diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index 123bc62bc..a97ed3198 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -1,13 +1,13 @@ from datasets import load_dataset -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", @@ -57,13 +57,12 @@ def tokenize(sample): # * apply SmoothQuant to make the activations easier to quantize # * quantize the weights to int8 with GPTQ (static per channel) # * quantize the activations to int8 (dynamic per token) -# Note: set sequential_update: true in the recipe to reduce memory recipe = [ SmoothQuantModifier(smoothing_strength=0.8), GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), ] -# Apply algorithms. 
+# Apply algorithms and save to output_dir oneshot( model=model, dataset=ds, diff --git a/examples/quantizing_moe/README.md b/examples/quantizing_moe/README.md index 4421bdf01..0bb6bf007 100644 --- a/examples/quantizing_moe/README.md +++ b/examples/quantizing_moe/README.md @@ -65,6 +65,7 @@ oneshot( max_seq_length=2048, num_calibration_samples=512, ) + ``` ### Custom Quantization diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index a6ac450d0..0515f8355 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -1,8 +1,8 @@ import torch from datasets import load_dataset -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # select a Mixture of Experts model for quantization @@ -18,7 +18,7 @@ trust_remote_code=True, ) -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index 32db0485f..4156261d1 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -1,13 +1,13 @@ from datasets import load_dataset -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot # select a Mixture of Experts model for quantization 
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -73,7 +73,6 @@ def tokenize(sample): output_dir=SAVE_DIR, ) - print("========== SAMPLE GENERATION ==============") SAMPLE_INPUT = ["I love quantization because"] tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index 3c02f5d8d..507bce620 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -1,9 +1,9 @@ import torch from datasets import load_dataset -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # select a Mixture of Experts model for quantization @@ -19,7 +19,7 @@ trust_remote_code=True, ) -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -70,7 +70,6 @@ def tokenize(sample): targets="Linear", scheme="W8A8", ignore=["lm_head", "re:.*mlp.gate$"], - sequential_update=True, ), ] @@ -86,7 +85,6 @@ def tokenize(sample): output_dir=SAVE_DIR, ) - print("========== SAMPLE GENERATION ==============") SAMPLE_INPUT = ["I love quantization because"] tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml 
b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml index 05e294365..23f276e2f 100644 --- a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml +++ b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: GPTQModifier: - sequential_update: true ignore: [lm_head, "re:.*mlp.gate$"] config_groups: group_0: diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index ac7510b03..bbbde067e 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -1,9 +1,9 @@ from typing import List -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot from llmcompressor.transformers.compression.helpers import calculate_offload_device_map MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" @@ -14,7 +14,7 @@ MODEL_ID, reserve_for_hessians=True, num_gpus=NUM_GPUS, torch_dtype="auto" ) -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map=device_map, torch_dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -26,7 +26,7 @@ NUM_CALIBRATION_SAMPLES = 512 # Save location of quantized model -OUTPUT_DIR = f"{MODEL_ID.split('/')[-1]}-FP8" +SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8" SAVE_COMPRESSED = True layers_to_ignore: List[str] = [ @@ -46,7 +46,7 @@ num_calibration_samples=NUM_CALIBRATION_SAMPLES, save_compressed=SAVE_COMPRESSED, overwrite_output_dir=True, - output_dir=OUTPUT_DIR, + output_dir=SAVE_DIR, ) # Confirm generations of the quantized model look sane. 
diff --git a/examples/trl_mixin/ex_trl_constant.py b/examples/trl_mixin/ex_trl_constant.py index 926a054c3..b2f597ec8 100644 --- a/examples/trl_mixin/ex_trl_constant.py +++ b/examples/trl_mixin/ex_trl_constant.py @@ -1,13 +1,13 @@ from datasets import load_dataset from sft_trainer import SFTTrainer -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from trl import DataCollatorForCompletionOnlyLM -from llmcompressor.transformers import SparseAutoModelForCausalLM, TrainingArguments +from llmcompressor.transformers import TrainingArguments model_path = "neuralmagic/Llama-2-7b-pruned50-retrained" output_dir = "./output_trl_sft_test_7b_gsm8k_sft_data" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( model_path, torch_dtype="auto", device_map="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_path) @@ -57,4 +57,3 @@ def formatting_prompts_func(example): max_seq_length=512, ) trainer.train() -trainer.save_model() diff --git a/examples/trl_mixin/ex_trl_distillation.py b/examples/trl_mixin/ex_trl_distillation.py index ee9a299ad..ff3ddf000 100644 --- a/examples/trl_mixin/ex_trl_distillation.py +++ b/examples/trl_mixin/ex_trl_distillation.py @@ -1,9 +1,8 @@ from sft_trainer import SFTTrainer -from transformers import AutoTokenizer, DefaultDataCollator +from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator from llmcompressor.transformers import ( DataTrainingArguments, - SparseAutoModelForCausalLM, TextGenerationDataset, TrainingArguments, ) @@ -12,10 +11,10 @@ teacher_path = "neuralmagic/Llama-2-7b-gsm8k" output_dir = "./output_trl_sft_test_7b_gsm8k" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( model_path, torch_dtype="auto", device_map="auto" ) -teacher = SparseAutoModelForCausalLM.from_pretrained( +teacher = AutoModelForCausalLM.from_pretrained( teacher_path, torch_dtype="auto", 
device_map="auto" ) diff --git a/setup.py b/setup.py index 389469341..71a681b48 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ "torch>=1.7.0", "transformers>4.0,<5.0", "datasets", - "accelerate>=0.20.3", + "accelerate>=0.20.3,!=1.1.0", "pynvml==11.5.3", "compressed-tensors" if version_info.build_type == "release" @@ -71,6 +71,7 @@ "pytest-mock>=3.6.0", "pytest-rerunfailures>=13.0", "parameterized", + "lm_eval==0.4.5", # example test dependencies "beautifulsoup4~=4.12.3", "cmarkgfm~=2024.1.14", diff --git a/src/llmcompressor/core/session.py b/src/llmcompressor/core/session.py index 41072feb9..7c489f36f 100644 --- a/src/llmcompressor/core/session.py +++ b/src/llmcompressor/core/session.py @@ -1,6 +1,8 @@ from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Union +from loguru import logger + from llmcompressor.core.events import EventType from llmcompressor.core.helpers import log_model_info, should_log_model_info from llmcompressor.core.lifecycle import CompressionLifecycle @@ -260,12 +262,16 @@ def reset_stage(self): self.lifecycle.initialized_ = False self.lifecycle.finalized = False - def get_serialized_recipe(self) -> str: + def get_serialized_recipe(self) -> Optional[str]: """ :return: serialized string of the current compiled recipe """ recipe = self.lifecycle.recipe_container.compiled_recipe - return recipe.yaml() + + if recipe is not None and hasattr(recipe, "yaml"): + return recipe.yaml() + + logger.warning("Recipe not found in session - it may have been reset") def _log_model_info(self): # Log model level logs if cadence reached diff --git a/src/llmcompressor/modifiers/distillation/output/base.py b/src/llmcompressor/modifiers/distillation/output/base.py index 3716db359..130e2470c 100644 --- a/src/llmcompressor/modifiers/distillation/output/base.py +++ b/src/llmcompressor/modifiers/distillation/output/base.py @@ -40,6 +40,12 @@ def on_initialize(self, state: State, **kwargs) -> bool: if kwargs.get("fsdp_active"): 
self.fsdp_active_ = True + if not hasattr(state.model.config, "hidden_size"): + raise ValueError( + "Model config must specify hidden_size in order to use " + "OutputDistillationModifier" + ) + # needed to initialize intermediate output buffers for student and teacher hidden_size = ( kwargs.get("metadata").get("per_device_train_batch_size", 1), diff --git a/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_wrapper.py b/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_wrapper.py index 77301cc6e..ee96e4763 100644 --- a/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_wrapper.py +++ b/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_wrapper.py @@ -24,6 +24,7 @@ def __init__( super(KDModuleWrapper, self).__init__() self.layer = layer + self._save_active = False self._fsdp_active = fsdp_active self.offload_output = offload_output self.kd_transforms = transforms @@ -88,16 +89,28 @@ def named_modules( prefix: str = "", remove_duplicate: bool = True, ): - # we want the full names of modules in two cases + # outside of saving, we want the full names of modules in two cases: # 1. trainer initialization, so teacher is moved to the correct device. This is # caught by the kd_enabled flag, which is set when the modifier is started # 2. running in DataParallel (non-FSDP) mode so the replicate function can pick # up the teacher. 
- if not self.kd_enabled or not self._fsdp_active: - return super().named_modules( + if self._save_active or (self.kd_enabled and self._fsdp_active): + return self.layer.named_modules( memo=memo, prefix=prefix, remove_duplicate=remove_duplicate ) - return self.layer.named_modules( + return super().named_modules( memo=memo, prefix=prefix, remove_duplicate=remove_duplicate ) + + def prepare_for_save(self): + """ + Prepare model structure to be saved, specifically `self.named_modules` + """ + self._save_active = True + + def finish_save(self): + """ + Finish saving model + """ + self._save_active = False diff --git a/src/llmcompressor/modifiers/distillation/utils/pytorch/model_wrapper.py b/src/llmcompressor/modifiers/distillation/utils/pytorch/model_wrapper.py index 0f7aaf672..33ba6f698 100644 --- a/src/llmcompressor/modifiers/distillation/utils/pytorch/model_wrapper.py +++ b/src/llmcompressor/modifiers/distillation/utils/pytorch/model_wrapper.py @@ -23,6 +23,7 @@ def __init__( self.teacher_model = teacher_model self.wrappers = wrappers self.kd_comparison = comparison + self._save_active = False self._fsdp_active = fsdp_active self.kd_enabled = False self.register_buffer(self.KD_LAST_COMPARISON, torch.zeros(1, device="cpu")) @@ -88,17 +89,17 @@ def named_modules( prefix: str = "", remove_duplicate: bool = True, ): - # we want the full names of modules in two cases + # outside of saving, we want the full names of modules in two cases: # 1. trainer initialization, so teacher is moved to the correct device. This is # caught by the kd_enabled flag, which is set when the modifier is started # 2. running in DataParallel (non-FSDP) mode so the replicate function can pick # up the teacher. 
- if not self.kd_enabled or not self._fsdp_active: - return super().named_modules( + if self._save_active or (self.kd_enabled and self._fsdp_active): + return self.student_model.named_modules( memo=memo, prefix=prefix, remove_duplicate=remove_duplicate ) - return self.student_model.named_modules( + return super().named_modules( memo=memo, prefix=prefix, remove_duplicate=remove_duplicate ) @@ -109,6 +110,24 @@ def train(self, mode: bool = True): self.student_model.train(mode) return self + def prepare_for_save(self): + """ + Prepare model structure to be saved, specifically `self.named_modules` + """ + self._save_active = True + for student_wrapper, teacher_wrapper in self.wrappers.values(): + student_wrapper.prepare_for_save() + teacher_wrapper.prepare_for_save() + + def finish_save(self): + """ + Finish saving model + """ + self._save_active = False + for student_wrapper, teacher_wrapper in self.wrappers.values(): + student_wrapper.finish_save() + teacher_wrapper.finish_save() + def __getattr__(self, name: str) -> Any: try: return super().__getattr__(name) diff --git a/src/llmcompressor/modifiers/modifier.py b/src/llmcompressor/modifiers/modifier.py index 494f8bdfc..65b4a4029 100644 --- a/src/llmcompressor/modifiers/modifier.py +++ b/src/llmcompressor/modifiers/modifier.py @@ -1,16 +1,15 @@ -from abc import ABC, abstractmethod +from abc import abstractmethod from typing import Optional -from pydantic import BaseModel - from llmcompressor.core.events import Event, EventType from llmcompressor.core.state import State from llmcompressor.modifiers.interface import ModifierInterface +from llmcompressor.modifiers.utils.hooks import HooksMixin __all__ = ["Modifier"] -class Modifier(BaseModel, ModifierInterface, ABC): +class Modifier(ModifierInterface, HooksMixin): """ A base class for all modifiers to inherit from. Modifiers are used to modify the training process for a model. 
diff --git a/src/llmcompressor/modifiers/obcq/base.py b/src/llmcompressor/modifiers/obcq/base.py index 3da0e3d0c..9cf0ff331 100644 --- a/src/llmcompressor/modifiers/obcq/base.py +++ b/src/llmcompressor/modifiers/obcq/base.py @@ -1,3 +1,4 @@ +from functools import partial from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np @@ -130,7 +131,8 @@ def initialize_compression( "Inferring layer-wise sparsities from " f"{len(dataloader)} calibration samples..." ) - self.sparsity = self._infer_layer_sparsity(dataloader) + activations = self._get_activations(dataloader) + self.sparsity = self._infer_layer_sparsity(activations) self._validate_layerwise_sparsity() for idx, (name, layer) in enumerate(self.compressible_layers_.items()): @@ -254,19 +256,17 @@ def _infer_mask_block_size(self): self.prunen_, self.prunem_ = list(map(int, self.mask_structure.split(":"))) - def _infer_layer_sparsity(self, calibration_dataloader): - acts = _get_activations(self.model, calibration_dataloader) + def _infer_layer_sparsity(self, activations): sparsegpt_groups = {} for name, layer in self.compressible_layers_.items(): prunable_layers = get_prunable_layers(layer) z = [ - m.weight.abs() * acts[f"{name}.{n}"].unsqueeze(0) + m.weight.abs() * activations[f"{name}.{n}"].unsqueeze(0) for n, m in prunable_layers.items() ] sparsegpt_groups[name] = torch.cat([item.flatten().cpu() for item in z]) - acts = None - del acts + del activations torch.cuda.empty_cache() outlier_ratios = {} @@ -300,36 +300,34 @@ def _infer_layer_sparsity(self, calibration_dataloader): logger.info(f"Sparsity for {k}: {sparsities[k]}") return sparsities + @torch.no_grad() + def _get_activations(self, data_loader, nsamples=128): + self.model.eval() + acts = {} + + def save_acts(module, input, name): + if isinstance(input, tuple): + input = input[0] + if name not in acts: + acts[name] = ( + 1.0 / nsamples * input.detach().pow(2).sum(dim=(0, 1)).sqrt() + ) + else: + acts[name] += ( + 1.0 / 
nsamples * input.detach().pow(2).sum(dim=(0, 1)).sqrt() + ) + + for name, mod in self.model.named_modules(): + if isinstance(mod, torch.nn.Linear) and "lm_head" not in name: + self.register_hook(mod, partial(save_acts, name=name), "forward_pre") + + device = next(self.model.parameters()).device + for batch in tqdm(data_loader): + batch = {k: v.to(device) for k, v in batch.items()} + self.model(**batch) + batch = None + torch.cuda.empty_cache() -@torch.no_grad() -def _get_activations(model, data_loader, nsamples=128): - import functools - - model.eval() - acts = {} - - def save_acts(module, input, name): - if isinstance(input, tuple): - input = input[0] - if name not in acts: - acts[name] = 1.0 / nsamples * input.detach().pow(2).sum(dim=(0, 1)).sqrt() - else: - acts[name] += 1.0 / nsamples * input.detach().pow(2).sum(dim=(0, 1)).sqrt() - - hooks = [] - for name, mod in model.named_modules(): - if isinstance(mod, torch.nn.Linear) and "lm_head" not in name: - hooks.append( - mod.register_forward_pre_hook(functools.partial(save_acts, name=name)) - ) - device = next(model.parameters()).device - for batch in tqdm(data_loader): - batch = {k: v.to(device) for k, v in batch.items()} - model(**batch) - batch = None - torch.cuda.empty_cache() - - for h in hooks: - h.remove() + self.remove_hooks() - return acts + return acts diff --git a/src/llmcompressor/modifiers/obcq/utils/sgpt_wrapper.py b/src/llmcompressor/modifiers/obcq/utils/sgpt_wrapper.py index b3eea3b56..0ea1eaeef 100644 --- a/src/llmcompressor/modifiers/obcq/utils/sgpt_wrapper.py +++ b/src/llmcompressor/modifiers/obcq/utils/sgpt_wrapper.py @@ -1,8 +1,6 @@ import time -from compressed_tensors.quantization.lifecycle.forward import ( - maybe_calibrate_or_quantize, -) +from compressed_tensors.quantization.lifecycle.forward import forward_quantize from llmcompressor.modifiers.utils.compression_wrapper import ModuleCompressionWrapper from llmcompressor.utils import getattr_chain @@ -40,7 +38,10 @@ def __init__(self, name, 
layer): # for Hessian calculation self.register_buffer( - "H", torch.zeros((self.columns, self.columns), device=self.dev) + "H", + torch.zeros( + (self.columns, self.columns), device=self.dev, dtype=torch.float32 + ), ) def add_batch(self, inp: torch.Tensor, out: torch.Tensor): @@ -61,7 +62,8 @@ def add_batch(self, inp: torch.Tensor, out: torch.Tensor): inp = inp.t() self.H *= self.nsamples / (self.nsamples + tmp) self.nsamples += tmp - inp = math.sqrt(2 / self.nsamples) * inp.float() + inp = inp.to(dtype=self.H.dtype) + inp = math.sqrt(2 / self.nsamples) * inp self.H += inp.matmul(inp.t()).to(self.dev) def compress( @@ -91,7 +93,7 @@ def compress( args_loc = "quantization_scheme.weights" weight_quant_args = getattr_chain(self.layer, args_loc, None) if weight_quant_args is not None: - W = maybe_calibrate_or_quantize(self.layer, W, "weight", weight_quant_args) + W = forward_quantize(self.layer, W, "weight", weight_quant_args) if isinstance(self.layer, nn.Conv2d): W = W.flatten(1) @@ -209,7 +211,7 @@ def compress( W = W.t() W = W.reshape(final_shape).to(final_dtype) if weight_quant_args is not None: - W = maybe_calibrate_or_quantize(self.layer, W, "weight", weight_quant_args) + W = forward_quantize(self.layer, W, "weight", weight_quant_args) # This is a bit hacky, but FSDP updates only work if we change the weight in # place, clone() or direct assignment won't work diff --git a/src/llmcompressor/modifiers/pruning/utils/pytorch/layer_mask.py b/src/llmcompressor/modifiers/pruning/utils/pytorch/layer_mask.py index 3ada8c7fb..d59b4563b 100644 --- a/src/llmcompressor/modifiers/pruning/utils/pytorch/layer_mask.py +++ b/src/llmcompressor/modifiers/pruning/utils/pytorch/layer_mask.py @@ -2,11 +2,10 @@ from typing import Dict import torch -from pydantic import BaseModel from torch.nn import Parameter -from torch.utils.hooks import RemovableHandle from llmcompressor.core import ModelParameterizedLayer +from llmcompressor.modifiers.utils.hooks import HooksMixin __all__ = 
["LayerParamMasking", "param_mask_name"] @@ -39,11 +38,9 @@ class ParameterizedLayerMaskSettings: use_hooks: bool = False -class LayerParamMasking(BaseModel): +class LayerParamMasking(HooksMixin): _mask_settings: Dict[str, ParameterizedLayerMaskSettings] = {} _masked_layer_params: Dict[str, ModelParameterizedLayer] = {} - _forward_hooks: Dict[str, RemovableHandle] = {} - _backward_hooks: Dict[str, RemovableHandle] = {} enabled_: bool = False def add_mask( @@ -100,12 +97,8 @@ def _backward_hook_fn(gradients): return gradients - self._forward_hooks[layer_param_name] = ( - parameterized_layer.layer.register_forward_hook(_forward_hook_fn) - ) - self._backward_hooks[layer_param_name] = ( - parameterized_layer.param.register_hook(_backward_hook_fn) - ) + self.register_hook(parameterized_layer.layer, _forward_hook_fn, "forward") + self.register_hook(parameterized_layer.param, _backward_hook_fn, "") def update_mask( self, @@ -131,11 +124,7 @@ def remove_mask(self, layer_param_name: str): del self._mask_settings[layer_param_name] if mask_settings.use_hooks: - self._forward_hooks[layer_param_name].remove() - self._backward_hooks[layer_param_name].remove() - - del self._forward_hooks[layer_param_name] - del self._backward_hooks[layer_param_name] + self.remove_hooks() def apply_mask_weight(self, layer_param_name: str): if not self.enabled_: diff --git a/src/llmcompressor/modifiers/pruning/wanda/base.py b/src/llmcompressor/modifiers/pruning/wanda/base.py index f056ee1ae..1881a347c 100644 --- a/src/llmcompressor/modifiers/pruning/wanda/base.py +++ b/src/llmcompressor/modifiers/pruning/wanda/base.py @@ -1,3 +1,4 @@ +from functools import partial from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np @@ -121,7 +122,8 @@ def initialize_compression( "Inferring layer-wise sparsities from " f"{len(dataloader) if dataloader else 0} calibration samples..." 
) - self.sparsity = self._infer_layer_sparsity(dataloader) + activations = self._get_activations(dataloader) + self.sparsity = self._infer_layer_sparsity(activations) self._validate_layerwise_sparsity() for idx, (name, layer) in enumerate(self.compressible_layers_.items()): @@ -224,19 +226,17 @@ def _infer_mask_block_size(self): self.prunen_, self.prunem_ = list(map(int, self.mask_structure.split(":"))) - def _infer_layer_sparsity(self, calibration_dataloader): - acts = _get_activations(self.model, calibration_dataloader) + def _infer_layer_sparsity(self, activations): wanda = {} for name, layer in self.compressible_layers_.items(): prunable_layers = get_prunable_layers(layer) z = [ - m.weight.abs() * acts[f"{name}.{n}"].unsqueeze(0) + m.weight.abs() * activations[f"{name}.{n}"].unsqueeze(0) for n, m in prunable_layers.items() ] wanda[name] = torch.cat([item.flatten().cpu() for item in z]) - acts = None - del acts + del activations torch.cuda.empty_cache() outlier_ratios = {} @@ -268,36 +268,34 @@ def _infer_layer_sparsity(self, calibration_dataloader): logger.info(f"Sparsity for {k}: {sparsities[k]}") return sparsities + @torch.no_grad() + def _get_activations(self, data_loader, nsamples=128): + self.model.eval() + acts = {} + + def save_acts(module, input, name): + if isinstance(input, tuple): + input = input[0] + if name not in acts: + acts[name] = ( + 1.0 / nsamples * input.detach().pow(2).sum(dim=(0, 1)).sqrt() + ) + else: + acts[name] += ( + 1.0 / nsamples * input.detach().pow(2).sum(dim=(0, 1)).sqrt() + ) + + for name, mod in self.model.named_modules(): + if isinstance(mod, torch.nn.Linear) and "lm_head" not in name: + self.register_hook(mod, partial(save_acts, name=name), "forward_pre") + + device = next(self.model.parameters()).device + for batch in tqdm(data_loader): + batch = {k: v.to(device) for k, v in batch.items()} + self.model(**batch) + batch = None + torch.cuda.empty_cache() -@torch.no_grad() -def _get_activations(model, data_loader, 
nsamples=128): - import functools - - model.eval() - acts = {} - - def save_acts(module, input, name): - if isinstance(input, tuple): - input = input[0] - if name not in acts: - acts[name] = 1.0 / nsamples * input.detach().pow(2).sum(dim=(0, 1)).sqrt() - else: - acts[name] += 1.0 / nsamples * input.detach().pow(2).sum(dim=(0, 1)).sqrt() - - hooks = [] - for name, mod in model.named_modules(): - if isinstance(mod, torch.nn.Linear) and "lm_head" not in name: - hooks.append( - mod.register_forward_pre_hook(functools.partial(save_acts, name=name)) - ) - device = next(model.parameters()).device - for batch in tqdm(data_loader): - batch = {k: v.to(device) for k, v in batch.items()} - model(**batch) - batch = None - torch.cuda.empty_cache() - - for h in hooks: - h.remove() + self.remove_hooks() - return acts + return acts diff --git a/src/llmcompressor/modifiers/quantization/__init__.py b/src/llmcompressor/modifiers/quantization/__init__.py index 226869f39..f1cdf596c 100644 --- a/src/llmcompressor/modifiers/quantization/__init__.py +++ b/src/llmcompressor/modifiers/quantization/__init__.py @@ -1,4 +1,5 @@ # flake8: noqa +from .cache import * from .gptq import * from .quantization import * diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py new file mode 100644 index 000000000..d89964ed3 --- /dev/null +++ b/src/llmcompressor/modifiers/quantization/cache.py @@ -0,0 +1,202 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Any, Dict, List, Optional, Tuple + +from compressed_tensors.quantization.lifecycle import KVCacheScaleType +from compressed_tensors.quantization.quant_args import QuantizationArgs +from torch import Tensor +from transformers import DynamicCache + +from llmcompressor.observers import Observer + + +class QuantizedKVParameterCache(DynamicCache): + """ + Quantized KV cache used in the forward call based on HF's dynamic cache. + Quantization strategy (tensor, group, channel) set from Quantization arg's strategy + Singleton, so that the same cache gets reused in all forward call of self_attn. + Each time forward is called, .update() is called, and ._quantize(), ._dequantize() + gets called appropriately. + The size of tensor is + `[batch_size, num_heads, seq_len - residual_length, head_dim]`. + + + Triggered by adding kv_cache_scheme in the recipe. + + Example: + + ```python3 + recipe = ''' + quant_stage: + quant_modifiers: + QuantizationModifier: + kv_cache_scheme: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true + ''' + + """ + + _instance = None + _initialized = False + + def __new__(cls, *args, **kwargs): + """Singleton""" + if cls._instance is None: + cls._instance = super(QuantizedKVParameterCache, cls).__new__(cls) + return cls._instance + + def __init__(self, quantization_args: QuantizationArgs): + if not self._initialized: + super().__init__() + + self.quantization_args = quantization_args + + self.k_observers: List[Observer] = [] + self.v_observers: List[Observer] = [] + + # each index corresponds to layer_idx of the attention layer + self.k_scales: List[Tensor] = [] + self.v_scales: List[Tensor] = [] + + self.k_zps: List[Tensor] = [] + self.v_zps: List[Tensor] = [] + + self._initialized = True + + def update( + self, + key_states: Tensor, + value_states: Tensor, + layer_idx: int, + cache_kwargs: 
Optional[Dict[str, Any]] = None, + ) -> Tuple[Tensor, Tensor]: + """ + Get the k_scale and v_scale and output the + fakequant-ed key_states and value_states + """ + + if len(self.k_observers) <= layer_idx: + k_observer_name = self.quantization_args.get_observer() + k_observer = Observer.load_from_registry( + k_observer_name, quantization_args=self.quantization_args + ) + v_observer_name = self.quantization_args.get_observer() + v_observer = Observer.load_from_registry( + v_observer_name, quantization_args=self.quantization_args + ) + + self.k_observers.append(k_observer) + self.v_observers.append(v_observer) + + q_key_states = self._quantize( + key_states.contiguous(), KVCacheScaleType.KEY, layer_idx + ) + q_value_states = self._quantize( + value_states.contiguous(), KVCacheScaleType.VALUE, layer_idx + ) + + qdq_key_states = self._dequantize(q_key_states, KVCacheScaleType.KEY, layer_idx) + qdq_value_states = self._dequantize( + q_value_states, KVCacheScaleType.VALUE, layer_idx + ) + + keys_to_return, values_to_return = qdq_key_states, qdq_value_states + + return keys_to_return, values_to_return + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """ + Returns the sequence length of the cached states. + A layer index can be optionally passed. 
+ """ + if len(self.key_cache) <= layer_idx: + return 0 + # since we cannot get the seq_length of each layer directly and + # rely on `_seen_tokens` which is updated every "layer_idx" == 0, + # this is a hack to get the actual seq_length for the given layer_idx + # this part of code otherwise fails when used to + # verify attn_weight shape in some models + return self._seen_tokens if layer_idx == 0 else self._seen_tokens - 1 + + def reset_states(self): + """reset the kv states (used in calibration)""" + self.key_cache: List[Tensor] = [] + self.value_cache: List[Tensor] = [] + # Used in `generate` to keep tally of how many tokens the cache has seen + self._seen_tokens = 0 + self._quantized_key_cache: List[Tensor] = [] + self._quantized_value_cache: List[Tensor] = [] + + def reset(self): + """ + Reset the instantiation, create new instance on init + """ + QuantizedKVParameterCache._instance = None + QuantizedKVParameterCache._initialized = False + + def _quantize(self, tensor, kv_type, layer_idx): + """Quantizes a key/value using a defined quantization method.""" + from compressed_tensors.quantization.lifecycle.forward import quantize + + if kv_type == KVCacheScaleType.KEY: # key type + observer = self.k_observers[layer_idx] + scales = self.k_scales + zps = self.k_zps + else: + assert kv_type == KVCacheScaleType.VALUE + observer = self.v_observers[layer_idx] + scales = self.v_scales + zps = self.v_zps + + scale, zp = observer(tensor) + if len(scales) <= layer_idx: + scales.append(scale) + zps.append(zp) + else: + scales[layer_idx] = scale + zps[layer_idx] = zp + + q_tensor = quantize( + x=tensor, + scale=scale, + zero_point=zp, + args=self.quantization_args, + ) + return q_tensor + + def _dequantize(self, qtensor, kv_type, layer_idx): + """Dequantizes back the tensor that was quantized by `self._quantize()`""" + from compressed_tensors.quantization.lifecycle.forward import dequantize + + if kv_type == KVCacheScaleType.KEY: + scale = self.k_scales[layer_idx] + zp =
self.k_zps[layer_idx] + else: + assert kv_type == KVCacheScaleType.VALUE + scale = self.v_scales[layer_idx] + zp = self.v_zps[layer_idx] + + qdq_tensor = dequantize( + x_q=qtensor, + scale=scale, + zero_point=zp, + args=self.quantization_args, + ) + return qdq_tensor diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py new file mode 100644 index 000000000..ee4ce171e --- /dev/null +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -0,0 +1,249 @@ +from typing import Any, Dict, Optional, Tuple + +import torch +from compressed_tensors.quantization import QuantizationStatus, is_attention_module +from compressed_tensors.quantization.lifecycle.forward import forward_quantize +from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme +from compressed_tensors.utils.offload import is_module_offloaded, update_parameter_data +from loguru import logger +from torch.nn import Module + +from llmcompressor.modifiers.quantization.cache import QuantizedKVParameterCache +from llmcompressor.observers import Observer + +__all__ = [ + "initialize_observer", + "update_weight_zp_scale", + "calibrate_input_hook", + "calibrate_output_hook", + "calibrate_kv_cache_input_hook", + "calibrate_kv_cache_output_hook", + "set_unset_kv_cache", + "freeze_module_quantization", + "apply_calibration_status", +] + + +def initialize_observer( + module: Module, + base_name: str, +): + """ + Initialize observer module and attach as submodule. + The name of the observer is fetched from the quantization_args. + The name is then used to load the observer from the registry and attached + to the module. The name of the observer uses the base_name provided. 
+ + :param module: torch.nn.Module that the observer is being attached to + :param base_name: str used to name the observer attribute + + """ + + arg_name = "weights" if base_name == "weight" else f"{base_name}_activations" + quantization_scheme = getattr(module, "quantization_scheme", None) + if not quantization_scheme: + # no quantization scheme nothing to do + return + + # observers have a different lifecycle for kv_cache + if is_attention_module(module): + return + + quantization_args = getattr(quantization_scheme, arg_name, None) + # dont need observers for dynamic + if quantization_args and not quantization_args.dynamic: + observer = quantization_args.get_observer() + observer = Observer.load_from_registry( + observer, quantization_args=quantization_args + ) + module.register_module(f"{base_name}_observer", observer) + + +def call_observer(module: Module, base_name: str, value: Optional[torch.Tensor] = None): + """ + Call a module's attached input/weight/output observer using a provided value. + Update the module's scale and zp using the observer's return values. + + :param module: torch.nn.Module + :param base_name: substring used to fetch the observer, scales, and zp + :param value: torch.Tensor to be passed to the observer for activations. 
If + base_name is "weight", then the module's weight tensor will be used + """ + offloaded = is_module_offloaded(module) + if offloaded: + module._hf_hook.pre_forward(module) + + if base_name == "weight": + value = module.weight + g_idx = getattr(module, "weight_g_idx", None) + elif value is not None: + g_idx = None + else: + raise ValueError("Must provide a value to observe if not using weight observer") + + observer = getattr(module, f"{base_name}_observer") + updated_scale, updated_zero_point = observer(value, g_idx=g_idx) + + # update scale and zero point + update_parameter_data(module, updated_scale, f"{base_name}_scale") + update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point") + + if offloaded: + module._hf_hook.post_forward(module, None) + + +def update_weight_zp_scale(module: Module): + """ + marks a layer as ready for calibration which activates observers + to update scales and zero points on each forward pass + + apply to full model with `model.apply(update_weight_zp_scale)` + + :param module: module to set for calibration + :param quantize_weights_upfront: whether to automatically + run weight quantization at the start of calibration + """ + if not getattr(module, "quantization_scheme", None): + # no quantization scheme nothing to do + return + + status = getattr(module, "quantization_status", None) + if not status: + # not set to initialize; no scales/zp to update + return + if status != QuantizationStatus.INITIALIZED: + logger.warning( + f"Attempting set module with status {status} to calibration mode. 
" + f"but status is not {QuantizationStatus.INITIALIZED} - you may " + "be calibrating an uninitialized module which may fail or attempting " + "to re-calibrate a frozen module" + ) + + if module.quantization_scheme.weights is not None: + # set weight scale and zero_point up front, calibration data doesn't affect it + call_observer(module=module, base_name="weight") + + +def calibrate_activations(module: Module, value: torch.Tensor, base_name: str): + """ + Calibrate input or output activations by calling the a module's attached + observer. + + :param module: torch.nn.Module + :param base_name: substring used to fetch the observer, scales, and zp + :param value: torch.Tensor to be passed to the observer + + """ + # If empty tensor, can't update zp/scale + # Case for MoEs + if value.numel() == 0: + return + + call_observer( + module=module, + base_name=base_name, + value=value, + ) + + +def calibrate_input_hook(module: Module, args: Any): + """ + Hook to calibrate input activations. + Will call the observers to update the scales/zp before applying + input QDQ in the module's forward pass. + """ + args = args[0] if isinstance(args, tuple) else args + calibrate_activations(module, value=args, base_name="input") + + +def calibrate_output_hook(module: Module, _args: Any, output: torch.Tensor): + """ + Hook to calibrate output activations. + Will call the observers to update the scales/zp before applying + output QDQ. + """ + calibrate_activations( + module, + value=output, + base_name="output", + ) + output = forward_quantize( + module=module, + value=output, + base_name="output", + args=module.quantization_scheme.output_activations, + ) + return output + + +def calibrate_kv_cache_input_hook( + module: Module, args: Any, kwargs: Dict[str, Any] +) -> Tuple[Tuple[Any, ...], Dict[str, Any]]: + """ + Hook to update inputs to attention layers when running + kv_cache quantization. Will update the passed in + kv_cache to singleton QuantizedKVParameterCache. 
+ """ + kv_cache = getattr(module, "kv_cache") + kwargs["past_key_value"] = kv_cache + kwargs["use_cache"] = False + return args, kwargs + + +def calibrate_kv_cache_output_hook(module: Module, _args: Any, _output: torch.Tensor): + """ + Hook to update k_scale and v_scale parameters when running kv_cache quantization. + """ + kv_cache = getattr(module, "kv_cache") + update_parameter_data(module, kv_cache.k_scales[module.layer_idx], "k_scale") + update_parameter_data(module, kv_cache.v_scales[module.layer_idx], "v_scale") + + +def set_unset_kv_cache(module: Module): + """ + Set or unset singleton QuantizedKVParameterCache for each + attn module when running kv_cache quantization. + """ + if not hasattr(module, "quantization_scheme"): + return + + if is_kv_cache_quant_scheme(module.quantization_scheme): + output_args = module.quantization_scheme.output_activations + kv_cache = QuantizedKVParameterCache(output_args) + if hasattr(module, "kv_cache"): + delattr(module, "kv_cache") + else: + setattr(module, "kv_cache", kv_cache) + + +def apply_calibration_status(module: Module): + scheme = getattr(module, "quantization_scheme", None) + if not scheme: + # no quantization scheme nothing to do + return + module.quantization_status = QuantizationStatus.CALIBRATION + + +def freeze_module_quantization(module: Module): + """ + deletes observers when calibration is complete. 
+ + apply to full model with `model.apply(freeze_module_quantization)` + + :param module: module to freeze quantization for + """ + scheme = getattr(module, "quantization_scheme", None) + if not scheme: + # no quantization scheme nothing to do + return + + if module.quantization_status == QuantizationStatus.FROZEN: + # nothing to do, already frozen + return + + for name in ("input", "weight", "output"): + obs_name = f"{name}_observer" + if hasattr(module, obs_name): + delattr(module, obs_name) + + module.quantization_status = QuantizationStatus.FROZEN diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index b472e289e..c5200cf0f 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -1,4 +1,4 @@ -import gc +import warnings from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch @@ -6,7 +6,6 @@ QuantizationScheme, disable_quantization, enable_quantization, - freeze_module_quantization, ) from loguru import logger from pydantic import Field, field_validator @@ -14,6 +13,7 @@ from llmcompressor.core import State from llmcompressor.modifiers import Modifier, ModifierFactory +from llmcompressor.modifiers.quantization.calibration import freeze_module_quantization from llmcompressor.modifiers.quantization.gptq.utils import ( GPTQWrapper, get_output_error, @@ -21,6 +21,7 @@ from llmcompressor.modifiers.utils.layer_compressor import LayerCompressor from llmcompressor.modifiers.utils.pytorch_helpers import run_calibration_forward from llmcompressor.utils.fsdp.context import fix_fsdp_module_name +from llmcompressor.utils.helpers import DisableKVCache from llmcompressor.utils.pytorch.module import ( get_layers, get_no_split_params, @@ -48,7 +49,6 @@ class GPTQModifier(Modifier): | test_stage: | obcq_modifiers: | GPTQModifier: - | sequential_update: true | dampening_frac: 0.001 | block_size: 128 | 
config_groups: @@ -66,8 +66,8 @@ class GPTQModifier(Modifier): | actorder: False - :param sequential_update: Whether or not to update weights sequentially by layer, - True saves on GPU memory, default is True + :param sequential_update: Whether or not to update weights sequentially by layer. + This option is deprecated and setting to False is no longer supported :param targets: list of layer names to compress during GPTQ, or '__ALL__' to compress every layer in the model :param block_size: Used to determine number of columns to compress in one pass @@ -97,7 +97,7 @@ class GPTQModifier(Modifier): and activation 8 bit quantization on the Linear layers. """ - sequential_update: bool = True + sequential_update: bool = True # DEPRECATED targets: Union[str, List[str], None] = None sequential_targets: Union[str, List[str], None] = None block_size: int = 128 @@ -117,13 +117,13 @@ class GPTQModifier(Modifier): @field_validator("sequential_update", mode="before") def validate_sequential_update(cls, value: bool) -> bool: if not value: - logger.warning( - "Not using sequential_update requires allocating all hessians in " - "GPU memory. 
If you are running into GPU memory issues, consider " - "using sequential_update=True" + warnings.warn( + "`sequential_update=False` is no longer supported, setting " + "sequential_update=True", + DeprecationWarning, ) - return value + return True def on_initialize_structure(self, state: State, **kwargs): """ @@ -245,7 +245,7 @@ def initialize_compression( compressible layers of model, and sets the device :param model: model to initialize for compression - :param dataloader: calibration data for GPTQ + :param dataloader: calibration data, not used by GPTQ in this function """ self.model = model self.compressible_layers_ = self.compressible_layers() @@ -257,16 +257,12 @@ def initialize_compression( args = self._pruning_arguments() comp_cls = self._compression_class() compressor = LayerCompressor(comp_cls, self.model, layer, idx, name, args) - - # if running sequentially, allocate all hessians now - if not self.sequential_update: - compressor.pre_compress() - self.layer_compressors_.append(compressor) - if self.sequential_update: - first_layer_compressor = self.layer_compressors_[0] - first_layer_compressor.set_early_stop() + # for the initial forward data pass, add an early stop exception in order + # to capture inputs right before being compressed by first module + first_layer_compressor = self.layer_compressors_[0] + first_layer_compressor.set_early_stop() @torch.no_grad() def apply_compression( @@ -286,48 +282,33 @@ def apply_compression( # want to calibrate wrt to these self.model.apply(disable_quantization) - forward_pass_use_cache = self.model.config.use_cache - self.model.config.use_cache = False - - # in non-sequential mode we run calibration through the full model - # in sequential mode we run calibration up to the first transformer target - intermediates = run_calibration_forward( - self.model, dataloader, mask_padding=True - ) - self.layer_compressors_[0].clear_early_stop() - - # empty cache if not using sequential update - if not self.sequential_update: 
- del intermediates - gc.collect() - torch.cuda.empty_cache() + with DisableKVCache(self.model): + # run_calibration_forward uses the early stop exception to capture values + # as intermediates right before the forward pass of the first module + intermediates = run_calibration_forward( + self.model, dataloader, mask_padding=True + ) + self.layer_compressors_[0].clear_early_stop() - num_layers = len(self.compressible_layers_) - for idx, layer_compressor in enumerate(self.layer_compressors_): - logger.info(f"\n===== Compressing layer {idx+1}/{num_layers} " " =====") + num_layers = len(self.compressible_layers_) + for idx, layer_compressor in enumerate(self.layer_compressors_): + logger.info(f"\n===== Compressing layer {idx+1}/{num_layers} " " =====") - if self.sequential_update: - # in sequential mode we run the forward pass for each transformer layer - # one at a time, caching the intermediate outputs between layers + # run the forward pass for each transformer layer (block) one at a time logger.info(f"Calibrating {layer_compressor.name}...") layer_compressor.pre_compress() unquantized_outputs = layer_compressor.calibrate_layer(intermediates) - layer_compressor.compress() - layer_compressor.post_compress() - layer_compressor.revert_layer_wrappers() + layer_compressor.compress() + layer_compressor.post_compress() + layer_compressor.revert_layer_wrappers() - if self.sequential_update: + # perform a second forward pass of the module to calculate + # weight-quantized outputs for use as inputs to the next layer quantized_outputs = layer_compressor.calibrate_layer(intermediates) error = get_output_error(unquantized_outputs, quantized_outputs) logger.info(f"Mean output error from quantization: {error:.3f}") intermediates = quantized_outputs - del unquantized_outputs - - gc.collect() - torch.cuda.empty_cache() - - self.model.config.use_cache = forward_pass_use_cache # re-enable quantization self.model.apply(enable_quantization) diff --git 
a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index c54d9ca91..02eafb669 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -1,21 +1,15 @@ import time from typing import Tuple -from compressed_tensors.quantization import ( - ActivationOrdering, - QuantizationArgs, - QuantizationStrategy, -) +from compressed_tensors.quantization import ActivationOrdering, QuantizationStrategy from compressed_tensors.quantization.lifecycle.forward import fake_quantize from llmcompressor.modifiers.utils import SPARSITY_THRESHOLD from llmcompressor.modifiers.utils.compression_wrapper import ModuleCompressionWrapper +from llmcompressor.observers import Observer from llmcompressor.pytorch.utils.helpers import tensor_sparsity from llmcompressor.utils import getattr_chain -from llmcompressor.utils.metric_logging import ( - get_GPU_memory_usage, - get_layer_size_bytes, -) +from llmcompressor.utils.metric_logging import get_GPU_memory_usage, get_layer_size_mb try: import transformers @@ -57,7 +51,10 @@ def __init__(self, name, layer): # for Hessian calculation self.register_buffer( - "H", torch.zeros((self.columns, self.columns), device=self.dev) + "H", + torch.zeros( + (self.columns, self.columns), device=self.dev, dtype=torch.float32 + ), ) def add_batch(self, inp: torch.Tensor, out: torch.Tensor): @@ -78,7 +75,8 @@ def add_batch(self, inp: torch.Tensor, out: torch.Tensor): inp = inp.t() self.H *= self.nsamples / (self.nsamples + tmp) self.nsamples += tmp - inp = math.sqrt(2 / self.nsamples) * inp.float() + inp = inp.to(dtype=self.H.dtype) + inp = math.sqrt(2 / self.nsamples) * inp self.H += inp.matmul(inp.t()) def compress( @@ -95,20 +93,27 @@ def compress( diagonal norm """ args_loc = "quantization_scheme.weights" - weight_quant_args = getattr_chain(self.layer, args_loc, None) - if 
weight_quant_args is None: + quant_args = getattr_chain(self.layer, args_loc, None) + if quant_args is None: logger.debug(f"Skipping unquantized layer {self.name}...") return if is_module_offloaded(self.layer): self.layer._hf_hook.pre_forward(self.layer) - strategy = weight_quant_args.strategy - actorder = weight_quant_args.actorder + strategy = quant_args.strategy + actorder = quant_args.actorder final_shape = self.layer.weight.shape final_dtype = self.layer.weight.dtype W = self.layer.weight.data.clone() + # create observer for calculating quantization parameters + observer = Observer.load_from_registry( + quant_args.observer, + quantization_args=quant_args, + averaging_constant=1.0, # ignore moving average + ) + # standardize shape and dtype if isinstance(self.layer, nn.Conv2d): W = W.flatten(1) @@ -122,26 +127,28 @@ def compress( # mapping from column index to group index g_idx = ( torch.arange(self.columns, device=W.device, dtype=torch.int) - // weight_quant_args.group_size + // quant_args.group_size ) if actorder == ActivationOrdering.GROUP: # permute by activation order first, then update groups W, self.H, perm = self._apply_activation_ordering(W, self.H) - self._update_quantization_parameters(weight_quant_args, W) + scale, zero_point = observer(W, g_idx=None) # use identity g_idx (invert permutation later) elif actorder == ActivationOrdering.WEIGHT: # update groups first, then permute by activation order - self._update_quantization_parameters(weight_quant_args, W) + scale, zero_point = observer(W, g_idx=None) W, self.H, perm = self._apply_activation_ordering(W, self.H) # permute g_idx to maintain identity mapping after unpermutation g_idx = g_idx[perm] - scale = self.layer.weight_scale - zero_point = self.layer.weight_zero_point + else: + scale, zero_point = observer(W, g_idx=None) + else: + scale, zero_point = observer(W, g_idx=None) # sparsity mask sparsity = tensor_sparsity(W) @@ -160,13 +167,20 @@ def compress( Losses = torch.zeros(self.rows, 
device=self.dev) # compute inverse hessian in place to save memory - damp = percdamp * torch.mean(torch.diag(self.H)) - diag = torch.arange(self.columns, device=self.dev) - self.H[diag, diag] += damp - self.H = torch.linalg.cholesky(self.H) - self.H = torch.cholesky_inverse(self.H) - self.H = torch.linalg.cholesky(self.H, upper=True) - Hinv = self.H + try: + damp = percdamp * torch.mean(torch.diag(self.H)) + diag = torch.arange(self.columns, device=self.dev) + self.H[diag, diag] += damp + self.H = torch.linalg.cholesky(self.H) + self.H = torch.cholesky_inverse(self.H) + self.H = torch.linalg.cholesky(self.H, upper=True) + Hinv = self.H + except torch._C._LinAlgError: + raise ValueError( + "Failed to invert hessian due to numerical instability. Consider " + "increasing GPTQModifier.dampening_frac, increasing the number " + "of calibration samples, or shuffling the calibration dataset" + ) # See section 3.4 of https://arxiv.org/abs/2203.07259 for i1 in range(0, self.columns, blocksize): @@ -200,16 +214,28 @@ def compress( q, scale[:, 0], zero_point[:, 0], - weight_quant_args, + quant_args, ) elif strategy == QuantizationStrategy.GROUP: # get the group index for the current column column_idx = i1 + i group_index = g_idx[column_idx] + # update quantization parameters to reflect changes + # resulting from previous blocks + if ( + actorder != ActivationOrdering.WEIGHT + and column_idx % quant_args.group_size == 0 + ): + _scale, _zero_point = observer.get_qparams_along_dim( + W[:, g_idx == group_index], dim=0 + ) + scale[:, group_index] = _scale[:, 0] + zero_point[:, group_index] = _zero_point[:, 0] + # Since we're only applying quantization to a slice, this # ends up being a channelwise application - altered_qargs = copy(weight_quant_args) + altered_qargs = copy(quant_args) altered_qargs.strategy = QuantizationStrategy.CHANNEL q = fake_quantize( q, @@ -267,6 +293,9 @@ def compress( W.transpose_(0, 1) W = W.reshape(final_shape).to(final_dtype) + 
update_parameter_data(self.layer, scale, "weight_scale") + update_parameter_data(self.layer, zero_point, "weight_zero_point") + # This is a bit hacky, but FSDP updates only work if we change # the weight in place, clone() or direct assignment won't work self.layer.weight -= self.layer.weight @@ -284,18 +313,6 @@ def free(self): delattr(self, "H") super().free() - def _update_quantization_parameters(self, args: QuantizationArgs, W: torch.Tensor): - """ - Update layer quantization parameters with potentially permuted weight - - :param args: quantization arguments - :param W: weight to calculate quantization parameters from - """ - observer = args.get_observer() - _scale, _zero_point = observer(W, g_idx=None) - update_parameter_data(self.layer, _scale, "weight_scale") - update_parameter_data(self.layer, _zero_point, "weight_zero_point") - def _apply_activation_ordering( self, W: torch.Tensor, H: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -333,5 +350,5 @@ def _log_metrics(self, start_tick: float, losses: torch.Tensor): patch.log( "METRIC", - f"Compressed layer size: {get_layer_size_bytes(self.layer)} MB", + f"Compressed layer size: {get_layer_size_mb(self.layer)} MB", ) diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index 9cfda05cc..9b4516b52 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -6,22 +6,32 @@ QuantizationScheme, QuantizationStatus, apply_quantization_config, - freeze_module_quantization, + is_attention_module, is_preset_scheme, preset_name_to_scheme, - set_module_for_calibration, ) -from compressed_tensors.quantization.observers.helpers import get_observer_token_count from loguru import logger -from pydantic import Field +from pydantic import Field, field_validator from torch.nn import Module from llmcompressor.core import Event, EventType, 
State from llmcompressor.modifiers import Modifier +from llmcompressor.modifiers.quantization.calibration import ( + apply_calibration_status, + calibrate_input_hook, + calibrate_kv_cache_input_hook, + calibrate_kv_cache_output_hook, + calibrate_output_hook, + freeze_module_quantization, + initialize_observer, + set_unset_kv_cache, + update_weight_zp_scale, +) from llmcompressor.modifiers.utils.pytorch_helpers import ( is_moe_model, run_calibration_forward, ) +from llmcompressor.observers.helpers import get_observer_token_count __all__ = ["QuantizationModifier"] @@ -53,7 +63,8 @@ class QuantizationModifier(Modifier): There is an explicit assumption that the model contains modules with `k_proj` and `v_proj` in their names. If this is not the case and kv_cache_scheme != None, the quantization of kv cache will fail - :param targets: list of layer names to quantize if a scheme is provided + :param targets: list of layer names to quantize if a scheme is provided. Defaults + to Linear layers :param disable_quantization_observer_epoch: Epoch to disable updates to the module quantization observers. At this point, quantized weights and zero points will not be updated. Leave None to not disable observers during QAT. 
Default is None @@ -63,7 +74,7 @@ class QuantizationModifier(Modifier): config_groups: Optional[Dict[str, QuantizationScheme]] = None ignore: List[str] = Field(default_factory=list) - targets: Union[str, List[str], None] = None + targets: Union[str, List[str]] = Field(default_factory=lambda: ["Linear"]) scheme: Optional[Union[str, Dict[str, Any]]] = None kv_cache_scheme: Optional[QuantizationArgs] = None disable_quantization_observer_epoch: Optional[float] = None @@ -72,6 +83,13 @@ class QuantizationModifier(Modifier): calibration_dataloader_: Any = None calibration_function_: Any = None + @field_validator("targets", mode="before") + def validate_targets(cls, value: Union[str, List[str]]) -> List[str]: + if isinstance(value, str): + return [value] + + return value + def on_initialize(self, state: State, **kwargs) -> bool: if self.end and self.end != -1: raise ValueError( @@ -84,10 +102,12 @@ def on_initialize(self, state: State, **kwargs) -> bool: # initialize quantization in appropriate modules config = self._apply_modifier_to_model(module) + module.apply(lambda module: initialize_observer(module, base_name="weight")) if self.calculate_start() == -1: # one-shot self._check_calibration_data(config) - module.apply(set_module_for_calibration) + module.apply(update_weight_zp_scale) + module.apply(apply_calibration_status) self._calibrate_if_possible(module) self._check_token_distribution( module, threshold=kwargs.get("min_tokens_per_module") @@ -98,7 +118,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: def on_start(self, state: State, event: Event, **kwargs): module = state.model - module.apply(set_module_for_calibration) + module.apply(update_weight_zp_scale) def on_update(self, state: State, event: Event, **kwargs): if event.type_ == EventType.BATCH_START: @@ -111,9 +131,6 @@ def on_end(self, state: State, event: Event, **kwargs): module.apply(freeze_module_quantization) def create_init_config(self) -> QuantizationConfig: - if self.targets is not None 
and isinstance(self.targets, str): - self.targets = [self.targets] - if self.scheme is not None: # takes precedence over config_groups @@ -134,13 +151,10 @@ def create_init_config(self) -> QuantizationConfig: self.config_groups[group_name] = scheme if self.config_groups is None or len(self.config_groups) == 0: - default_quant_scheme = QuantizationScheme.default_scheme( - targets=self.targets - ) + default_quant_scheme = QuantizationScheme(targets=self.targets) self.config_groups = {"group_0": default_quant_scheme} logger.info( - "No config groups were provided, generating " - f"QuantizationScheme.default_scheme = {self.config_groups}" + f"No config groups were provided, using default {self.config_groups}" ) return QuantizationConfig( @@ -194,10 +208,42 @@ def _check_calibration_data(self, config: QuantizationConfig): def _apply_modifier_to_model(self, model: Module): modifier_as_config = self.create_init_config() + # Add step to attach kv_cache to the model, if present within the config apply_quantization_config(model, modifier_as_config) + model.apply(set_unset_kv_cache) return modifier_as_config def _calibrate_if_possible(self, module: Module): + # TODO: @dsikka restructure such that all of calibration isn't happening + # on init + # flake8: noqa + """# noqa: E501 + Run calibration if running input/output activation quantization or kv_cache + quantization. + + Calibration Lifecycle for a single torch.nn.Module: + + initialize_observer(): + if input/output activation: + - observer = Observer.load_from_registry(...) 
+ - module.register_module(f"{base_name}_observer", observer) + + register_calibration_hooks(): + if input activation and not dynamic quant (used to call observers before input QDQ): + - pre_hook := calibrate_input_hook + if output activation and not dynamic quant (used to call observers before output QDQ): + - post_hook := calibrate_output_hook + if kv_cache quantization (used to set kv_cache to QuantizedKVParameterCache and update k_scale/v_scale) + - pre_hook := calibrate_kv_cache_input_hook + - post_hook := calibrate_kv_cache_output_hook + + self._calibrate(module) # run forward pass through model using calibration data + set_unset_kv_cache() # remove kv_cache objects attached to attention layers + # initially set in _apply_modifier_to_model + remove calibration hooks via self.remove_hooks() + remove observers + + """ if self.num_calibration_steps == 0 and self.calibration_dataloader_: logger.warning( f"num_calibration_steps is {self.num_calibration_steps}." @@ -213,7 +259,48 @@ def _calibrate_if_possible(self, module: Module): elif not self.calibration_dataloader_: return + module.apply(lambda model: initialize_observer(model, base_name="input")) + module.apply(lambda model: initialize_observer(model, base_name="output")) + module.apply(self.register_calibration_hooks) self._calibrate(module) + module.apply(set_unset_kv_cache) + self.remove_hooks() + + def register_calibration_hooks(self, module: Module): + """ + Register hooks for input/output activation or kv_cache quantization.
+ """ + quantization_scheme = getattr(module, "quantization_scheme", None) + if not quantization_scheme: + return + + is_attention_module_ = is_attention_module(module) + input_quant = quantization_scheme.input_activations + output_quant = quantization_scheme.output_activations + + calibrate_inputs = ( + input_quant and not is_attention_module_ and not input_quant.dynamic + ) + + # Calibrate inputs if an input_quant is provided and not running dynamic quant + if calibrate_inputs: + self.register_hook(module, calibrate_input_hook, "forward_pre") + + if output_quant: + # hooks for attn modules if running kv_cache quant + if is_attention_module_: + self.register_hook( + module, + calibrate_kv_cache_input_hook, + "forward_pre", + with_kwargs=True, + ) + + self.register_hook(module, calibrate_kv_cache_output_hook, "forward") + + # hooks for output quant if not running dynamic quant + elif not output_quant.dynamic: + self.register_hook(module, calibrate_output_hook, "forward") def _calibrate(self, module: Module): class_name = self.__class__.__name__.replace("PyTorch", "") diff --git a/src/llmcompressor/modifiers/smoothquant/README.md b/src/llmcompressor/modifiers/smoothquant/README.md new file mode 100644 index 000000000..2014763a8 --- /dev/null +++ b/src/llmcompressor/modifiers/smoothquant/README.md @@ -0,0 +1,82 @@ +# SmoothQuant Modifier Mapping Tutorial + +In this tutorial, we'll cover how to specify the correct mappings for applying the SmoothQuant Modifier from the [LLM Compressor](https://github.com/vllm-project/llm-compressor) repository, based on the SmoothQuant paper [SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https://arxiv.org/abs/2211.10438). + +## Understanding the Mapping Format + +### Context +SmoothQuant leverages activation scaling to smooth out input activations, making quantization more efficient for large language models (LLMs). 
As mentioned in the SmoothQuant paper, "By default, we perform scale smoothing for the input activations of self-attention and feed-forward layers." + +This means that we need to smooth the inputs feeding into: +- The **q/k/v blocks** (query, key, value blocks of self-attention) +- The **fc1 block** (the fully connected block of the feed-forward layer) + +We can derive this by examining the diagram on page 5 of the SmoothQuant paper. It shows that smoothing should occur at specific points in the neural network architecture. + +### Layer Selection + +To get the correct input for smoothing: +1. For the **q/k/v blocks**, based on the SmoothQuant paper, we need to target the outputs of `input_layernorm`, as these provide the inputs for the self-attention mechanism. +2. For the **fc1 block**, based on the SmoothQuant paper, we need to target the outputs of `post_attention_layernorm`. + +### Why Target Leaf Modules? + +Based on the SmoothQuant paper, smoothing needs to be applied at the leaf nodes of the computational graph. This is why we see mappings such as: + +```python +[["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"] +``` + +Instead of targeting broader modules like `mlp`, we explicitly specify the lower-level projections (`gate_proj` and `up_proj`) and the `post_attention_layernorm` normalization. + +### The Mapping Format + +A mapping in SmoothQuant takes the form: + +```python +[[layers smoothed input activations pass into], output_to_smooth] +``` + +For example, in the default mapping: +```python +[["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"] +``` +This specifies that we want to smooth the inputs feeding into the projections (`gate_proj`, `up_proj`) by scaling the output of `post_attention_layernorm`. + +## Specifying Your Own Mappings + +To create your own mappings, follow these steps: + +1.
**Identify the layers you want to pass smoothed input activations into**: + You can find the exact names of these layers by exploring the relevant model file (e.g., `modeling_llama.py`). For example, you might target layers related to the self-attention or feed-forward blocks. + +2. **Match leaf modules**: + Ensure you're targeting leaf modules (i.e., the individual components of broader blocks, such as `gate_proj` and `up_proj` instead of a larger `mlp` module). + +3. **Specify the correct regular expressions**: + Use regular expressions to match the layers you want to target. For instance, if you want to target all projection layers across all attention heads, you could use a regex like `"re:.*proj"`. If you want to target a specific projection layer, make the regex more specific. + +### Example Custom Mapping + +Let's say you're working with a model with layers named similar to LLaMA, and you want to smooth the input activations of the self-attention layers and the feed-forward layers. Here is how you might specify the mapping: + +```python +mapping = [ + # Smooth the inputs going into the query, key, value projections of self-attention + [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"], + # Smooth the inputs going into the first feed-forward block (fc1) + [["re:.*fc1"], "re:.*post_attention_layernorm"] +] +``` + +In this mapping: +- We are targeting the `q_proj`, `k_proj`, and `v_proj` layers for smoothing by using the outputs of `input_layernorm`. +- We are targeting the `fc1` feed-forward block by using the outputs of `post_attention_layernorm`. + +This ensures that SmoothQuant modifies the correct activations, improving quantization efficiency while maintaining model accuracy. + +## Conclusion + +By understanding the structure of your model and specifying precise mappings, you can apply the SmoothQuant Modifier effectively. 
Use the diagram on page 5 of the [SmoothQuant paper](https://arxiv.org/pdf/2211.10438) and inspect your model's code to identify the correct layers and leaf modules to target for smoothing. + +Now that you know how to create these mappings, you can experiment with different model architectures and observe how SmoothQuant impacts performance and quantization accuracy. \ No newline at end of file diff --git a/src/llmcompressor/modifiers/smoothquant/base.py b/src/llmcompressor/modifiers/smoothquant/base.py index a5bcfcff4..f4117e31d 100644 --- a/src/llmcompressor/modifiers/smoothquant/base.py +++ b/src/llmcompressor/modifiers/smoothquant/base.py @@ -7,16 +7,16 @@ from llmcompressor.core import State from llmcompressor.modifiers import Modifier +from llmcompressor.modifiers.smoothquant.utils import ( + get_layer_mappings_from_architecture, + handle_mapping_resolution_errors, +) from llmcompressor.modifiers.utils.pytorch_helpers import run_calibration_forward from llmcompressor.utils.fsdp.helpers import get_fsdp_parent from llmcompressor.utils.pytorch.module import get_layers, get_matching_layer MINIMUM_SMOOTHING_SCALE = 1e-5 -DEFAULT_SMOOTHQUANT_MAPPINGS = [ - [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"], - [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"], -] __all__ = ["SmoothQuantScale", "SmoothQuantMapping", "SmoothQuantModifier"] @@ -81,8 +81,9 @@ class SmoothQuantModifier(Modifier): Each entry of the mapping list should be a list itself, in which the first entry is a list of layers who share the same input activation (the one to be to smoothed) and the second entry is the layer whose output is scaled to - achieve the smoothing. - If regex is used, it matches layers with the largest overlap in module name. + achieve the smoothing. If regex is used, it matches layers with the largest + overlap in module name. If not supplied the argument will be inferred from the + model architecture. 
:param ignore: list of layers to ignore, even if they match a regex in mappings. It should match the name of layers whose outputs are scaled to achieve smoothing (the second entry of the mappings list). @@ -93,12 +94,11 @@ class SmoothQuantModifier(Modifier): """ smoothing_strength: float = 0.5 - mappings: List[Tuple] = DEFAULT_SMOOTHQUANT_MAPPINGS + mappings: Optional[List[Tuple]] = None ignore: Optional[List[str]] = None num_calibration_steps: Optional[int] = None calibration_function: Optional[Callable] = None - hooks_: Optional[List] = None resolved_mappings_: Optional[List] = None scales_: Optional[Dict] = None @@ -121,11 +121,11 @@ def on_initialize(self, state: State, **kwargs) -> bool: ) self.ignore = [] if not self.ignore else self.ignore + self.mappings = self._infer_mappings_from_model(state.model) self.resolved_mappings_ = self._resolve_mappings(state.model) self.scales_ = {} calibration_dataloader = state.data.calib - self.hooks_ = [] self._setup_scale_hooks() self._calibrate(state.model, calibration_dataloader) @@ -147,6 +147,19 @@ def on_finalize(self, state: State, **kwargs) -> bool: return True + def _infer_mappings_from_model( + self, + model: Module, + ) -> List[Tuple]: + if self.mappings is not None: + return self.mappings + + logger.info("No SmoothQuantModifier.mappings provided, inferring from model...") + return get_layer_mappings_from_architecture( + architecture=model.__class__.__name__ + ) + + @handle_mapping_resolution_errors def _resolve_mappings(self, model: Module) -> List: """ Transforms the list of activations to smooth and their corresponding weights @@ -213,7 +226,7 @@ def hook_fn(module, inp, out): for mapping in self.resolved_mappings_: name = mapping.smooth_name layer = mapping.smooth_layer - self.hooks_.append(layer.register_forward_hook(create_hook_fn(name))) + self.register_hook(layer, create_hook_fn(name), "forward") @torch.no_grad() def _calibrate(self, model: Module, calibration_dataloader: List): @@ -240,9 +253,7 @@ def 
_calibrate(self, model: Module, calibration_dataloader: List): ) # remove the hooks now that we are done calibrating - for hook in self.hooks_: - hook.remove() - del self.hooks_ + self.remove_hooks() @torch.no_grad() def _apply_smoothing(self, model: Module): diff --git a/src/llmcompressor/modifiers/smoothquant/utils.py b/src/llmcompressor/modifiers/smoothquant/utils.py new file mode 100644 index 000000000..adf015632 --- /dev/null +++ b/src/llmcompressor/modifiers/smoothquant/utils.py @@ -0,0 +1,91 @@ +import functools +import pathlib +from collections import namedtuple +from typing import Dict, List, Tuple, Union + +from loguru import logger + +__all__ = [ + "get_layer_mappings_from_architecture", + "MAPPINGS_REGISTRY", + "DEFAULT_SMOOTHQUANT_MAPPINGS", +] + +LayerMapType = Tuple[Union[List[str], str], Union[List[str], str]] +LayerMap: LayerMapType = namedtuple("LayerMap", ["balance_layers", "smooth_layers"]) + +DEFAULT_SMOOTHQUANT_MAPPINGS: List[LayerMap] = [ + LayerMap( + balance_layers=["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], + smooth_layers="re:.*input_layernorm", + ), + LayerMap( + balance_layers=["re:.*gate_proj", "re:.*up_proj"], + smooth_layers="re:.*post_attention_layernorm", + ), +] +MIXTRAL_MAPPINGS: List[LayerMap] = [ + LayerMap( + balance_layers=["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], + smooth_layers="re:.*input_layernorm", + ), + LayerMap( + balance_layers=["re:.*gate"], smooth_layers="re:.*post_attention_layernorm" + ), +] +BLOOM_SMOOTHQUANT_MAPPINGS: List[LayerMap] = [ + LayerMap( + balance_layers=["re:.*query_key_value"], + smooth_layers="re:.*input_layernorm", + ), + LayerMap( + balance_layers=["re:.*dense_h_to_4h"], + smooth_layers="re:.*post_attention_layernorm", + ), +] + + +# Registry of layer mappings for different architectures +# Add more mappings here +MAPPINGS_REGISTRY: Dict[str, List[LayerMap]] = { + "LlamaForCausalLM": DEFAULT_SMOOTHQUANT_MAPPINGS, + "MixtralForCausalLM": MIXTRAL_MAPPINGS, + "MistralForCausalLM": 
DEFAULT_SMOOTHQUANT_MAPPINGS, + "Qwen2ForCausalLM": DEFAULT_SMOOTHQUANT_MAPPINGS, + "BloomForCausalLM": BLOOM_SMOOTHQUANT_MAPPINGS, +} + + +def get_layer_mappings_from_architecture(architecture: str) -> List[LayerMap]: + """ + :param architecture: str: The architecture of the model + :return: list: The layer mappings for the given architecture + """ + + if architecture not in MAPPINGS_REGISTRY: + logger.info( + f"Architecture {architecture} not found in mappings. " + f"Using default mappings: {DEFAULT_SMOOTHQUANT_MAPPINGS}" + ) + + return MAPPINGS_REGISTRY.get(architecture, DEFAULT_SMOOTHQUANT_MAPPINGS) + + +def handle_mapping_resolution_errors(func): + """ + Decorator to catch any errors that occur when resolving mappings and provide a + helpful error message to the user pointing them to the README + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as original_exception: + readme_location = pathlib.Path(__file__).parent / "README.md" + raise RuntimeError( + f"Error resolving mappings for given architecture." + f"Please refer to the README at {readme_location} for more information." + ) from original_exception + + return wrapper diff --git a/src/llmcompressor/modifiers/utils/hooks.py b/src/llmcompressor/modifiers/utils/hooks.py new file mode 100644 index 000000000..bb1755519 --- /dev/null +++ b/src/llmcompressor/modifiers/utils/hooks.py @@ -0,0 +1,83 @@ +import contextlib +from functools import wraps +from typing import Any, Callable, ClassVar, List, Union + +import torch +from loguru import logger +from pydantic import BaseModel +from torch.utils.hooks import RemovableHandle + +__all__ = ["HooksMixin"] + + +class HooksMixin(BaseModel): + """ + Mixin to manage hook registration, disabling, and removal. + Modifiers should use `self.register_hook(module, hook, hook_type)` + for hook registration and `self.remove_hooks()` for removal. 
+ + Modifiers which implement hooks should register them using + `self.register_..._hook(module, hook)` rather than the usual + `module.register_..._hook(hook)`. Modifiers should remove hooks with + `self.remove_hooks()`. + + Hooks can be applied to modules or parameters + + Lifecycle: + - modifier.register_forward_hook(module, hook) + - with HooksMixin.disable_hooks(): model.forward() + - modifier.remove_hooks() + """ + + _HOOKS_DISABLED: ClassVar[bool] = False # attached to global HooksMixin + _hooks: List[RemovableHandle] = [] # attached to local subclasses + + @classmethod + @contextlib.contextmanager + def disable_hooks(cls): + """Disable all hooks across all modifiers""" + try: + cls._HOOKS_DISABLED = True + yield + finally: + cls._HOOKS_DISABLED = False + + def register_hook( + self, + target: Union[torch.nn.Module, torch.nn.Parameter], + hook: Callable[[Any], Any], + hook_type: str, + **kwargs, + ) -> RemovableHandle: + """ + Registers a hook on a specified module/parameter with the option to disable it + with HooksMixin.disable_hooks() + + :param target: the module or parameter on which the hook should be registered + :param hook: the hook to register + :param hook_type: the type of hook to register corresponding to the + `register_{hook_type}_hook` attribute on torch.nn.Module. + Ex. 
"forward", "forward_pre", "full_backward", "state_dict_post", "" + :param kwargs: keyword arguments to pass to register hook method + """ + + @wraps(hook) + def wrapped_hook(*args, **kwargs): + if HooksMixin._HOOKS_DISABLED: + return + + return hook(*args, **kwargs) + + register_function = getattr(target, f"register_{hook_type}_hook") + handle = register_function(wrapped_hook, **kwargs) + self._hooks.append(handle) + logger.debug(f"{self} added {handle}") + + return handle + + def remove_hooks(self): + """Remove all hooks belonging to a modifier""" + for hook in self._hooks: + hook.remove() + + self._hooks = [] diff --git a/src/llmcompressor/observers/__init__.py b/src/llmcompressor/observers/__init__.py new file mode 100644 index 000000000..05b6b3675 --- /dev/null +++ b/src/llmcompressor/observers/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +# isort: skip_file + +from .helpers import * +from .base import * +from .min_max import * +from .mse import * diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py new file mode 100644 index 000000000..f09526781 --- /dev/null +++ b/src/llmcompressor/observers/base.py @@ -0,0 +1,207 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import ceil +from typing import Any, Iterable, Optional, Tuple, Union + +import torch +from compressed_tensors.quantization.quant_args import ( + QuantizationArgs, + QuantizationStrategy, +) +from compressed_tensors.registry.registry import RegistryMixin +from compressed_tensors.utils import safe_permute +from loguru import logger +from torch import FloatTensor, IntTensor, Tensor +from torch.nn import Module + +__all__ = ["Observer"] + + +class Observer(Module, RegistryMixin): + """ + Base Observer class to be subclassed for specific implementation. 
+ Subclasses should override `calculate_qparams` to return a scale, zero_point + pair + """ + + def __init__(self, quantization_args: QuantizationArgs): + self.quantization_args: QuantizationArgs = quantization_args + super().__init__() + self._scale = None + self._zero_point = None + self._num_observed_tokens = None + + @torch.no_grad() + def forward( + self, observed: Tensor, g_idx: Optional[Tensor] = None + ) -> Tuple[FloatTensor, IntTensor]: + """ + maps directly to get_qparams + :param observed: optional observed tensor from which to calculate + quantization parameters + :param g_idx: optional mapping from column index to group index + :return: tuple of scale and zero point based on last observed value + """ + self.record_observed_tokens(observed) + return self.get_qparams(observed=observed, g_idx=g_idx) + + def calculate_qparams( + self, + observed: Tensor, + reduce_dims: Optional[Tuple[int]] = None, + ) -> Tuple[FloatTensor, IntTensor]: + """ + :param observed: observed tensor to calculate quantization parameters for + :param reduce_dims: optional tuple of dimensions to reduce along, + returned scale and zero point will be shaped (1,) along the + reduced dimensions + :return: tuple of scale and zero point derived from the observed tensor + """ + raise NotImplementedError(f"{self.__class__} must implement calculate_qparams") + + def post_calculate_qparams(self) -> None: + """ + Run any logic specific to its observers after running calculate_qparams + """ + + def get_qparams( + self, + observed: Optional[Tensor] = None, + g_idx: Optional[Tensor] = None, + ) -> Tuple[FloatTensor, IntTensor]: + """ + Convenience function to wrap overwritten calculate_qparams + adds support to make observed tensor optional and support for tracking latest + calculated scale and zero point + + :param observed: optional observed tensor to calculate quantization parameters + from + :param g_idx: optional mapping from column index to group index + :return: tuple of scale and zero 
point based on last observed value + """ + if observed is not None: + group_size = self.quantization_args.group_size + + if self.quantization_args.strategy == QuantizationStrategy.TENSOR: + # re-calculate scale and zero point, update the stored value + self._scale, self._zero_point = self.calculate_qparams(observed) + + elif self.quantization_args.strategy == QuantizationStrategy.GROUP: + rows = observed.shape[0] + columns = observed.shape[1] + num_groups = int(ceil(columns / group_size)) + self._scale = torch.empty( + (rows, num_groups), dtype=observed.dtype, device=observed.device + ) + zp_dtype = self.quantization_args.pytorch_dtype() + self._zero_point = torch.empty( + (rows, num_groups), dtype=zp_dtype, device=observed.device + ) + + # support column-order (default) quantization as well as other orderings + # such as activation ordering. Below checks if g_idx has been initialized + is_column_order = g_idx is None or -1 in g_idx + if is_column_order: + group_sizes = torch.full((num_groups,), group_size, dtype=torch.int) + else: + group_indices, group_sizes = torch.unique(g_idx, return_counts=True) + group_sizes = group_sizes[torch.argsort(group_indices)] + + perm = torch.argsort(g_idx) + observed = safe_permute(observed, perm, dim=1) + + # TODO: experiment with vectorizing for loop for performance + end = 0 + for group_index, group_count in enumerate(group_sizes): + start = end + end = start + group_count + scale, zero_point = self.get_qparams_along_dim( + observed[:, start:end], + 0, + tensor_id=group_index, + ) + + self._scale[:, group_index] = scale.squeeze(1) + self._zero_point[:, group_index] = zero_point.squeeze(1) + + elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL: + # assume observed is transposed, because it's the output, hence use dim 0 + self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0) + + elif self.quantization_args.strategy == QuantizationStrategy.TOKEN: + # use dim 1, assume the observed.shape = [batch,
token, hidden] + # should be batch, token + self._scale, self._zero_point = self.get_qparams_along_dim( + observed, + dim={0, 1}, + ) + + return self._scale, self._zero_point + + def get_qparams_along_dim( + self, + observed, + dim: Union[int, Iterable[int]], + tensor_id: Optional[Any] = None, + ): + if isinstance(dim, int): + dim = [dim] + dim = set(dim) + + reduce_dims = tuple(idx for idx in range(observed.ndim) if idx not in dim) + return self.calculate_qparams( + observed, reduce_dims=reduce_dims, tensor_id=tensor_id + ) + + def record_observed_tokens(self, batch_tensor: Tensor): + """ + Counts the number of tokens observed during the + forward passes. The count is aggregated in the + _num_observed_tokens attribute of the class. + + Note: The batch_tensor is expected to have two dimensions + (batch_size * sequence_length, num_features). This is the + general shape expected by the forward pass of the expert + layers in a MOE model. If the input tensor does not have + two dimensions, it is skipped and _num_observed_tokens is left + unchanged. + """ + if not isinstance(batch_tensor, Tensor): + raise ValueError(f"Expected value to be a tensor, got {type(batch_tensor)}") + + if batch_tensor.ndim != 2: + logger.debug( + "The input tensor is expected to have two dimensions " + "(batch_size * sequence_length, num_features). " + f"The input tensor has {batch_tensor.ndim} dimensions."
+ ) + return + + if self._num_observed_tokens is None: + # initialize the count + self._num_observed_tokens = 0 + + # batch_tensor (batch_size * sequence_length, num_features) + # observed_tokens (batch_size * sequence_length) + observed_tokens, _ = batch_tensor.shape + self._num_observed_tokens += observed_tokens + + def reset(self): + """ + Reset the state of the observer + """ + self._num_observed_tokens = None + self._scale = None + self._zero_point = None diff --git a/src/llmcompressor/observers/helpers.py b/src/llmcompressor/observers/helpers.py new file mode 100644 index 000000000..79d6fb819 --- /dev/null +++ b/src/llmcompressor/observers/helpers.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import Counter + +import torch + +__all__ = ["get_observer_token_count"] + + +def get_observer_token_count(module: torch.nn.Module) -> Counter: + """ + Parse the module and return the number of tokens observed by + each module's observer. 
+ + :param module: module to parse + :return: counter with the number of tokens observed by each observer + """ + token_counts = Counter() + for name, module in module.named_modules(): + if name.endswith(".input_observer"): + token_counts[name.replace(".input_observer", "")] = ( + module._num_observed_tokens + ) + return token_counts diff --git a/src/llmcompressor/observers/min_max.py b/src/llmcompressor/observers/min_max.py new file mode 100644 index 000000000..5c59a6573 --- /dev/null +++ b/src/llmcompressor/observers/min_max.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@Observer.register("minmax")
class MovingAverageMinMaxObserver(Observer):
    """
    Dynamic quantization observer that keeps a moving average of the smallest
    and largest observed values and derives the scale and zero point from it
    """

    def __init__(
        self, quantization_args: QuantizationArgs, averaging_constant: float = 0.01
    ):
        super().__init__(quantization_args=quantization_args)

        self.averaging_constant = averaging_constant
        # running minimum / maximum, one entry per tensor_id
        self.min_val = {}
        self.max_val = {}

    def calculate_qparams(
        self,
        observed: Tensor,
        reduce_dims: Optional[Tuple[int]] = None,
        tensor_id: Optional[Any] = None,
    ) -> Tuple[FloatTensor, IntTensor]:
        """
        Fold the min and max of ``observed`` into the running values stored for
        ``tensor_id`` and compute quantization parameters from the result

        :param observed: observed tensor to calculate quantization parameters for
        :param reduce_dims: optional tuple of dimensions to reduce along; when
            given, min and max are taken with ``keepdims=True`` over those dims,
            otherwise over the whole tensor
        :param tensor_id: optional key selecting which running min/max to update;
            a falsy value selects the ``"default"`` entry
        :return: tuple of scale and zero point derived from the updated range
        """
        tensor_id = tensor_id or "default"

        if reduce_dims:
            batch_min = torch.amin(observed, dim=reduce_dims, keepdims=True)
            batch_max = torch.amax(observed, dim=reduce_dims, keepdims=True)
        else:
            batch_min, batch_max = torch.aminmax(observed)

        previous_min = self.min_val.get(tensor_id)
        previous_max = self.max_val.get(tensor_id)
        first_observation = previous_min is None or previous_max is None

        if first_observation:
            # nothing to average against yet: adopt the observed range as-is
            new_min, new_max = batch_min, batch_max
        else:
            step = self.averaging_constant
            new_min = previous_min + step * (batch_min - previous_min)
            new_max = previous_max + step * (batch_max - previous_max)

        self.min_val[tensor_id] = new_min
        self.max_val[tensor_id] = new_max

        return calculate_qparams(new_min, new_max, self.quantization_args)

    def get_qparams_along_dim(
        self, observed, dim: int, tensor_id: Optional[Any] = None
    ):
        """
        Compute quantization parameters while reducing over every dimension of
        ``observed`` whose index differs from ``dim``
        """
        other_dims = tuple(axis for axis in range(observed.ndim) if axis != dim)
        return self.calculate_qparams(
            observed, reduce_dims=other_dims, tensor_id=tensor_id
        )

    def reset(self):
        """
        Reset the base observer state and drop all running min / max values
        """
        super().reset()
        self.min_val = {}
        self.max_val = {}
@Observer.register("mse")
class MovingAverageMSEObserver(Observer):
    """
    Implements a dynamic quantization observer that sets the scale and
    zero point based on a moving average of the mse-clipped min and max observed values
    """

    def __init__(
        self,
        quantization_args: QuantizationArgs,
        averaging_constant: float = 0.01,
        grid: float = 100.0,
        maxshrink: float = 0.80,
        norm: float = 2.4,
    ):
        """
        :param quantization_args: quantization settings passed to the base
            observer and to every qparams computation
        :param averaging_constant: weight of the newest min / max when the
            running values are updated
        :param grid: denominator of the shrink step; candidate ``i`` scales the
            observed range by ``1 - i / grid``
        :param maxshrink: together with ``grid`` fixes how many candidates are
            tried, namely ``int(maxshrink * grid)``
        :param norm: exponent applied to the absolute quantization error
        """
        super().__init__(quantization_args=quantization_args)

        # running minimum / maximum, one entry per tensor_id
        self.min_val = {}
        self.max_val = {}
        self.averaging_constant = averaging_constant
        self.grid = grid
        self.maxshrink = maxshrink
        self.norm = norm

    def calculate_mse_min_max(
        self,
        observed: Tensor,
        reduce_dims: Optional[Tuple[int]] = None,
    ):
        """
        Computes the mse-clipped min and max values of the observed tensor by
        optimizing for quantization error

        :param observed: observed tensor to calculate quantization parameters for
        :param reduce_dims: optional tuple of dimensions to reduce along,
            returned values will be shaped (1,) along the reduced dimensions
        :return: tuple of min and max values derived from the observed tensor
        """
        from compressed_tensors.quantization.lifecycle import fake_quantize

        if not reduce_dims:
            absolute_min_val, absolute_max_val = torch.aminmax(observed)
        else:
            absolute_min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
            absolute_max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)

        # best error seen so far, initialised to the largest finite value so the
        # first candidate always wins
        best = torch.full_like(
            absolute_min_val, torch.finfo(absolute_min_val.dtype).max
        )
        min_val = torch.ones_like(absolute_min_val)
        max_val = torch.zeros_like(absolute_max_val)
        for i in range(int(self.maxshrink * self.grid)):
            # shrink factor: 1 for the first candidate, then smaller in 1/grid steps
            p = 1 - i / self.grid
            shrinked_min_val = p * absolute_min_val
            shrinked_max_val = p * absolute_max_val

            candidate_scales, candidate_zero_points = calculate_qparams(
                shrinked_min_val, shrinked_max_val, self.quantization_args
            )
            q = fake_quantize(
                observed,
                candidate_scales,
                candidate_zero_points,
                self.quantization_args,
            )

            # |q - observed| ** norm, computed in place on the temporary
            q -= observed
            q.abs_()
            q.pow_(self.norm)
            if not reduce_dims:
                err = torch.sum(q)
            else:
                err = torch.sum(q, reduce_dims, keepdims=True)

            # keep the clipped range wherever this candidate lowers the error
            tmp = err < best
            if torch.any(tmp):
                best[tmp] = err[tmp]
                min_val[tmp] = shrinked_min_val[tmp]
                max_val[tmp] = shrinked_max_val[tmp]
        return min_val, max_val

    def calculate_qparams(
        self,
        observed: Tensor,
        reduce_dims: Optional[Tuple[int]] = None,
        tensor_id: Optional[Any] = None,
    ) -> Tuple[FloatTensor, IntTensor]:
        """
        Updates the mse-clipped min and max values of the observed tensor using
        a moving average smoothed by the averaging_constant

        :param observed: observed tensor to calculate quantization parameters for
        :param reduce_dims: optional tuple of dimensions to reduce along,
            returned scale and zero point will be shaped (1,) along the
            reduced dimensions
        :param tensor_id: Optional id if different ranges of observed tensors are
            passed, useful for sharding tensors by group_size; a falsy value
            selects the ``"default"`` entry
        :return: tuple of scale and zero point derived from the observed tensor
        """
        # Resolve the storage key BEFORE reading the running values. The values
        # are written under "default" when no id is given, so looking them up
        # under the raw (None) id would never find them and the moving average
        # would silently never be applied.
        tensor_id = tensor_id or "default"

        min_val, max_val = self.calculate_mse_min_max(observed, reduce_dims)

        running_min_val = self.min_val.get(tensor_id, None)
        running_max_val = self.max_val.get(tensor_id, None)

        if running_min_val is None or running_max_val is None:
            # first observation for this id: nothing to average against
            updated_min_val = min_val
            updated_max_val = max_val
        else:
            updated_min_val = running_min_val + self.averaging_constant * (
                min_val - running_min_val
            )
            updated_max_val = running_max_val + self.averaging_constant * (
                max_val - running_max_val
            )

        self.min_val[tensor_id] = updated_min_val
        self.max_val[tensor_id] = updated_max_val

        return calculate_qparams(
            updated_min_val, updated_max_val, self.quantization_args
        )

    def get_qparams_along_dim(
        self, observed, dim: int, tensor_id: Optional[Any] = None
    ):
        """
        Compute quantization parameters while reducing over every dimension of
        ``observed`` whose index differs from ``dim``
        """
        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
        return self.calculate_qparams(
            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
        )

    def reset(self):
        """
        Reset the state of the observer, including min and maximum values
        """
        super().reset()
        self.min_val = {}
        self.max_val = {}
def parse_dtype(dtype_arg: Union[str, torch.dtype]) -> torch.dtype:
    """
    Map a precision name (or a torch dtype, via its string form) to a torch dtype.

    :param dtype_arg: dtype or string to parse
    :return: torch.float16, torch.bfloat16 or torch.float32 for the recognised
        spellings; the string "auto" for anything else, so that the precision
        is taken from the model
    """
    known_dtypes = {
        "half": torch.float16,
        "float16": torch.float16,
        "torch.float16": torch.float16,
        "torch.bfloat16": torch.bfloat16,
        "bfloat16": torch.bfloat16,
        "full": torch.float32,
        "float32": torch.float32,
        "torch.float32": torch.float32,
    }
    # str() lets torch.dtype inputs hit the "torch.<name>" keys
    return known_dtypes.get(str(dtype_arg), "auto")
user_agent=user_agent, + ) + cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) + + for file in os.listdir(cache_path): + full_file_name = os.path.join(cache_path, file) if file.endswith(".py") and os.path.isfile(full_file_name): logger.debug(f"Transferring {full_file_name} to {save_path}") shutil.copy(full_file_name, save_path) diff --git a/src/llmcompressor/recipe/recipe.py b/src/llmcompressor/recipe/recipe.py index cffe75ce2..1e9851ba8 100644 --- a/src/llmcompressor/recipe/recipe.py +++ b/src/llmcompressor/recipe/recipe.py @@ -73,7 +73,7 @@ def from_modifiers( @classmethod def create_instance( cls, - path_or_modifiers: Union[str, Modifier, List[Modifier]], + path_or_modifiers: Union[str, Modifier, List[Modifier], "Recipe"], modifier_group_name: Optional[str] = None, ) -> "Recipe": """ @@ -574,7 +574,7 @@ def _get_yaml_dict(self) -> Dict[str, Any]: @dataclass class RecipeTuple: """ - A simple dataclass to hold a recipe, it's target_stages, and override_args + A simple dataclass to hold a recipe, its target_stages, and override_args :param recipe: The Recipe instance to hold :param target_stages: The stages to target when simplifying the recipe diff --git a/src/llmcompressor/transformers/__init__.py b/src/llmcompressor/transformers/__init__.py index 2c4b6588a..2becd67f1 100644 --- a/src/llmcompressor/transformers/__init__.py +++ b/src/llmcompressor/transformers/__init__.py @@ -7,5 +7,8 @@ # isort: skip_file # (import order matters for circular import avoidance) from .utils import * -from .sparsification import SparseAutoModel, SparseAutoModelForCausalLM, wrap_hf_model_class + +from .sparsification import ( + SparseAutoModelForCausalLM, +) from .finetune import * diff --git a/src/llmcompressor/transformers/compression/helpers.py b/src/llmcompressor/transformers/compression/helpers.py index 845f04c4a..7c839c5a7 100644 --- a/src/llmcompressor/transformers/compression/helpers.py +++ b/src/llmcompressor/transformers/compression/helpers.py @@ -5,6 
+5,7 @@ import torch from accelerate import infer_auto_device_map, init_empty_weights from accelerate.accelerator import get_state_dict_offloaded_model +from compressed_tensors import is_module_offloaded from compressed_tensors.quantization.utils import iter_named_leaf_modules, module_type from torch.nn.modules import Linear from tqdm import tqdm @@ -125,14 +126,13 @@ def hessian_memory_requirements(model: torch.nn.Module) -> int: for no_split_name, no_split_layer in transformer_layers.items(): total_hessian_elems[no_split_name] = 0 max_column_size[no_split_name] = 0 - for name, module in no_split_layer.named_modules(): - if isinstance(module, Linear): - for param in module.parameters(): - column_size = param.shape[1] - total_hessian_elems[no_split_name] += column_size * column_size - if column_size > max_column_size[no_split_name]: - # max extra memory for inverse calculation - max_column_size[no_split_name] = column_size + for _name, module in no_split_layer.named_modules(): + if isinstance(module, Linear) and hasattr(module, "weight"): + column_size = module.weight.shape[1] + total_hessian_elems[no_split_name] += column_size * column_size + if column_size > max_column_size[no_split_name]: + # max extra memory for inverse calculation + max_column_size[no_split_name] = column_size max_total_hessian_elems = max(total_hessian_elems.values()) overall_max_column_size = max(max_column_size.values()) @@ -292,12 +292,21 @@ def is_sparse_compression_target( :return: whether or not the module is a target for sparsity compression, i.e True if it is sparse and follows the sparsity structure, else False """ - return ( + offloaded = is_module_offloaded(module) + if offloaded: + module._hf_hook.pre_forward(module) + + result = ( hasattr(module, "weight") and tensor_sparsity(module.weight) >= sparsity_threshold and tensor_follows_mask_structure(tensor=module.weight, mask=sparsity_structure) ) + if offloaded: + module._hf_hook.post_forward(module, None) + + return result + def 
_get_sparse_targets_ignore_dicts( module: torch.nn.Module, sparsity_structure: str, sparsity_threshold: float diff --git a/src/llmcompressor/transformers/compression/quantization_format.py b/src/llmcompressor/transformers/compression/quantization_format.py index 17f9400cf..e6f14a6c7 100644 --- a/src/llmcompressor/transformers/compression/quantization_format.py +++ b/src/llmcompressor/transformers/compression/quantization_format.py @@ -42,10 +42,10 @@ def infer_quantization_format( is_weight_only = len(input_args) == 0 and len(weight_args) > 0 if is_weight_only: # w4a16 and w8a16 - is_valid_pack = ( - len(weight_args) == 1 - and weight_args[0].num_bits in [4, 8] - and weight_args[0].type == QuantizationType.INT.value + is_valid_pack = all( + weight_arg.num_bits in [4, 8] + and weight_arg.type == QuantizationType.INT.value + for weight_arg in weight_args ) if not is_valid_pack: # packing only valid for int4 and int 8 return CompressionFormat.naive_quantized diff --git a/src/llmcompressor/transformers/finetune/README.md b/src/llmcompressor/transformers/finetune/README.md index 7384b077b..27a0bd3ae 100644 --- a/src/llmcompressor/transformers/finetune/README.md +++ b/src/llmcompressor/transformers/finetune/README.md @@ -101,7 +101,7 @@ accelerate launch ```python from llmcompressor.transformers import oneshot -model = "Xenova/llama2.c-stories15M" +model ="Xenova/llama2.c-stories15M" dataset_name = "open_platypus" concatenate_data = False pad_to_max_length = False @@ -119,7 +119,6 @@ oneshot( output_dir=output_dir, recipe=recipe, overwrite_output_dir=overwrite_output_dir, - concatenate_data = concatenate_data, pad_to_max_length = pad_to_max_length, splits = splits ) @@ -141,8 +140,10 @@ of a staged recipe for Llama. 
test_multi.py ```python from llmcompressor.transformers import apply +from transformers import AutoModelForCausalLM model = "../ml-experiments/nlg-text_generation/llama_pretrain-llama_7b-base/dense/training" + dataset_name = "open_platypus" concatenate_data = False run_stages=True @@ -167,4 +168,5 @@ apply( remove_unused_columns = False, splits = splits ) + ``` \ No newline at end of file diff --git a/src/llmcompressor/transformers/finetune/runner.py b/src/llmcompressor/transformers/finetune/runner.py index c1af2fe6c..6344b1a2b 100644 --- a/src/llmcompressor/transformers/finetune/runner.py +++ b/src/llmcompressor/transformers/finetune/runner.py @@ -13,7 +13,6 @@ get_completed_stages, get_session_model, save_completed_stages, - save_model_and_recipe, ) from llmcompressor.pytorch.utils import tensors_to_device from llmcompressor.recipe import Recipe, StageRunType @@ -25,11 +24,7 @@ ) from llmcompressor.transformers.finetune.model_args import ModelArguments from llmcompressor.transformers.finetune.training_args import TrainingArguments -from llmcompressor.utils.fsdp.helpers import ( - find_and_move_state_dicts_to_cpu, - is_fsdp_model, - unwrap_and_export_model, -) +from llmcompressor.utils.fsdp.helpers import is_fsdp_model, save_model_and_recipe class StageRunner: @@ -170,35 +165,6 @@ def one_shot(self, stage: Optional[str] = None): self.trainer.one_shot(calibration_data=calib_data, stage=stage) - if is_fsdp_model(self.trainer.model): - try: - self.trainer.save_model(output_dir=self._output_dir, _is_oneshot=True) - except AssertionError: - # fallback to this in the case of quantization - unwrap_and_export_model( - model=self.trainer.model, - accelerator=self.trainer.accelerator, - output_dir=self._output_dir, - tokenizer=self.tokenizer, - ) - # only allow the main process move the state - # dicts to cpu - if self.trainer.accelerator.is_main_process: - # assuming quantization is the last step - # we no longer need the original model - # and can safely delete it to save 
memory - del self.trainer.model - find_and_move_state_dicts_to_cpu(self._output_dir) - - else: - save_model_and_recipe( - model=self.trainer.model, - save_path=self._output_dir, - tokenizer=self.tokenizer, - save_safetensors=self._training_args.save_safetensors, - save_compressed=self._training_args.save_compressed, - ) - def train(self, checkpoint: str, stage: Optional[str] = None): """ Run trainer's training loop on train_dataset, saving the resulting model to @@ -293,7 +259,19 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None): self.train(checkpoint=checkpoint, stage=stage_name) checkpoint = None - # save stage stage to checkpoint dir + if ( + self._training_args.output_dir + != TrainingArguments.__dataclass_fields__["output_dir"].default + ): + save_model_and_recipe( + model=self.trainer.model, + save_path=self._output_dir, + tokenizer=self.tokenizer, + save_safetensors=self._training_args.save_safetensors, + save_compressed=self._training_args.save_compressed, + ) + + # save stage to checkpoint dir if self.trainer.accelerator.is_main_process: completed_stages.append(stage_name) save_completed_stages(self._output_dir, completed_stages) diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index 8804bb475..b1ac57b95 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -2,7 +2,7 @@ import math import os from dataclasses import asdict -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch from loguru import logger @@ -21,8 +21,12 @@ pre_initialize_structure, ) from llmcompressor.metrics import LoggerManager -from llmcompressor.pytorch.model_load.helpers import RECIPE_FILE_NAME, get_session_model +from llmcompressor.modifiers.distillation.utils.pytorch.model_wrapper import ( + KDModelWrapper, +) +from 
llmcompressor.pytorch.model_load.helpers import get_session_model from llmcompressor.pytorch.utils import ModuleSparsificationInfo +from llmcompressor.transformers import RECIPE_FILE_NAME from llmcompressor.transformers.finetune.callbacks import ( DisableHalfPrecisionCallback, TrainingLoopCallbacks, @@ -31,6 +35,10 @@ from llmcompressor.utils.fsdp.helpers import is_fsdp_model, save_pretrained_fsdp from llmcompressor.utils.pytorch import qat_active +if TYPE_CHECKING: + from llmcompressor.transformers import DataTrainingArguments + + __all__ = [ "SessionManagerMixIn", ] @@ -60,7 +68,7 @@ def __init__( self, recipe: Optional[str] = None, recipe_args: Optional[Union[Dict[str, Any], str]] = None, - data_args: Optional["DataTrainingArguments"] = None, # noqa: F821 + data_args: Optional["DataTrainingArguments"] = None, teacher: Optional[Union[Module, str]] = None, **kwargs, ): @@ -242,10 +250,15 @@ def create_scheduler( # TODO: we don't currently have a LR scheduler in the new modifier framework self._check_super_defined("create_scheduler") - return super().create_scheduler(num_training_steps, optimizer) + return super().create_scheduler( + num_training_steps=num_training_steps, optimizer=optimizer + ) def training_step( - self, model: Module, inputs: Dict[str, Union[torch.Tensor, Any]] + self, + model: torch.nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + num_items_in_batch: Optional[int] = None, ) -> torch.Tensor: """ Overrides the Trainer's training step to trigger the batch_start callback to @@ -258,12 +271,18 @@ def training_step( self._check_super_defined("training_step") callbacks.batch_start(batch_data=inputs) - model_outputs = super().training_step(model, inputs) + model_outputs = super().training_step( + model=model, inputs=inputs, num_items_in_batch=num_items_in_batch + ) return model_outputs def compute_loss( - self, model: Module, inputs: Dict[str, Any], return_outputs: bool = False + self, + model: Module, + inputs: Dict[str, Any], + 
return_outputs: bool = False, + num_items_in_batch: Optional[int] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, Any]]: """ Override for the compute_loss to factor trigger callbacks and filter columns @@ -279,7 +298,12 @@ def compute_loss( # TODO: do we need these model signature columns? inputs = {k: inputs[k] for k in inputs if k in self._signature_columns} - loss = super().compute_loss(model, inputs, return_outputs=return_outputs) + loss = super().compute_loss( + model=model, + inputs=inputs, + return_outputs=return_outputs, + num_items_in_batch=num_items_in_batch, + ) # take the mean across multiple GPUs # this is done outside the compute_loss function in the parent, replicating it @@ -325,7 +349,10 @@ def prediction_step( inputs = {k: inputs[k] for k in inputs if k in self._model_signature_columns} model_outputs = super().prediction_step( - model, inputs, prediction_loss_only, ignore_keys + model=model, + inputs=inputs, + prediction_loss_only=prediction_loss_only, + ignore_keys=ignore_keys, ) return model_outputs @@ -341,13 +368,25 @@ def train(self, *args, stage: Optional[str] = None, **kwargs): :param kwargs: keyword args to pass to super().train() :return: the output from super.train() """ + + # lifecycle checkpoint, epoch = self._calculate_checkpoint_info(kwargs) self.initialize_session(epoch=epoch, checkpoint=checkpoint, stage=stage) + + # do not save checkpoints as compressed + original_save_compressed = self.args.save_compressed + self.args.save_compressed = False + + # train with accelerator self.accelerator.wait_for_everyone() output = super().train(*args, **kwargs) self.accelerator.wait_for_everyone() - self.finalize_session() + # restore original setting for saving final model + self.args.save_compressed = original_save_compressed + + # lifecycle + self.finalize_session() self.accelerator.wait_for_everyone() # log model sparsity @@ -414,9 +453,7 @@ def one_shot( # self.maybe_log_model_sparsification() self.accelerator.wait_for_everyone() - def 
save_model( - self, output_dir: Optional[str] = None, _internal_call=False, _is_oneshot=False - ): + def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False): """ Override of the save_model function and expects it to exist in the parent. Calls into super() to save the model and additionally saves any recipes @@ -430,6 +467,10 @@ def save_model( if output_dir is None: output_dir = self.args.output_dir + # knowledge distillation requires making wrappers transparent during + if isinstance(self.model, KDModelWrapper): + self.model.prepare_for_save() + if not is_fsdp_model(self.model): self.model.save_pretrained( output_dir, @@ -467,6 +508,9 @@ def save_model( self.accelerator.wait_for_everyone() + if isinstance(self.model, KDModelWrapper): + self.model.finish_save() + def maybe_log_model_sparsification(self): """ Log info on model sparsity and quantization if possible. Only print logs on the diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index e99d67429..85aa6d82c 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -23,6 +23,7 @@ from loguru import logger from transformers import ( AutoConfig, + AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator, HfArgumentParser, @@ -42,11 +43,16 @@ from llmcompressor.transformers.finetune.runner import StageRunner from llmcompressor.transformers.finetune.trainer import Trainer from llmcompressor.transformers.finetune.training_args import TrainingArguments +from llmcompressor.transformers.sparsification.compressed_tensors_utils import ( + modify_fsdp_model_save_pretrained, + modify_save_pretrained, + patch_tied_tensors_bug, +) from llmcompressor.transformers.sparsification.sparse_model import ( - SparseAutoModel, get_shared_tokenizer_src, ) from llmcompressor.transformers.utils.helpers import detect_last_checkpoint +from 
llmcompressor.utils.fsdp.helpers import is_fsdp_model def train(**kwargs): @@ -199,21 +205,23 @@ def initialize_model_from_path( "trust_remote_code": model_args.trust_remote_code_model, } # this calls from_pretrained under the hood so should be FSDP safe - model = SparseAutoModel.text_generation_from_pretrained( - model_name_or_path=model_path, - sequence_length=None, # use model default + model = AutoModelForCausalLM.from_pretrained( + model_path, **model_kwargs, ) + if "sequence_length" in model_kwargs: + model.seqlen = model_kwargs["sequence_length"] teacher = ( - SparseAutoModel.text_generation_from_pretrained( - model_name_or_path=model_args.distill_teacher, - sequence_length=None, # use model default + AutoModelForCausalLM.from_pretrained( + model_args.distill_teacher, **teacher_kwargs, ) if model_args.distill_teacher is not None else None ) + if teacher is not None and "sequence_length" in teacher_kwargs: + teacher.seqlen = teacher_kwargs["sequence_length"] return teacher, model_path, model @@ -291,18 +299,21 @@ def main( # Detecting last checkpoint. last_checkpoint = None teacher = model_args.distill_teacher - model_path = None model = model_args.model # Load tokenizer # distill TODO: support for different tokenizer for teacher? 
tokenizer = model_args.tokenizer if isinstance(model, str) or isinstance(model, PosixPath): - (teacher, model_path, model) = initialize_model_from_path( + (teacher, _model_path, model) = initialize_model_from_path( model_args, training_args, ) + # patch a shared tensor bug in HF transformers + # https://github.com/huggingface/transformers/issues/33689 + patch_tied_tensors_bug(model) + if teacher is not None: teacher.eval() @@ -338,6 +349,13 @@ def main( tokenizer=tokenizer, data_collator=data_collator, ) + + # wrap model.save_pretrained + if is_fsdp_model(model): + modify_fsdp_model_save_pretrained(trainer, tokenizer) + else: + modify_save_pretrained(model) + stage_runner.trainer = trainer # alternating Training/One-shot @@ -349,7 +367,6 @@ def main( # exit immediately return - # Training if training_args.do_train: checkpoint = None @@ -371,6 +388,17 @@ def main( if training_args.do_predict: stage_runner.predict() + # save if model was provided as a string or custom output_dir was set + if isinstance(model_args.model, str) or ( + training_args.output_dir + != TrainingArguments.__dataclass_fields__["output_dir"].default + ): + model.save_pretrained( + training_args.output_dir, save_compressed=training_args.save_compressed + ) + if tokenizer is not None: + tokenizer.save_pretrained(training_args.output_dir) + # Clean up the CompressionSession before exit if requested if training_args.clear_sparse_session: reset_session() diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py index 262cf9390..759098894 100644 --- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py @@ -1,26 +1,104 @@ +import os import re import weakref from functools import wraps -from typing import Optional +from typing import Dict, Optional import torch import transformers from 
def modify_fsdp_model_save_pretrained(trainer, tokenizer):
    """
    Overrides a PreTrainedModel's save_pretrained() method with a wrapped version that
    supports compression for fsdp model

    :param trainer: trainer whose ``model.save_pretrained`` is replaced; its
        ``save_model`` and ``accelerator`` are used when the wrapper is called
    :param tokenizer: tokenizer handed to the export fallback
    """

    def save_pretrained_compressed(save_pretrained_method):
        if getattr(save_pretrained_method, "_overridden", False):
            # `model.save_pretrained` has already been replaced, return.
            return save_pretrained_method

        # Keep the unbound save_pretrained function so the wrapper can take
        # over its metadata
        original_save_pretrained = save_pretrained_method.__func__
        del save_pretrained_method

        @wraps(original_save_pretrained)
        def save_pretrained_wrapper(
            save_directory: str,
            **kwargs,
        ):
            """
            Replacement for model.save_pretrained() on an fsdp model: saving is
            delegated to the trainer instead of the original method

            :param save_directory: output directory to save model to
            :param kwargs: accepted so existing save_pretrained call sites keep
                working; they are not forwarded by this wrapper
            """
            try:
                trainer.save_model(output_dir=save_directory, _is_oneshot=True)
            except AssertionError:
                # fallback to this in the case of quantization
                unwrap_and_export_model(
                    model=trainer.model,
                    accelerator=trainer.accelerator,
                    output_dir=save_directory,
                    tokenizer=tokenizer,
                )
                # only allow the main process move the state
                # dicts to cpu
                if trainer.accelerator.is_main_process:
                    # assuming quantization is the last step
                    # we no longer need the original model
                    # and can safely delete it to save memory
                    del trainer.model
                    find_and_move_state_dicts_to_cpu(save_directory)

        # Must match the attribute name read by the guard above; the wrapper is
        # a plain function without `__func__`, so a missed guard would raise
        # AttributeError on a second call.
        save_pretrained_wrapper._overridden = True
        return save_pretrained_wrapper

    # wrap save_pretrained
    trainer.model.save_pretrained = save_pretrained_compressed(
        trainer.model.save_pretrained
    )
a kwarg for FSDP models - state_dict = kwargs.get("state_dict", None) + def skip(*args, **kwargs): + pass - if sparsity_config is not None: - sparsity_config.global_sparsity = ( - SparsityConfigMetadata.infer_global_sparsity( - model, state_dict=state_dict - ) - ) - sparsity_config.sparsity_structure = ( - SparsityConfigMetadata.infer_sparsity_structure() - ) - elif not skip_compression_stats: - # try to infer a sparsity config from the model if none is provided - logger.info( - "Inferring a sparsity configuration requires a global sparsity " - "calculation. This can be costly for large models. To skip the " - "calculation of compression statistics set " - "skip_compression_stats=True" - ) - sparsity_config = SparsityConfigMetadata.from_pretrained( - model, state_dict=state_dict, compress=False - ) + # Skip the initializer step. This accelerates the loading + # of the models, especially for the quantized models + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip - quantization_format = infer_quantization_format( + # state_dict gets passed in as a kwarg for FSDP models + state_dict = kwargs.pop("state_dict", None) + if state_dict is None: + state_dict = get_state_dict_offloaded_model(model) + + compressor = get_model_compressor( model=model, - quantization_format=quantization_format, - save_compressed=save_compressed, - sparsity_config=sparsity_config, - ) - compressor = ModelCompressor.from_pretrained_model( - model, sparsity_config=sparsity_config, quantization_format=quantization_format, + save_compressed=save_compressed, + skip_compression_stats=skip_compression_stats, + state_dict=state_dict, ) if compressor is None: # model is not compressed or quantized, save as normal - original_save_pretrained.__get__(model, model_class)( - save_directory, **kwargs + original_save_pretrained_func = original_save_pretrained.__get__( + model, model_class + ) + original_save_pretrained_func( + save_directory, 
state_dict=state_dict, **kwargs ) return - # if we've gotten to this point we have a config so we can run compression - # default safe serialization to True if not explicitly set - kwargs["safe_serialization"] = kwargs.get("safe_serialization", True) - if state_dict is None: - state_dict = get_state_dict_offloaded_model(model) - # make sure we're on the main process when saving if state_dict is not None and len(state_dict) > 0: compressed_state_dict = compressor.compress(model, state_dict) - kwargs["state_dict"] = compressed_state_dict + kwargs["safe_serialization"] = kwargs.get("safe_serialization", True) original_save_pretrained.__get__(model, model_class)( - save_directory, **kwargs + save_directory, state_dict=compressed_state_dict, **kwargs ) compressor.update_config(save_directory) + recipe_path = os.path.join(save_directory, RECIPE_FILE_NAME) + session = active_session() + + if (recipe_yaml_str := session.get_serialized_recipe()) is not None: + with open(recipe_path, "w") as fp: + fp.write(recipe_yaml_str) + + # copy python files from cache dir to save_path if any + copy_python_files_from_model_cache(model, save_directory) + save_pretrained_wrapper._overriden = True return save_pretrained_wrapper @@ -146,3 +217,89 @@ def new_dtype_byte_size(dtype): raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") bit_size = int(bit_search.groups()[0]) return bit_size // 8 + + +def patch_tied_tensors_bug(model: torch.nn.Module): + """ + Patches bug where HF transformers will fail to untie weights under specific + circumstances (https://github.com/huggingface/transformers/issues/33689). 
+ + This function detects those cases and unties the tensors if applicable + + :param model: model to fix + """ + if ( + hasattr(model.config, "tie_word_embeddings") + and not model.config.tie_word_embeddings + ): + input_embed = model.get_input_embeddings() + output_embed = model.get_output_embeddings() + + if storage_ptr(input_embed.weight) == storage_ptr(output_embed.weight): + for module in (input_embed, output_embed): + offloaded = is_module_offloaded(module) + if offloaded: + module._hf_hook.pre_forward(module) + + update_parameter_data(module, module.weight.data.clone(), "weight") + + if offloaded: + module._hf_hook.post_forward(module, None) + + +def get_model_compressor( + model: torch.nn.Module, + sparsity_config: Optional[SparsityCompressionConfig] = None, + quantization_format: Optional[str] = None, + save_compressed: bool = True, + skip_compression_stats: bool = False, + state_dict: Optional[Dict] = None, +): + """ + Obtain the compressor based on the config and the + quantization_format + + :param model: torch model + :param sparsify_config: Sparsity Compression config + :param quantization_format: Format that the model was quantized to. 
+ if not provivided, will be extrapolated from `infer_quantization_format` + :param save_compressed: boolean representing to save in a compressed + format + :param skip_compression_stats: bool allowing compression stats on std out + :param state_dict: state_dict of the model + """ + + # find offloaded state dict if none is provided + if state_dict is None: + state_dict = get_state_dict_offloaded_model(model) + + if sparsity_config is not None: + sparsity_config.global_sparsity = SparsityConfigMetadata.infer_global_sparsity( + model, state_dict=state_dict + ) + sparsity_config.sparsity_structure = ( + SparsityConfigMetadata.infer_sparsity_structure() + ) + elif not skip_compression_stats: + # try to infer a sparsity config from the model if none is provided + logger.info( + "Inferring a sparsity configuration requires a global sparsity " + "calculation. This can be costly for large models. To skip the " + "calculation of compression statistics set " + "skip_compression_stats=True" + ) + sparsity_config = SparsityConfigMetadata.from_pretrained( + model, state_dict=state_dict, compress=save_compressed + ) + + quantization_format = infer_quantization_format( + model=model, + quantization_format=quantization_format, + save_compressed=save_compressed, + sparsity_config=sparsity_config, + ) + return ModelCompressor.from_pretrained_model( + model, + sparsity_config=sparsity_config, + quantization_format=quantization_format, + ) diff --git a/src/llmcompressor/transformers/sparsification/sparse_model.py b/src/llmcompressor/transformers/sparsification/sparse_model.py index 11cee7555..bf09396d7 100644 --- a/src/llmcompressor/transformers/sparsification/sparse_model.py +++ b/src/llmcompressor/transformers/sparsification/sparse_model.py @@ -1,213 +1,23 @@ import inspect -import logging -from pathlib import Path -from typing import Optional, Union +from typing import Optional -import torch -from accelerate import load_checkpoint_and_dispatch -from compressed_tensors.compressors 
import ModelCompressor -from compressed_tensors.quantization import ( - QuantizationStatus, - apply_quantization_config, -) from loguru import logger from torch.nn import Module -from transformers import AutoModelForCausalLM, PreTrainedModel - -from llmcompressor.pytorch.model_load.helpers import initialize_recipe -from llmcompressor.transformers.sparsification.compressed_tensors_utils import ( - modify_save_pretrained, -) -from llmcompressor.transformers.utils.helpers import ( - download_model_directory, - resolve_recipe, -) +from transformers import AutoModelForCausalLM __all__ = [ - "wrap_hf_model_class", - "SparseAutoModel", "SparseAutoModelForCausalLM", - "get_shared_tokenizer_src" + "get_shared_tokenizer_src", ] -def wrap_hf_model_class(hf_model_class: PreTrainedModel) -> PreTrainedModel: - """ - Wrap a HF PreTrainedModel class to - 1. Decompress a compressed model - 2. Initialize any saved recipes - 3. Wrap the `save_pretrained` method to allow saving as a compressed model - - :param hf_model_class: Model class to wrap - :return: Wrapped model class - """ - - # Add the from_pretrained class method - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path, - run_compressed: bool = False, - recipe: Optional[Union[str, Path]] = None, - *model_args, - **kwargs, - ) -> PreTrainedModel: - """ - A wrapper around the PreTrainedModel.from_pretrained method - - :param pretrained_model_name_or_path: the name of or path to the model to load - :param recipe: the path to the recipe file to apply to the model. Can be a - string or Path object. If None, a recipe will be searched for in the - pretrained_model_name_or_path directory and applied if found - :return the created model for causal language modeling - """ - def skip(*args, **kwargs): - pass - - # Skip the initializer step. 
This accelerates the loading - # of the models, especially for the quantized models - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - - pretrained_model_name_or_path = ( - pretrained_model_name_or_path.as_posix() - if isinstance(pretrained_model_name_or_path, Path) - else pretrained_model_name_or_path - ) - - pretrained_model_name_or_path = download_model_directory( - pretrained_model_name_or_path, **kwargs +class SparseAutoModelForCausalLM: + def from_pretrained(*args, **kwargs): + logger.warning( + "SparseAutoModelForCausalLM is deprecated, " + "please use AutoModelForCausalLM" ) - - # instantiate compressor from model config - compressor = ModelCompressor.from_pretrained( - pretrained_model_name_or_path, **kwargs - ) - - # temporarily set the log level to error, to ignore printing out long missing - # and unexpected key error messages (these are EXPECTED for quantized models) - transformers_logger = logging.getLogger("transformers.modeling_utils") - restore_log_level = transformers_logger.getEffectiveLevel() - transformers_logger.setLevel(level=logging.ERROR) - - if kwargs.get("trust_remote_code"): - # By artifically aliasing the - # class name to the - # hf_model_class we can "trick" the - # `from_pretrained` method into properly - # resolving the logic when - # (has_remote_code and trust_remote_code) == True - cls.__name__ = hf_model_class.__name__ - - model = super(hf_model_class, cls).from_pretrained( - pretrained_model_name_or_path, *model_args, **kwargs - ) - - if model.dtype != model.config.torch_dtype: - logger.warning( - f"The dtype of the loaded model: {model.dtype} is different " - "from from the dtype specified in the model config: " - f"{model.config.torch_dtype}." - "To load the model in the format that it was previously saved in, " - "set torch_dtype=`auto` in the SparseAutoModel creation call." 
- ) - - # restore transformers logging level now that model shell is loaded - transformers_logger.setLevel(level=restore_log_level) - - # HfQuantizer Quantization - if hasattr(model.config, "quantization_config"): - return model - - # override the PreTrainedModel instance with compression save function - modify_save_pretrained(model) - - # If model is quantized or compressed on disk, initialize quantization - # structure and run decompression - if compressor is not None: - quantization_config = compressor.quantization_config - is_compressed = ( - quantization_config is not None - and quantization_config.quantization_status - == QuantizationStatus.COMPRESSED - ) - if run_compressed and is_compressed: - # initialize quantization, don't decompress - apply_quantization_config( - model, quantization_config, run_compressed=True - ) - model = load_checkpoint_and_dispatch( - model, pretrained_model_name_or_path - ) - else: - # initialize quantization and decompress weights - if quantization_config is not None: - quantization_config.quantization_status = QuantizationStatus.FROZEN - compressor.decompress( - model_path=pretrained_model_name_or_path, model=model - ) - recipe = resolve_recipe(recipe=recipe, model_path=pretrained_model_name_or_path) - - if recipe: - initialize_recipe(model=model, recipe_path=recipe) - - return model - - # Add the wrapped methods to the new class - wrapped_model_class = type( - hf_model_class.__name__, - (hf_model_class,), - { - "from_pretrained": from_pretrained - } - ) - - return wrapped_model_class - - -SparseAutoModelForCausalLM = wrap_hf_model_class(AutoModelForCausalLM) - - -class SparseAutoModel: - """ - Factory class for creating sparse models using transformers AutoModel classes - """ - - @staticmethod - def text_generation_from_pretrained( - model_name_or_path: str, - sequence_length: Optional[int] = None, - recipe: Optional[Union[str, Path]] = None, - trust_remote_code: bool = False, - torch_dtype: Union[str, torch.dtype] = "auto", - 
**kwargs, - ) -> Module: - """ - :param model_name_or_path: the name of or path to the model to load - :param sequence_length: the maximum length of the sequence to generate. - If None, will use the default sequence length for the model. - Defaults to None. - :param recipe: the recipe to apply to the model. If None, no recipe is applied - :param trust_remote_code: related to trust_remote_code in HF transformers. - If True, will execute the modelling code from the model directory - (if present). Defaults to False. - :param torch_dtype: the torch dtype to use for the model. If "auto", will - use the default dtype for the model. Defaults to "auto". - :return: the created model for text generation - """ - - model = SparseAutoModelForCausalLM.from_pretrained( - model_name_or_path, - torch_dtype=torch_dtype, - trust_remote_code=trust_remote_code, - recipe=recipe, - **kwargs, - ) - if sequence_length is not None: - model.seqlen = sequence_length - - return model + return AutoModelForCausalLM.from_pretrained(*args, **kwargs) def get_shared_tokenizer_src(student: Module, teacher: Optional[Module]) -> str: diff --git a/src/llmcompressor/transformers/utils/helpers.py b/src/llmcompressor/transformers/utils/helpers.py index 401a454cf..1263bb004 100644 --- a/src/llmcompressor/transformers/utils/helpers.py +++ b/src/llmcompressor/transformers/utils/helpers.py @@ -3,80 +3,26 @@ huggingface/transformers flows """ -import inspect import os -from collections import OrderedDict -from contextlib import suppress -from enum import Enum -from pathlib import Path -from typing import Iterable, List, Optional -from typing import OrderedDict as OrderedDictType -from typing import Tuple, Union +from typing import TYPE_CHECKING, Optional -import requests -import torch -import transformers -from huggingface_hub import HUGGINGFACE_CO_URL_HOME, HfFileSystem, hf_hub_download from loguru import logger -from transformers import AutoConfig from transformers.trainer_utils import get_last_checkpoint 
-from transformers.utils import PaddingStrategy -from llmcompressor.utils.fsdp.context import main_process_first_context +if TYPE_CHECKING: + from llmcompressor.transformers import ModelArguments, TrainingArguments __all__ = [ - "RECIPE_NAME", + "RECIPE_FILE_NAME", "detect_last_checkpoint", - "TaskNames", - "resolve_sequence_length", - "ALL_TASK_NAMES", - "create_fake_dataloader", - "POSSIBLE_TOKENIZER_FILES", - "download_repo_from_huggingface_hub", - "download_model_directory", ] - -class TaskNames(Enum): - mlm = {"masked-language-modeling", "mlm"} - qa = {"question-answering", "qa"} - token_classification = {"token-classification", "ner"} - text_classification = { - "text-classification", - "sentiment-analysis", - "sequence-classification", - "glue", - } - text_generation = {"text-generation"} - - -ALL_TASK_NAMES = list(set.union(*[task_names.value for task_names in TaskNames])) -RECIPE_NAME = "recipe.yaml" - -MANDATORY_DEPLOYMENT_FILES = { - "tokenizer_config.json", - "config.json", -} -OPTIONAL_DEPLOYMENT_FILES = {"tokenizer.json", "tokenizer.model"} -NLG_MANDATORY_DEPLOYMENT_FILES = {"special_tokens_map.json"} -NLG_OPTIONAL_DEPLOYMENT_FILES = { - "vocab.json", - "merges.txt", -} -POSSIBLE_TOKENIZER_FILES = { - "vocab.json", - "merges.txt", - "tokenizer.json", - "tokenizer.model", - "special_tokens_map.json", - "tokenizer_config.json", -} -RELEVANT_HF_SUFFIXES = ["json", "md", "bin", "safetensors", "yaml", "yml", "py"] +RECIPE_FILE_NAME = "recipe.yaml" def detect_last_checkpoint( - training_args: "TrainingArguments", # noqa 821 - model_args: Optional["ModelArguments"] = None, # noqa 821 + training_args: "TrainingArguments", + model_args: Optional["ModelArguments"] = None, ): last_checkpoint = None if ( @@ -108,385 +54,3 @@ def detect_last_checkpoint( ) return last_checkpoint - - -def resolve_sequence_length(config: AutoConfig) -> int: - """ - Resolve the sequence length from the config - - :param config: the config to resolve the sequence length from - :return: 
the sequence length - """ - if hasattr(config, "max_position_embeddings"): - sequence_length = config.max_position_embeddings - - elif hasattr(config, "max_seq_len"): - sequence_length = config.max_seq_len - else: - raise ValueError( - "Could not infer a default sequence length " - "from the HF transformers config. Please specify " - "the sequence length with --sequence_length" - ) - logger.debug( - f"Using default sequence length of {sequence_length} " - "(inferred from HF transformers config) " - ) - return sequence_length - - -def resolve_recipe( - model_path: Union[str, Path], - recipe: Union[str, Path, None] = None, -) -> Union[str, None]: - """ - Resolve the recipe to apply to the model. - :param recipe: the recipe to apply to the model. - It can be one of the following: - - None - This means that we are not either not applying - any recipe and allowing the model to potentially - infer the appropriate pre-existing recipe - from the model_path - - a path to the recipe file - This can be a string or Path object pointing - to a recipe file. If the specified recipe file - is different from the potential pre-existing - recipe for that model (stored in the model_path), - the function will raise an warning - - name of the recipe file (e.g. "recipe.yaml") - Recipe file name specific is assumed to be stored - in the model_path - - a string containing the recipe - Needs to adhere to the SparseML recipe format - - :param model_path: the path to the model to load. 
- It can be one of the following: - - a path to the model directory - - a path to the model file - - Hugging face model id - - :return: the resolved recipe - """ - - if recipe is None: - return infer_recipe_from_model_path(model_path) - - elif os.path.isfile(recipe): - # recipe is a path to a recipe file - return resolve_recipe_file(recipe, model_path) - - elif os.path.isfile(os.path.join(model_path, recipe)): - # recipe is a name of a recipe file - recipe = os.path.join(model_path, recipe) - return resolve_recipe_file(recipe, model_path) - - elif isinstance(recipe, str): - # recipe is a string containing the recipe - logger.debug( - "Applying the recipe string directly to the model, without " - "checking for a potential existing recipe in the model_path." - ) - return recipe - - logger.info( - "No recipe requested and no default recipe " - f"found in {model_path}. Skipping recipe resolution." - ) - return None - - -def infer_recipe_from_model_path(model_path: Union[str, Path]) -> Optional[str]: - """ - Infer the recipe from the model_path. - :param model_path: the path to the model to load. 
- It can be one of the following: - - a path to the model directory - - a path to the model file - - Hugging face model id - :return the path to the recipe file if found, None otherwise - """ - model_path = model_path.as_posix() if isinstance(model_path, Path) else model_path - - if os.path.isdir(model_path) or os.path.isfile(model_path): - # model_path is a local path to the model directory or model file - # attempting to find the recipe in the model_directory - model_path = ( - os.path.dirname(model_path) if os.path.isfile(model_path) else model_path - ) - recipe = os.path.join(model_path, RECIPE_NAME) - if os.path.isfile(recipe): - logger.info(f"Found recipe in the model_path: {recipe}") - return recipe - logger.debug(f"No recipe found in the model_path: {model_path}") - return None - - recipe = recipe_from_huggingface_model_id(model_path)[0] - - if recipe is None: - logger.info("Failed to infer the recipe from the model_path") - return recipe - - -def recipe_from_huggingface_model_id( - model_path: str, recipe_name: str = RECIPE_NAME -) -> Tuple[Optional[str], bool]: - """ - Attempts to download the recipe from the huggingface model id. - - :param model_path: Assumed to be the huggingface model id. - If it is not, this function will return None. - :param recipe_name: The name of the recipe file to download. - Defaults to RECIPE_NAME. - :return: tuple: - - the path to the recipe file if found, None otherwise - - True if model_path is a valid huggingface model id, False otherwise - """ - model_id = os.path.join(HUGGINGFACE_CO_URL_HOME, model_path) - request = requests.get(model_id) - if not request.status_code == 200: - logger.debug( - "model_path is not a valid huggingface model id. " - "Skipping recipe resolution." - ) - return None, False - - logger.info( - "model_path is a huggingface model id. 
" - "Attempting to download recipe from " - f"{HUGGINGFACE_CO_URL_HOME}" - ) - try: - recipe = hf_hub_download(repo_id=model_path, filename=recipe_name) - logger.info(f"Found recipe: {recipe_name} for model id: {model_path}.") - except Exception as e: - logger.info( - f"Unable to to find recipe {recipe_name} " - f"for model id: {model_path}: {e}. " - "Skipping recipe resolution." - ) - recipe = None - return recipe, True - - -def resolve_recipe_file( - requested_recipe: Union[str, Path], model_path: Union[str, Path] -) -> Union[str, Path, None]: - """ - Given the requested recipe and the model_path, return the path to the recipe file. - - :param requested_recipe. Is a full path to the recipe file - :param model_path: the path to the model to load. - It can be one of the following: - - a path to the model directory - - a path to the model file - - Hugging face model id - :return the path to the recipe file if found, None otherwise - """ - # preprocess arguments so that they are all strings - requested_recipe = ( - requested_recipe.as_posix() - if isinstance(requested_recipe, Path) - else requested_recipe - ) - model_path = model_path.as_posix() if isinstance(model_path, Path) else model_path - model_path = ( - os.path.dirname(model_path) if os.path.isfile(model_path) else model_path - ) - - if not os.path.isdir(model_path): - default_recipe, model_exists = recipe_from_huggingface_model_id(model_path) - if not model_exists: - raise ValueError(f"Unrecognized model_path: {model_path}") - - if not default_recipe == requested_recipe and default_recipe is not None: - logger.warning( - f"Attempting to apply recipe: {requested_recipe} " - f"to the model at: {model_path}, " - f"but the model already has a recipe: {default_recipe}. " - f"Using {requested_recipe} instead." 
- ) - return requested_recipe - - # pathway for model_path that is a directory - default_recipe = os.path.join(model_path, RECIPE_NAME) - default_recipe_exists = os.path.isfile(default_recipe) - default_and_request_recipes_identical = os.path.samefile( - default_recipe, requested_recipe - ) - - if ( - default_recipe_exists - and requested_recipe - and not default_and_request_recipes_identical - ): - logger.warning( - f"Attempting to apply recipe: {requested_recipe} " - f"to the model located in {model_path}, " - f"but the model already has a recipe stored as {default_recipe}. " - f"Using {requested_recipe} instead." - ) - - elif not default_recipe_exists and requested_recipe: - logger.warning( - f"Attempting to apply {requested_recipe} " - f"to the model located in {model_path}." - "However, it is expected that the model " - f"has its target recipe stored as {default_recipe}." - "Applying any recipe before the target recipe may " - "result in unexpected behavior." - f"Applying {requested_recipe} nevertheless." - ) - - elif default_recipe_exists: - logger.info(f"Using the default recipe: {requested_recipe}") - - return requested_recipe - - -def create_fake_dataloader( - model: torch.nn.Module, - tokenizer: transformers.AutoTokenizer, - num_samples: int, -) -> Tuple[Iterable[OrderedDictType[str, torch.Tensor]], List[str]]: - """ - Creates fake transformers dataloader for the model, based on the model's - forward signature. 
- - :param model: The model to create the dataloader for - :param tokenizer: The tokenizer to use for the dataloader - :param num_samples: The number of fake samples in the dataloader - :return: The data loader (iterable) and the input names for the model - """ - - forward_args_spec = inspect.getfullargspec(model.__class__.forward) - inputs = tokenizer( - "", return_tensors="pt", padding=PaddingStrategy.MAX_LENGTH.value - ).data - fake_inputs = OrderedDict( - [ - (input_key, inputs[input_key][0].reshape(1, -1)) - for input_key in forward_args_spec.args - if input_key in inputs - ] - ) - data_loader = (fake_inputs for _ in range(num_samples)) - input_names = list(fake_inputs.keys()) - return data_loader, input_names - - -def fetch_recipe_path(target: str): - """ - Fetches the recipe path for the given target. - This method will also download the recipe if it is not - already downloaded. - - Takes care of three scenarios: - 1. target is a local path to a model directory - (looks for recipe.yaml in the directory) - 2. 
target is a HuggingFace stub (downloads and - returns the path to the default recipe) - - :param target: The target to fetch the recipe path for - can be a local path or HuggingFace stub - :return: The path to the recipe for the target - """ - DEFAULT_RECIPE_NAME = "recipe.yaml" - if Path(target).exists(): - # target is a local path - potential_recipe_path = Path(target) / DEFAULT_RECIPE_NAME - return str(potential_recipe_path) if potential_recipe_path.exists() else None - - # Recipe must be downloaded - - recipe_path = None - - # target is a HuggingFace stub - with suppress(Exception): - # suppress any errors if the recipe is not found on HuggingFace - recipe_path = hf_hub_download(repo_id=target, filename=DEFAULT_RECIPE_NAME) - - return recipe_path - - -def download_repo_from_huggingface_hub(repo_id, **kwargs): - """ - Download relevant model files from the Hugging Face Hub - using the huggingface_hub.hf_hub_download function - - Note(s): - - Does not download the entire repo, only the relevant files - for the model, such as the model weights, tokenizer files, etc. 
- - Does not re-download files that already exist locally, unless - the force_download flag is set to True - - :pre-condition: the repo_id must be a valid Hugging Face Hub repo id - :param repo_id: the repo id to download - :param kwargs: additional keyword arguments to pass to hf_hub_download - """ - hf_filesystem = HfFileSystem() - files = hf_filesystem.ls(repo_id) - - if not files: - raise ValueError(f"Could not find any files in HF repo {repo_id}") - - # All file(s) from hf_filesystem have "name" key - # Extract the file names from the files - relevant_file_names = ( - Path(file["name"]).name - for file in files - if any(file["name"].endswith(suffix) for suffix in RELEVANT_HF_SUFFIXES) - ) - - hub_kwargs_names = ( - "subfolder", - "repo_type", - "revision", - "library_name", - "library_version", - "cache_dir", - "local_dir", - "local_dir_use_symlinks", - "user_agent", - "force_download", - "force_filename", - "proxies", - "etag_timeout", - "resume_download", - "token", - "local_files_only", - "headers", - "legacy_cache_layout", - "endpoint", - ) - hub_kwargs = {name: kwargs[name] for name in hub_kwargs_names if name in kwargs} - - for file_name in relevant_file_names: - last_file = hf_hub_download(repo_id=repo_id, filename=file_name, **hub_kwargs) - - # parent directory of the last file is the model directory - return str(Path(last_file).parent.resolve().absolute()) - - -def download_model_directory(pretrained_model_name_or_path: str, **kwargs): - """ - Download the model directory from the HF hub if the model is not found locally - - :param pretrained_model_name_or_path: the name of or path to the model to load - can be a HuggingFace model stub - :param kwargs: additional keyword arguments to pass to the download function - :return: the path to the downloaded model directory - """ - pretrained_model_path: Path = Path(pretrained_model_name_or_path) - - if pretrained_model_path.exists(): - logger.debug( - "Model directory already exists locally.", - ) - return 
pretrained_model_name_or_path - - with main_process_first_context(): - logger.debug("Downloading model from HuggingFace Hub.") - return download_repo_from_huggingface_hub( - repo_id=pretrained_model_name_or_path, **kwargs - ) diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py index 266acf973..bdf27f620 100644 --- a/src/llmcompressor/utils/helpers.py +++ b/src/llmcompressor/utils/helpers.py @@ -22,6 +22,7 @@ from urllib.parse import urlparse import numpy +import torch from loguru import logger __all__ = [ @@ -59,6 +60,7 @@ "is_package_available", "import_from_path", "getattr_chain", + "DisableKVCache", ] @@ -1041,3 +1043,40 @@ def getattr_chain(obj: Any, chain_str: str, *args, **kwargs) -> Any: res = getattr(res, attr_name) return res + + +class DisableKVCache: + """ + Temporarily disable the key-value cache for transformer models. Used to prevent + excess memory use in one-shot cases where the model only performs the prefill + phase and not the generation phase. + + Example: + >>> model = AutoModel.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") + >>> input = torch.randint(0, 32, size=(1, 32)) + >>> with DisableKVCache(model): + ... 
output = model(input) + """ + + def __init__(self, model: torch.nn.Module): + if hasattr(model.config, "use_cache"): + self.config = model.config + + # MllamaConfig + elif hasattr(model.config, "text_config") and hasattr( + model.config.text_config, "use_cache" + ): + self.config = model.config.text_config + + # unknown config structure + else: + raise NotImplementedError(f"Cannot find `use_cache` for {model.config}") + + self.restore_value = self.config.use_cache + + def __enter__(self): + self.restore_value = self.config.use_cache + self.config.use_cache = False + + def __exit__(self, _exc_type, _exc_val, _exc_tb): + self.config.use_cache = self.restore_value diff --git a/src/llmcompressor/utils/metric_logging.py b/src/llmcompressor/utils/metric_logging.py index d0b3bb11e..0b45a4670 100644 --- a/src/llmcompressor/utils/metric_logging.py +++ b/src/llmcompressor/utils/metric_logging.py @@ -3,6 +3,8 @@ from loguru import logger from torch.nn import Module +__all__ = ["get_GPU_memory_usage", "get_layer_size_mb"] + def get_GPU_memory_usage() -> List[Tuple]: try: @@ -23,7 +25,7 @@ def get_GPU_memory_usage() -> List[Tuple]: handle = pynvml.nvmlDeviceGetHandleByIndex(i) mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) memory_usage_percentage = mem_info.used / mem_info.total - total_memory_gb = mem_info.total / (1024**3) + total_memory_gb = mem_info.total / (1e9) usage.append( (memory_usage_percentage, total_memory_gb), ) @@ -35,7 +37,7 @@ def get_GPU_memory_usage() -> List[Tuple]: return [] -def get_layer_size_bytes(module: Module) -> float: +def get_layer_size_mb(module: Module) -> float: param_size = 0 buffer_size = 0 @@ -46,6 +48,6 @@ def get_layer_size_bytes(module: Module) -> float: buffer_size += buffer.nelement() * buffer.element_size() total_size = param_size + buffer_size - total_size_mb = total_size / (1024**2) # Convert bytes to MB + total_size_mb = total_size / (1e6) # Convert bytes to MB return total_size_mb diff --git a/src/llmcompressor/version.py 
b/src/llmcompressor/version.py index 3d97aaddf..df576ff82 100644 --- a/src/llmcompressor/version.py +++ b/src/llmcompressor/version.py @@ -8,7 +8,7 @@ from typing import Optional, Tuple # Define the base version and build type -version_base = "0.2.0" +version_base = "0.3.1" build_type = "dev" # can be 'release', 'nightly', 'dev', or 'dev' with a dev number diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py new file mode 100644 index 000000000..d8dfea005 --- /dev/null +++ b/tests/e2e/e2e_utils.py @@ -0,0 +1,58 @@ +from datasets import load_dataset +from loguru import logger +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier +from llmcompressor.transformers import oneshot +from tests.testing_utils import preprocess_tokenize_dataset + + +def run_oneshot_for_e2e_testing( + model: str, + device: str, + num_calibration_samples: int, + max_seq_length: int, + dataset_id: str, + recipe: str, + dataset_split: str, + dataset_config: str, + scheme: str, + quant_type: str, +): + # Load model. 
+ oneshot_kwargs = {} + loaded_model = AutoModelForCausalLM.from_pretrained( + model, device_map=device, torch_dtype="auto" + ) + tokenizer = AutoTokenizer.from_pretrained(model) + + if dataset_id: + ds = load_dataset(dataset_id, name=dataset_config, split=dataset_split) + ds = ds.shuffle(seed=42).select(range(num_calibration_samples)) + ds = preprocess_tokenize_dataset(ds, tokenizer, max_seq_length) + oneshot_kwargs["dataset"] = ds + oneshot_kwargs["max_seq_length"] = max_seq_length + oneshot_kwargs["num_calibration_samples"] = num_calibration_samples + + oneshot_kwargs["model"] = loaded_model + if recipe: + oneshot_kwargs["recipe"] = recipe + else: + # Test assumes that if a recipe was not provided, the scheme is + # a compatible preset scheme + if quant_type == "GPTQ": + oneshot_kwargs["recipe"] = GPTQModifier( + targets="Linear", scheme=scheme, ignore=["lm_head"] + ) + else: + oneshot_kwargs["recipe"] = QuantizationModifier( + targets="Linear", scheme=scheme, ignore=["lm_head"] + ) + + # Apply quantization.
+ logger.info("ONESHOT KWARGS", oneshot_kwargs) + oneshot( + **oneshot_kwargs, + oneshot_device=device, + ) + return oneshot_kwargs["model"], tokenizer diff --git a/tests/e2e/vLLM/configs/FP8/fp8_dynamic_per_token.yaml b/tests/e2e/vLLM/configs/fp8_dynamic_per_token.yaml similarity index 100% rename from tests/e2e/vLLM/configs/FP8/fp8_dynamic_per_token.yaml rename to tests/e2e/vLLM/configs/fp8_dynamic_per_token.yaml diff --git a/tests/e2e/vLLM/configs/FP8/fp8_static_per_tensor.yaml b/tests/e2e/vLLM/configs/fp8_static_per_tensor.yaml similarity index 100% rename from tests/e2e/vLLM/configs/FP8/fp8_static_per_tensor.yaml rename to tests/e2e/vLLM/configs/fp8_static_per_tensor.yaml diff --git a/tests/e2e/vLLM/configs/FP8/fp8_weight_only_channel.yaml b/tests/e2e/vLLM/configs/fp8_weight_only_channel.yaml similarity index 100% rename from tests/e2e/vLLM/configs/FP8/fp8_weight_only_channel.yaml rename to tests/e2e/vLLM/configs/fp8_weight_only_channel.yaml diff --git a/tests/e2e/vLLM/configs/FP8/fp8_weight_only_tensor.yaml b/tests/e2e/vLLM/configs/fp8_weight_only_tensor.yaml similarity index 100% rename from tests/e2e/vLLM/configs/FP8/fp8_weight_only_tensor.yaml rename to tests/e2e/vLLM/configs/fp8_weight_only_tensor.yaml diff --git a/tests/e2e/vLLM/configs/INT8/int8_channel_weight_static_per_tensor_act.yaml b/tests/e2e/vLLM/configs/int8_channel_weight_static_per_tensor_act.yaml similarity index 100% rename from tests/e2e/vLLM/configs/INT8/int8_channel_weight_static_per_tensor_act.yaml rename to tests/e2e/vLLM/configs/int8_channel_weight_static_per_tensor_act.yaml diff --git a/tests/e2e/vLLM/configs/INT8/int8_dynamic_per_token.yaml b/tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml similarity index 100% rename from tests/e2e/vLLM/configs/INT8/int8_dynamic_per_token.yaml rename to tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml diff --git a/tests/e2e/vLLM/configs/INT8/int8_tensor_weight_static_per_tensor_act.yaml 
b/tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act.yaml similarity index 100% rename from tests/e2e/vLLM/configs/INT8/int8_tensor_weight_static_per_tensor_act.yaml rename to tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act.yaml diff --git a/tests/e2e/vLLM/configs/w4a16_2of4_channel_quant.yaml b/tests/e2e/vLLM/configs/w4a16_2of4_channel_quant.yaml new file mode 100644 index 000000000..dafd24025 --- /dev/null +++ b/tests/e2e/vLLM/configs/w4a16_2of4_channel_quant.yaml @@ -0,0 +1,7 @@ +cadence: "nightly" +test_type: "regression" +model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 +scheme: W4A16_2of4_channel +dataset_id: HuggingFaceH4/ultrachat_200k +dataset_split: train_sft +recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml \ No newline at end of file diff --git a/tests/e2e/vLLM/configs/w4a16_2of4_grouped_quant.yaml b/tests/e2e/vLLM/configs/w4a16_2of4_grouped_quant.yaml new file mode 100644 index 000000000..01135dd9d --- /dev/null +++ b/tests/e2e/vLLM/configs/w4a16_2of4_grouped_quant.yaml @@ -0,0 +1,7 @@ +cadence: "nightly" +test_type: "regression" +model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 +scheme: W4A16_2of4 +dataset_id: HuggingFaceH4/ultrachat_200k +dataset_split: train_sft +recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml \ No newline at end of file diff --git a/tests/e2e/vLLM/configs/actorder/w4a16_actorder_group.yaml b/tests/e2e/vLLM/configs/w4a16_actorder_group.yaml similarity index 90% rename from tests/e2e/vLLM/configs/actorder/w4a16_actorder_group.yaml rename to tests/e2e/vLLM/configs/w4a16_actorder_group.yaml index ddc9fc803..bb02c51ef 100644 --- a/tests/e2e/vLLM/configs/actorder/w4a16_actorder_group.yaml +++ b/tests/e2e/vLLM/configs/w4a16_actorder_group.yaml @@ -5,5 +5,5 @@ recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml dataset_id: openai/gsm8k dataset_config: main dataset_split: train -scheme: W4A16 +scheme: W4A16_actorder_group save_dir: 
TinyLlama-1.1B-Chat-v1.0-actorder-group \ No newline at end of file diff --git a/tests/e2e/vLLM/configs/actorder/w4a16_actorder_weight.yaml b/tests/e2e/vLLM/configs/w4a16_actorder_weight.yaml similarity index 90% rename from tests/e2e/vLLM/configs/actorder/w4a16_actorder_weight.yaml rename to tests/e2e/vLLM/configs/w4a16_actorder_weight.yaml index 7362be296..318e4706e 100644 --- a/tests/e2e/vLLM/configs/actorder/w4a16_actorder_weight.yaml +++ b/tests/e2e/vLLM/configs/w4a16_actorder_weight.yaml @@ -5,5 +5,5 @@ recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml dataset_id: openai/gsm8k dataset_config: main dataset_split: train -scheme: W4A16 +scheme: W4A16_actorder_weight save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-weight \ No newline at end of file diff --git a/tests/e2e/vLLM/configs/WNA16/w4a16_channel_quant.yaml b/tests/e2e/vLLM/configs/w4a16_channel_quant.yaml similarity index 100% rename from tests/e2e/vLLM/configs/WNA16/w4a16_channel_quant.yaml rename to tests/e2e/vLLM/configs/w4a16_channel_quant.yaml diff --git a/tests/e2e/vLLM/configs/WNA16/w4a16_grouped_quant.yaml b/tests/e2e/vLLM/configs/w4a16_grouped_quant.yaml similarity index 76% rename from tests/e2e/vLLM/configs/WNA16/w4a16_grouped_quant.yaml rename to tests/e2e/vLLM/configs/w4a16_grouped_quant.yaml index bbd1406ce..6a53963e0 100644 --- a/tests/e2e/vLLM/configs/WNA16/w4a16_grouped_quant.yaml +++ b/tests/e2e/vLLM/configs/w4a16_grouped_quant.yaml @@ -3,4 +3,5 @@ test_type: "regression" model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scheme: W4A16 dataset_id: HuggingFaceH4/ultrachat_200k -dataset_split: train_sft \ No newline at end of file +dataset_split: train_sft +quant_type: "GPTQ" \ No newline at end of file diff --git a/tests/e2e/vLLM/configs/WNA16/w8a16_channel_quant.yaml b/tests/e2e/vLLM/configs/w8a16_channel_quant.yaml similarity index 100% rename from tests/e2e/vLLM/configs/WNA16/w8a16_channel_quant.yaml rename to tests/e2e/vLLM/configs/w8a16_channel_quant.yaml diff --git 
a/tests/e2e/vLLM/configs/WNA16/w8a16_grouped_quant.yaml b/tests/e2e/vLLM/configs/w8a16_grouped_quant.yaml similarity index 76% rename from tests/e2e/vLLM/configs/WNA16/w8a16_grouped_quant.yaml rename to tests/e2e/vLLM/configs/w8a16_grouped_quant.yaml index 4e9a278a5..44fd79032 100644 --- a/tests/e2e/vLLM/configs/WNA16/w8a16_grouped_quant.yaml +++ b/tests/e2e/vLLM/configs/w8a16_grouped_quant.yaml @@ -3,4 +3,5 @@ test_type: "regression" model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scheme: W8A16 dataset_id: HuggingFaceH4/ultrachat_200k -dataset_split: train_sft \ No newline at end of file +dataset_split: train_sft +quant_type: "GPTQ" \ No newline at end of file diff --git a/tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml b/tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml new file mode 100644 index 000000000..461353770 --- /dev/null +++ b/tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml @@ -0,0 +1,8 @@ +cadence: "weekly" +model: meta-llama/Meta-Llama-3-8B-Instruct +scheme: FP8_DYNAMIC +num_fewshot: 5 +limit: 1000 +task: "gsm8k" +exact_match,flexible-extract: 0.753 +exact_match,strict-match: 0.753 diff --git a/tests/e2e/vLLM/lm_eval_configs/int8_w8a8_dynamic_per_token.yaml b/tests/e2e/vLLM/lm_eval_configs/int8_w8a8_dynamic_per_token.yaml new file mode 100644 index 000000000..b16f5575a --- /dev/null +++ b/tests/e2e/vLLM/lm_eval_configs/int8_w8a8_dynamic_per_token.yaml @@ -0,0 +1,8 @@ +cadence: "weekly" +model: meta-llama/Meta-Llama-3-8B-Instruct +scheme: INT8 +num_fewshot: 5 +limit: 250 +task: "gsm8k" +exact_match,flexible-extract: 0.728 +exact_match,strict-match: 0.728 diff --git a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml index 84d6505cb..f61fba898 100644 --- a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml +++ b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: QuantizationModifier: - 
sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml index 8a6dfbde6..ce6c1498a 100644 --- a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml +++ b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: QuantizationModifier: - sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml b/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml index 6cfa275af..2c0094f88 100644 --- a/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml +++ b/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml @@ -1,7 +1,8 @@ quant_stage: quant_modifiers: + SmoothQuantModifier: + smoothing_strength: 0.8 QuantizationModifier: - sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml b/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml index 6ddcc63b4..4473829e1 100644 --- a/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml +++ b/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml @@ -1,7 +1,8 @@ quant_stage: quant_modifiers: + SmoothQuantModifier: + smoothing_strength: 0.8 QuantizationModifier: - sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml index b667b2d10..8a5302c7f 100644 --- a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml +++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml @@ -1,7 +1,6 @@ 
quant_stage: quant_modifiers: - QuantizationModifier: - sequential_update: false + GPTQModifier: ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml index bafd7928d..f7d1b742b 100644 --- a/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml +++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: - QuantizationModifier: - sequential_update: false + GPTQModifier: ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml b/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml new file mode 100644 index 000000000..7523b09a7 --- /dev/null +++ b/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml @@ -0,0 +1,21 @@ +sparsity_stage: + run_type: oneshot + sparsity_modifiers: + SparseGPTModifier: + sparsity: 0.5 + mask_structure: "2:4" + sequential_update: false +quantization_stage: + run_type: oneshot + quantization_modifiers: + GPTQModifier: + ignore: ["lm_head"] + config_groups: + group_0: + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: "group" + group_size: 128 + targets: ["Linear"] diff --git a/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml b/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml new file mode 100644 index 000000000..b8a4402d8 --- /dev/null +++ b/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml @@ -0,0 +1,20 @@ +sparsity_stage: + run_type: oneshot + sparsity_modifiers: + SparseGPTModifier: + sparsity: 0.5 + mask_structure: "2:4" + sequential_update: false +quantization_stage: + run_type: oneshot + quantization_modifiers: + GPTQModifier: + ignore: ["lm_head"] + config_groups: + group_0: + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: "channel" + targets: ["Linear"] diff --git 
a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml index a42e2922e..b9b9db154 100644 --- a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml +++ b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: GPTQModifier: - sequential_update: false ignore: ["lm_head"] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml index 54b172477..0c8476883 100644 --- a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml +++ b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: GPTQModifier: - sequential_update: false ignore: ["lm_head"] config_groups: group_0: diff --git a/tests/e2e/vLLM/run_tests.sh b/tests/e2e/vLLM/run_tests.sh new file mode 100644 index 000000000..6f19acedb --- /dev/null +++ b/tests/e2e/vLLM/run_tests.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +SUCCESS=0 + +while getopts "c:t:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TEST="$OPTARG" + ;; + \? ) + exit 1 + ;; + esac +done + +# Parse list of configs. +for MODEL_CONFIG in "$CONFIG"/* +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG ===" + + export TEST_DATA_FILE="$MODEL_CONFIG" + pytest \ + -r a \ + --capture=tee-sys \ + --junitxml="test-results/e2e-$(date +%s).xml" \ + "$TEST" || LOCAL_SUCCESS=$? 
+ + if [[ $LOCAL_SUCCESS == 0 ]]; then + echo "=== PASSED MODEL: $MODEL_CONFIG ===" + else + echo "=== FAILED MODEL: $MODEL_CONFIG ===" + fi + + SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) + +done + +exit "$SUCCESS" diff --git a/tests/e2e/vLLM/test_lmeval.py b/tests/e2e/vLLM/test_lmeval.py new file mode 100644 index 000000000..f77bda983 --- /dev/null +++ b/tests/e2e/vLLM/test_lmeval.py @@ -0,0 +1,131 @@ +import os +import shutil +from pathlib import Path + +import numpy +import pytest +import yaml +from loguru import logger + +from llmcompressor.core import active_session +from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing +from tests.examples.utils import requires_gpu_count + +try: + import lm_eval + + lm_eval_installed = True +except ImportError: + lm_eval_installed = False + logger.warning("lm_eval is not installed. This test will be skipped") + +TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", None) + + +# Will run each test case in its own process through run_tests.sh +# emulating vLLM CI testing +@requires_gpu_count(1) +@pytest.mark.skipif( + not lm_eval_installed, reason="lm eval is not installed, skipping test" +) +class TestLMEval: + """ + The following test quantizes a model using a preset scheme or recipe, + and then evaluates the model using LM Eval. Each test case is focused on a + specific quantization type (e.g W4A16 with grouped quantization, + W4N16 with channel quantization). To add a new test case, a new config has to be + added to the lm_eval_configs folder. The tests run on a cadence defined by the + `cadence` field. Each config defines the model to quantize. Optionally, a dataset + id and split can be provided for calibration. Finally, all config files must list + a scheme. The scheme can be a preset scheme from + https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py + or another identifier which can be used for the particular test case. 
If a recipe + is not provided, it is assumed that the scheme provided is a preset scheme and will + be used for quantization. Otherwise, the recipe will always be used if given. + """ # noqa: E501 + + def set_up(self): + eval_config = yaml.safe_load(Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + + if os.environ.get("CADENCE", "commit") != eval_config.get("cadence"): + pytest.skip("Skipping test; cadence mismatch") + + self.model = eval_config["model"] + self.scheme = eval_config.get("scheme") + self.dataset_id = eval_config.get("dataset_id") + self.dataset_config = eval_config.get("dataset_config") + self.dataset_split = eval_config.get("dataset_split") + self.recipe = eval_config.get("recipe") + self.quant_type = eval_config.get("quant_type") + self.save_dir = eval_config.get("save_dir") + self.task = eval_config.get("task") + self.num_fewshot = eval_config.get("num_fewshot") + self.limit = eval_config.get("limit") + self.exact_flex = eval_config.get("exact_match,flexible-extract") + self.exact_strict = eval_config.get("exact_match,strict-match") + + logger.info("========== RUNNING ==============") + logger.info(self.scheme) + + self.device = "cuda:0" + self.num_calibration_samples = 256 + self.max_seq_length = 2048 + + def test_lm_eval(self): + # Quantize the model, save it to disk, then run LM Eval on it + self.set_up() + if not self.save_dir: + self.save_dir = self.model.split("/")[1] + f"-{self.scheme}" + oneshot_model, tokenizer = run_oneshot_for_e2e_testing( + model=self.model, + device=self.device, + num_calibration_samples=self.num_calibration_samples, + max_seq_length=self.max_seq_length, + scheme=self.scheme, + dataset_id=self.dataset_id, + dataset_config=self.dataset_config, + dataset_split=self.dataset_split, + recipe=self.recipe, + quant_type=self.quant_type, + ) + + logger.info("================= SAVING TO DISK ======================") + oneshot_model.save_pretrained(self.save_dir) + tokenizer.save_pretrained(self.save_dir) + recipe_path = os.path.join(self.save_dir,
"recipe.yaml") + + # Use the session to fetch the recipe; + # Reset session for next test case + session = active_session() + recipe_yaml_str = session.get_serialized_recipe() + with open(recipe_path, "w") as fp: + fp.write(recipe_yaml_str) + session.reset() + + logger.info("================= Running LM Eval ======================") + + model_args = f"pretrained={self.save_dir}" + results = lm_eval.simple_evaluate( + model="hf", + model_args=model_args, + tasks=[self.task], + num_fewshot=self.num_fewshot, + limit=self.limit, + device="cuda:0", + batch_size=100, + ) + + metrics = results["results"][self.task] + exact_match_strict = metrics.get("exact_match,strict-match") + exact_match_flex = metrics.get("exact_match,flexible-extract") + logger.info("Exact Match, Strict") + logger.info(exact_match_strict) + logger.info("Exact Match, Flex") + logger.info(exact_match_flex) + assert numpy.isclose(exact_match_strict, self.exact_strict, rtol=0.05) + assert numpy.isclose(exact_match_flex, self.exact_flex, rtol=0.05) + self.tear_down() + + def tear_down(self): + if self.save_dir is not None: + shutil.rmtree(self.save_dir) diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py index fec55dde4..b31bfb007 100644 --- a/tests/e2e/vLLM/test_vllm.py +++ b/tests/e2e/vLLM/test_vllm.py @@ -1,19 +1,16 @@ +import os import shutil -import unittest +from pathlib import Path +from typing import Callable import pytest -from datasets import load_dataset -from parameterized import parameterized_class -from transformers import AutoTokenizer - -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot -from tests.testing_utils import ( - parse_params, - preprocess_tokenize_dataset, - requires_gpu, - requires_torch, -) +import yaml +from huggingface_hub import HfApi +from loguru import logger + +from llmcompressor.core import active_session +from tests.e2e.e2e_utils import 
run_oneshot_for_e2e_testing +from tests.examples.utils import requires_gpu_count try: from vllm import LLM, SamplingParams @@ -21,118 +18,131 @@ vllm_installed = True except ImportError: vllm_installed = False + logger.warning("vllm is not installed. This test will be skipped") + +HF_MODEL_HUB_NAME = "nm-testing" +TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", "") + -# Defines the file paths to the directories containing the test configs -# for each of the quantization schemes -WNA16 = "tests/e2e/vLLM/configs/WNA16" -FP8 = "tests/e2e/vLLM/configs/FP8" -INT8 = "tests/e2e/vLLM/configs/INT8" -ACTORDER = "tests/e2e/vLLM/configs/actorder" -CONFIGS = [WNA16, FP8, INT8, ACTORDER] +@pytest.fixture +def record_config_file(record_testsuite_property: Callable[[str, object], None]): + test_data_file_name = TEST_DATA_FILE.split("configs/")[-1] + record_testsuite_property("TEST_DATA_FILE_NAME", test_data_file_name) -@requires_gpu -@requires_torch +# Will run each test case in its own process through run_tests.sh +# emulating vLLM CI testing +@requires_gpu_count(1) @pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test") -@parameterized_class(parse_params(CONFIGS)) -class TestvLLM(unittest.TestCase): +class TestvLLM: """ The following test quantizes a model using a preset scheme or recipe, runs the model using vLLM, and then pushes the model to the hub for future use. Each test case is focused on a specific quantization type (e.g W4A16 with grouped quantization, W4N16 with channel quantization). - To add a new test case, a new config has to be added to one of the folders - listed in the `CONFIGS` folder. If the test case is for a data type not listed - in `CONFIGS`, a new folder can be created and added to the list. The tests - run on a cadence defined by the `cadence` field. Each config defines the model - to quantize. Optionally, a dataset id and split can be provided for calibration. - Finally, all config files must list a scheme. 
The scheme can be a preset scheme - from https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py + To add a new test case, a new config has to be added to the `configs` folder. + The tests run on a cadence defined by the `cadence` field. Each config defines + the model to quantize. Optionally, a dataset id and split can be provided for + calibration. Finally, all config files must list a scheme. The scheme can be a + preset scheme from + https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py or another identifier which can be used for the particular test case. If a recipe is not provided, it is assumed that the scheme provided is a preset scheme and will be used for quantization. Otherwise, the recipe will always be used if given. """ # noqa: E501 - model = None - scheme = None - dataset_id = None - dataset_config = None - dataset_split = None - recipe = None - save_dir = None + def set_up(self): + eval_config = yaml.safe_load(Path(TEST_DATA_FILE).read_text(encoding="utf-8")) - def setUp(self): - print("========== RUNNING ==============") - print(self.scheme) + if os.environ.get("CADENCE", "commit") != eval_config.get("cadence"): + pytest.skip("Skipping test; cadence mismatch") + + self.model = eval_config["model"] + self.scheme = eval_config.get("scheme") + self.dataset_id = eval_config.get("dataset_id") + self.dataset_config = eval_config.get("dataset_config") + self.dataset_split = eval_config.get("dataset_split") + self.recipe = eval_config.get("recipe") + self.quant_type = eval_config.get("quant_type") + self.save_dir = eval_config.get("save_dir") + + logger.info("========== RUNNING ==============") + logger.info(self.scheme) self.device = "cuda:0" - self.oneshot_kwargs = {} self.num_calibration_samples = 256 - self.max_seq_length = 1048 + self.max_seq_length = 2048 self.prompts = [ "The capital of France is", "The president of the US is", "My name 
is", ] + self.api = HfApi() + @pytest.mark.usefixtures("record_config_file") def test_vllm(self): - # Load model. - loaded_model = SparseAutoModelForCausalLM.from_pretrained( - self.model, device_map=self.device, torch_dtype="auto" - ) - tokenizer = AutoTokenizer.from_pretrained(self.model) - - if self.dataset_id: - ds = load_dataset( - self.dataset_id, name=self.dataset_config, split=self.dataset_split - ) - ds = ds.shuffle(seed=42).select(range(self.num_calibration_samples)) - ds = preprocess_tokenize_dataset(ds, tokenizer, self.max_seq_length) - self.oneshot_kwargs["dataset"] = ds - self.oneshot_kwargs["max_seq_length"] = self.max_seq_length - self.oneshot_kwargs["num_calibration_samples"] = ( - self.num_calibration_samples - ) - - if self.save_dir is None: - self.save_dir = self.model.split("/")[1] + f"-{self.scheme}" + # Run vLLM with saved model + import torch - self.oneshot_kwargs["model"] = loaded_model - if self.recipe: - self.oneshot_kwargs["recipe"] = self.recipe - else: - # Test assumes that if a recipe was not provided, using - # a compatible preset sceme - self.oneshot_kwargs["recipe"] = QuantizationModifier( - targets="Linear", scheme=self.scheme, ignore=["lm_head"] - ) - - # Apply quantization. 
- print("ONESHOT KWARGS", self.oneshot_kwargs) - oneshot( - **self.oneshot_kwargs, - clear_sparse_session=True, - oneshot_device=self.device, + self.set_up() + if not self.save_dir: + self.save_dir = self.model.split("/")[1] + f"-{self.scheme}" + oneshot_model, tokenizer = run_oneshot_for_e2e_testing( + model=self.model, + device=self.device, + num_calibration_samples=self.num_calibration_samples, + max_seq_length=self.max_seq_length, + scheme=self.scheme, + dataset_id=self.dataset_id, + dataset_config=self.dataset_config, + dataset_split=self.dataset_split, + recipe=self.recipe, + quant_type=self.quant_type, ) - self.oneshot_kwargs["model"].save_pretrained(self.save_dir) + + logger.info("================= SAVING TO DISK ======================") + oneshot_model.save_pretrained(self.save_dir) tokenizer.save_pretrained(self.save_dir) - # Run vLLM with saved model - print("================= RUNNING vLLM =========================") + recipe_path = os.path.join(self.save_dir, "recipe.yaml") + + # Use the session to fetch the recipe; + # Reset session for next test case + session = active_session() + recipe_yaml_str = session.get_serialized_recipe() + with open(recipe_path, "w") as fp: + fp.write(recipe_yaml_str) + session.reset() + + logger.info("================= UPLOADING TO HUB ======================") + + self.api.upload_folder( + repo_id=f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e", + folder_path=self.save_dir, + ) + + logger.info("================= RUNNING vLLM =========================") + sampling_params = SamplingParams(temperature=0.80, top_p=0.95) - llm = LLM(model=self.save_dir) + if "W4A16_2of4" in self.scheme: + # required by the kernel + llm = LLM(model=self.save_dir, dtype=torch.float16) + else: + llm = LLM(model=self.save_dir) outputs = llm.generate(self.prompts, sampling_params) - print("================= vLLM GENERATION ======================") + + logger.info("================= vLLM GENERATION ======================") for output in outputs: assert 
output prompt = output.prompt generated_text = output.outputs[0].text - print("PROMPT", prompt) - print("GENERATED TEXT", generated_text) - print("================= UPLOADING TO HUB ======================") - self.oneshot_kwargs["model"].push_to_hub(f"nm-testing/{self.save_dir}-e2e") - tokenizer.push_to_hub(f"nm-testing/{self.save_dir}-e2e") + logger.info("PROMPT") + logger.info(prompt) + logger.info("GENERATED TEXT") + logger.info(generated_text) + + self.tear_down() - def tearDown(self): + def tear_down(self): if self.save_dir is not None: shutil.rmtree(self.save_dir) diff --git a/tests/examples/test_big_models_with_accelerate.py b/tests/examples/test_big_models_with_accelerate.py index 26141ec88..3901bfd70 100644 --- a/tests/examples/test_big_models_with_accelerate.py +++ b/tests/examples/test_big_models_with_accelerate.py @@ -49,9 +49,9 @@ def test_readme_has_install_command(self, example_dir: str): ], ), pytest.param( - "multi_gpu_int8_sequential_update.py", + "mult_gpus_int8_device_map.py", "", - id="multi_gpu_int8_sequential_update", + id="mult_gpus_int8_device_map", marks=[requires_gpu_count(2), pytest.mark.multi_gpu], ), ], diff --git a/tests/examples/test_quantization_24_sparse_w4a16.py b/tests/examples/test_quantization_2of4_sparse_w4a16.py similarity index 89% rename from tests/examples/test_quantization_24_sparse_w4a16.py rename to tests/examples/test_quantization_2of4_sparse_w4a16.py index ffb5931fd..b85e6098a 100644 --- a/tests/examples/test_quantization_24_sparse_w4a16.py +++ b/tests/examples/test_quantization_2of4_sparse_w4a16.py @@ -16,14 +16,14 @@ @pytest.fixture def example_dir() -> str: - return "examples/quantization_24_sparse_w4a16" + return "examples/quantization_2of4_sparse_w4a16" @pytest.mark.example @requires_gpu_count(1) class TestQuantization24SparseW4A16: """ - Tests for examples in the "quantization_24_sparse_w4a16" example folder. + Tests for examples in the "quantization_2of4_sparse_w4a16" example folder. 
""" def test_doc_example_command(self, example_dir: str, tmp_path: Path): @@ -52,7 +52,7 @@ def test_alternative_recipe(self, example_dir: str, tmp_path: Path): script_path = tmp_path / example_dir / script_filename content = script_path.read_text(encoding="utf-8") content = content.replace( - "2:4_w4a16_recipe.yaml", "2:4_w4a16_group-128_recipe.yaml" + "2of4_w4a16_recipe.yaml", "2of4_w4a16_group-128_recipe.yaml" ) script_path.write_text(content, encoding="utf-8") diff --git a/tests/llmcompressor/modifiers/calibration/test_cache.py b/tests/llmcompressor/modifiers/calibration/test_cache.py new file mode 100644 index 000000000..6ea024037 --- /dev/null +++ b/tests/llmcompressor/modifiers/calibration/test_cache.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from compressed_tensors.quantization.quant_args import QuantizationArgs + +from llmcompressor.modifiers.quantization.cache import QuantizedKVParameterCache +from llmcompressor.observers import Observer + + +def test_is_quantized_cache_singleton(): + """ + Check if quantized_cache is a singleton, used for + passing in QuantizedKVParameterCache to the forward call of + the model's self_attn + """ + + args = QuantizationArgs() + cache = QuantizedKVParameterCache(args) + observer = args.get_observer() + observer = Observer.load_from_registry(observer, quantization_args=args) + + tensor = torch.tensor([1, 2, 3]) + cache.k_scales.append(tensor) + cache.k_observers.append(observer) + + same_cache = QuantizedKVParameterCache(args) + + assert len(cache.k_scales) == len(same_cache.k_scales) + assert torch.equal(cache.k_scales[0], same_cache.k_scales[0]) + + assert cache.k_observers == same_cache.k_observers + assert hex(id(cache.k_observers[0])) == hex(id(same_cache.k_observers[0])) + + cache.reset() + + +def test_update(): + nbits = 8 + args = QuantizationArgs(nbits=nbits, symmetric=True) + cache = QuantizedKVParameterCache(args) + + max_key_states_val = 1.0 + max_value_states_val = 2.0 + key_states = torch.cat( + (max_key_states_val * torch.ones(1, 2, 2), torch.ones(1, 2, 2)), dim=0 + ) + value_states = torch.cat( + (max_value_states_val * torch.ones(1, 2, 2), torch.ones(1, 2, 2)), dim=0 + ) + layer_idx = 0 + + cache.update(key_states, value_states, layer_idx) + denom = (2 ** (nbits) - 1) / 2 + expected_k_scale = torch.tensor([max_key_states_val / denom]) + expected_v_scale = torch.tensor([max_value_states_val / denom]) + + assert cache.k_scales[0] == expected_k_scale + assert cache.v_scales[0] == expected_v_scale + + # new attn layer + layer_idx = 1 + cache.update(key_states, value_states, layer_idx) + + assert len(cache.k_scales) == 2 + assert len(cache.v_scales) == 2 + + assert len(cache.k_observers) == 2 + assert len(cache.v_observers) == 2 + + 
cache.reset() + + +def test_cache_reset(): + nbits = 8 + args = QuantizationArgs(nbits=nbits, symmetric=True) + cache = QuantizedKVParameterCache(args) + + max_key_states_val = 1.0 + max_value_states_val = 2.0 + key_states = torch.cat( + (max_key_states_val * torch.ones(1, 2, 2), torch.ones(1, 2, 2)), dim=0 + ) + value_states = torch.cat( + (max_value_states_val * torch.ones(1, 2, 2), torch.ones(1, 2, 2)), dim=0 + ) + layer_idx = 0 + + cache.update(key_states, value_states, layer_idx) + assert len(cache.k_scales) == 1 + assert len(cache.v_scales) == 1 + + assert len(cache.k_observers) == 1 + assert len(cache.v_observers) == 1 + + cache.reset() + + # new instance, different memory addr + different_cache = QuantizedKVParameterCache(args) + + assert len(different_cache.k_scales) == 0 + assert len(different_cache.v_scales) == 0 + + assert len(different_cache.k_observers) == 0 + assert len(different_cache.v_observers) == 0 + + assert hex(id(cache)) != hex(id(different_cache)) diff --git a/tests/llmcompressor/modifiers/calibration/test_frozen.py b/tests/llmcompressor/modifiers/calibration/test_frozen.py new file mode 100644 index 000000000..4b89a0084 --- /dev/null +++ b/tests/llmcompressor/modifiers/calibration/test_frozen.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from compressed_tensors.quantization.lifecycle.initialize import ( + initialize_module_for_quantization, +) +from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.quantization.quant_config import QuantizationStatus +from compressed_tensors.quantization.quant_scheme import QuantizationScheme +from torch.nn import Linear + +from llmcompressor.modifiers.quantization.calibration import ( + freeze_module_quantization, + initialize_observer, +) + + +def test_set_module_for_calibration(): + num_bits = 8 + quantization_scheme = QuantizationScheme( + targets=["*"], + weights=QuantizationArgs(num_bits=num_bits, symmetric=True), + input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), + ) + + layer = Linear(4, 4) + + initialize_module_for_quantization(layer, quantization_scheme) + layer.quantization_status = QuantizationStatus("calibration") + initialize_observer(layer, "weight") + + # should have both input and weight observer after initalizing + assert hasattr(layer, "weight_observer") + + # observers should get deleted after freezing + freeze_module_quantization(layer) + assert not hasattr(layer, "input_observer") + assert not hasattr(layer, "weight_observer") + + assert layer.quantization_status == QuantizationStatus("frozen") diff --git a/tests/llmcompressor/modifiers/calibration/test_kv_cache.py b/tests/llmcompressor/modifiers/calibration/test_kv_cache.py new file mode 100644 index 000000000..25b8468f4 --- /dev/null +++ b/tests/llmcompressor/modifiers/calibration/test_kv_cache.py @@ -0,0 +1,94 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch +from compressed_tensors.quantization import ( + QuantizationConfig, + QuantizationStatus, + apply_quantization_config, + is_attention_module, +) +from transformers import AutoModelForCausalLM + +from llmcompressor.modifiers.quantization.calibration import ( + calibrate_kv_cache_input_hook, + calibrate_kv_cache_output_hook, + freeze_module_quantization, + set_unset_kv_cache, +) + +config = { + "quant_method": "compressed-tensors", + "format": "fakequant", + "kv_cache_scheme": { + "num_bits": 8, + "type": "int", + "symmetric": True, + "strategy": "tensor", + }, + "config_groups": { + "group_1": { + "weights": { + "num_bits": 4, + "type": "int", + "symmetric": True, + "strategy": "tensor", + }, + "targets": ["Linear"], + }, + }, +} + + +def _prep_for_calibration(module: torch.nn.Module): + if is_attention_module(module): + module.register_forward_pre_hook( + calibrate_kv_cache_input_hook, with_kwargs=True + ) + module.register_forward_hook(calibrate_kv_cache_output_hook) + module.quantization_status = QuantizationStatus.CALIBRATION + + +@pytest.mark.parametrize("config", [config]) +def test_kv_cache_quantization(config): + sample = { + name: torch.ones((1, 32)).long() + for name in ["input_ids", "attention_mask", "labels"] + } + model = AutoModelForCausalLM.from_pretrained( + "HuggingFaceM4/tiny-random-LlamaForCausalLM", + torch_dtype="auto", + ) + model.eval() + + config = QuantizationConfig(**config) + config.quantization_status = QuantizationStatus.CALIBRATION + apply_quantization_config(model, config) + 
model.apply(set_unset_kv_cache) + model.apply(_prep_for_calibration) + + with torch.no_grad(): + _ = model(**sample) + + model.apply(freeze_module_quantization) + + reloaded_config = QuantizationConfig.from_pretrained(model) + + assert ( + config.kv_cache_scheme.model_dump().keys() + == reloaded_config.kv_cache_scheme.model_dump().keys() + ) + assert list(config.kv_cache_scheme.model_dump().values()) == list( + reloaded_config.kv_cache_scheme.model_dump().values() + ) diff --git a/tests/llmcompressor/modifiers/smoothquant/test_base.py b/tests/llmcompressor/modifiers/smoothquant/test_base.py index e2272cd83..335ddf624 100644 --- a/tests/llmcompressor/modifiers/smoothquant/test_base.py +++ b/tests/llmcompressor/modifiers/smoothquant/test_base.py @@ -3,10 +3,7 @@ import pytest from llmcompressor.modifiers.factory import ModifierFactory -from llmcompressor.modifiers.smoothquant.base import ( - DEFAULT_SMOOTHQUANT_MAPPINGS, - SmoothQuantModifier, -) +from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier from tests.llmcompressor.modifiers.conf import setup_modifier_factory @@ -45,7 +42,6 @@ def setUp(self): def test_defaults(self): default_sq = SmoothQuantModifier() assert default_sq.smoothing_strength == 0.5 - assert default_sq.mappings == DEFAULT_SMOOTHQUANT_MAPPINGS def test_override_defaults(self): strength = 0.7 diff --git a/tests/llmcompressor/modifiers/smoothquant/test_utils.py b/tests/llmcompressor/modifiers/smoothquant/test_utils.py new file mode 100644 index 000000000..95be6bd30 --- /dev/null +++ b/tests/llmcompressor/modifiers/smoothquant/test_utils.py @@ -0,0 +1,39 @@ +from unittest.mock import patch + +import pytest + +from llmcompressor.modifiers.smoothquant.utils import ( + get_layer_mappings_from_architecture, + handle_mapping_resolution_errors, +) + +smoothquant_utils = "llmcompressor.modifiers.smoothquant.utils" + + +@pytest.mark.unit +def test_handle_mapping_resolution_errors(): + README_LOCATION = 
"llmcompressor/modifiers/smoothquant/README.md" + + @handle_mapping_resolution_errors + def func_that_raises_exception(): + raise ValueError("An error occurred") + + with pytest.raises(RuntimeError) as excinfo: + func_that_raises_exception() + + assert "Error resolving mappings for given architecture." in str(excinfo.value) + assert "Please refer to the README at" in str(excinfo.value) + assert README_LOCATION in str(excinfo.value) + + +@pytest.mark.unit +@patch( + f"{smoothquant_utils}.MAPPINGS_REGISTRY", {"arch1": "mapping1", "arch2": "mapping2"} +) +@patch(f"{smoothquant_utils}.DEFAULT_SMOOTHQUANT_MAPPINGS", "default_mapping") +def test_get_layer_mappings_from_architecture(): + # Test when architecture is in MAPPINGS_REGISTRY + assert get_layer_mappings_from_architecture("arch1") == "mapping1" + + # Test when architecture is not in MAPPINGS_REGISTRY + assert get_layer_mappings_from_architecture("arch3") == "default_mapping" diff --git a/tests/llmcompressor/modifiers/utils/test_hooks.py b/tests/llmcompressor/modifiers/utils/test_hooks.py new file mode 100644 index 000000000..5c4fc5891 --- /dev/null +++ b/tests/llmcompressor/modifiers/utils/test_hooks.py @@ -0,0 +1,83 @@ +import torch + +from llmcompressor.modifiers.utils.hooks import HooksMixin + + +class DummyModel(torch.nn.Module): + """Dummy Model for testing hooks""" + + def __init__(self): + super(DummyModel, self).__init__() + + self.linear1 = torch.nn.Linear(1, 2) + self.linear2 = torch.nn.Linear(2, 3) + self.linear3 = torch.nn.Linear(3, 1) + self.dummy_inputs = torch.tensor([0.0]) + + def forward(self, x): + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) + + return x + + +class DummyMod(HooksMixin): + hook_called: bool = False + + def hook(self, *args, **kwargs): + self.hook_called = True + + +class ModA(DummyMod): + pass + + +class ModB(DummyMod): + pass + + +def test_register_hook(): + model = DummyModel() + + mod_a = ModA() + mod_a.register_hook(model.linear1, mod_a.hook, "forward") + 
+ mod_b = ModB() + mod_b.register_hook(model.linear2, mod_b.hook, "forward_pre") + + model(model.dummy_inputs) + assert mod_a.hook_called and mod_b.hook_called + + +def test_remove_hooks(): + model = DummyModel() + + mod_a = ModA() + mod_a.register_hook(model.linear1, mod_a.hook, "forward") + + mod_b = ModB() + mod_b.register_hook(model.linear2, mod_b.hook, "forward_pre") + mod_b.remove_hooks() + + model(model.dummy_inputs) + assert mod_a.hook_called and not mod_b.hook_called + + +def test_disable_hooks(): + model = DummyModel() + + mod_a = ModA() + mod_a.register_hook(model.linear1, mod_a.hook, "forward") + + mod_b = ModB() + mod_b.register_hook(model.linear2, mod_b.hook, "forward_pre") + + with HooksMixin.disable_hooks(): + model(model.dummy_inputs) + assert not mod_a.hook_called and not mod_b.hook_called + + mod_a.hook_called = False + mod_b.hook_called = False + model(model.dummy_inputs) + assert mod_a.hook_called and mod_b.hook_called diff --git a/tests/llmcompressor/observers/__init__.py b/tests/llmcompressor/observers/__init__.py new file mode 100644 index 000000000..0c44f887a --- /dev/null +++ b/tests/llmcompressor/observers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/llmcompressor/observers/test_helpers.py b/tests/llmcompressor/observers/test_helpers.py new file mode 100644 index 000000000..527176019 --- /dev/null +++ b/tests/llmcompressor/observers/test_helpers.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from compressed_tensors.quantization import ( + QuantizationConfig, + QuantizationStatus, + apply_quantization_config, +) +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modifiers.quantization.calibration import ( + calibrate_input_hook, + initialize_observer, +) +from llmcompressor.observers.helpers import get_observer_token_count + + +def _prep_for_input_quant_calibration(module: torch.nn.Module): + quantization_scheme = getattr(module, "quantization_scheme", None) + if not quantization_scheme: + return + + module.register_forward_pre_hook(calibrate_input_hook) + module.quantization_status = QuantizationStatus.CALIBRATION + + +def test_get_observer_token_count(): + model = AutoModelForCausalLM.from_pretrained("Isotonic/TinyMixtral-4x248M-MoE") + tokenizer = AutoTokenizer.from_pretrained("Isotonic/TinyMixtral-4x248M-MoE") + model.eval() + config = QuantizationConfig( + format="fakequant", + quantization_status="calibration", + config_groups={ + "group_1": { + "input_activations": { + "num_bits": 8, + "type": "int", + "symmetric": False, + "strategy": "tensor", + 
}, + "targets": ["Linear"], + }, + }, + ) + apply_quantization_config(model, config) + model.apply(lambda module: initialize_observer(module, base_name="input")) + model.apply(_prep_for_input_quant_calibration) + + # start calibration + calib_list = [ + "I am a string that", + "is used for calibration so", + "that your model is", + "quantized properly.", + ] + + total_num_tokens_observed = 0 + for calib_sample in calib_list: + calib_tensor = tokenizer(calib_sample, return_tensors="pt") + _ = model(**calib_tensor) + total_num_tokens_observed += len(calib_tensor.input_ids.flatten()) + + counter = get_observer_token_count(model) + + # filter out the None values + # (tokens, in the appropriate format, that were not observed by the model) + counter = {k: v for k, v in counter.items() if v is not None} + + # iterate over all the layers in the model where the token count in the proper + # format is has been observed + for i in range(model.config.num_hidden_layers): + # fetch the tokens observed by the router + tokens_observed_by_router = counter.pop( + f"model.layers.{i}.block_sparse_moe.gate" + ) + assert tokens_observed_by_router == total_num_tokens_observed + + # fetch the sum of tokens observed by all the experts + sum_tokens_observed_by_experts = 0 + keys_for_this_layer = [ + k + for k in counter.keys() + if f"model.layers.{i}.block_sparse_moe.experts" in k + ] + for key in keys_for_this_layer: + sum_tokens_observed_by_experts += counter.pop(key) + + # each Mixtral expert is comprised of 3 linear layers, + # so we need to multiply by 3 + assert ( + sum_tokens_observed_by_experts + == total_num_tokens_observed * model.config.num_experts_per_tok * 3 + ) + + # there are no more information in the counter + assert len(counter) == 0 diff --git a/tests/llmcompressor/observers/test_min_max.py b/tests/llmcompressor/observers/test_min_max.py new file mode 100644 index 000000000..f23a06dba --- /dev/null +++ b/tests/llmcompressor/observers/test_min_max.py @@ -0,0 +1,118 @@ +# 
Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +import torch +from compressed_tensors.quantization.quant_args import QuantizationArgs + +from llmcompressor.observers import Observer + + +def make_dummy_g_idx(columns: int, group_size: int) -> torch.Tensor: + perm = torch.randperm(columns) + return torch.tensor([index // group_size for index in range(columns)])[perm] + + +@pytest.mark.parametrize( + "symmetric,expected_scale,expected_zero_point", + [ + (True, 0.0078, 0), + (False, 0.0039, -128), + ], +) +def test_min_max_observer(symmetric, expected_scale, expected_zero_point): + tensor = torch.tensor([1, 1, 1, 1, 1]) + num_bits = 8 + weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric) + + observer = weights.get_observer() + observer = Observer.load_from_registry(observer, quantization_args=weights) + scale, zero_point = observer(tensor) + + assert round(scale.item(), 4) == expected_scale + assert round(zero_point.item(), 4) == expected_zero_point + + +def test_min_max_observer_symmetric_scale_range(): + tensor = torch.rand(4, 4) + tensor *= 127 + + num_bits = 8 + weights = QuantizationArgs(num_bits=num_bits, symmetric=True) + + observer = weights.get_observer() + observer = Observer.load_from_registry(observer, quantization_args=weights) + scale, zero_point = observer(tensor) + + # if symmetric, max symmetric_range = abs(-128) / 255 + assert round(scale.item(), 4) <= 
1.0039 + assert round(zero_point.item(), 4) == 0 + + +def test_min_max_observer_value_update(): + inp = torch.tensor([1, 1, 1, 1, 1]) + inp_update_max = torch.tensor([127, 1, 1, 1, 1]) + inp_update_min = torch.tensor([-128, 1, 1, 1, 1]) + + delta = 1e-6 + + # update the min, max twice total + tensors = [ + inp, + inp, + inp_update_max, # update max + inp, + inp_update_min, # update min + ] + + tensor = inp + num_bits = 8 + weights = QuantizationArgs(num_bits=num_bits, symmetric=True) + observer = weights.get_observer() + observer = Observer.load_from_registry(observer, quantization_args=weights) + curr_max = 1 + curr_min = 1 + for i, tensor in enumerate(tensors): + observer(tensor) + curr_max = max(observer.max_val.get("default"), curr_max) + curr_min = min(observer.min_val.get("default"), curr_max) + + if i < 2: + assert curr_max == 1 + assert curr_min == 1 + elif i < 4: + assert abs(curr_max - 2.2600) < delta + assert curr_min == 1 + else: + assert abs(curr_max - 2.2600) < delta + assert abs(curr_min - (-0.2900)) < delta + + +def test_g_idx(): + group_size = 2 + input_shape = (128, 512) + tensor = torch.rand(input_shape) + weights = QuantizationArgs(num_bits=8, group_size=group_size) + g_idx = make_dummy_g_idx(tensor.shape[1], group_size) + + observer = weights.get_observer() + observer = Observer.load_from_registry(observer, quantization_args=weights) + scale_g_idx, zero_point_g_idx = observer(tensor, g_idx=g_idx) + + observer.reset() + scale, zero_point = observer(tensor[:, torch.argsort(g_idx)]) + + assert scale_g_idx == pytest.approx(scale) + assert zero_point_g_idx == pytest.approx(zero_point) diff --git a/tests/llmcompressor/observers/test_mse.py b/tests/llmcompressor/observers/test_mse.py new file mode 100644 index 000000000..ec2ecf1b5 --- /dev/null +++ b/tests/llmcompressor/observers/test_mse.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +import torch +from compressed_tensors.quantization.quant_args import QuantizationArgs + +from llmcompressor.observers import MovingAverageMSEObserver, Observer + + +@pytest.mark.parametrize( + "symmetric,expected_scale,expected_zero_point", + [ + (True, 0.0078, 0), + (False, 0.0039, -128), + ], +) +def test_mse_observer(symmetric, expected_scale, expected_zero_point): + tensor = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0]) + num_bits = 8 + weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric, observer="mse") + + observer = weights.get_observer() + observer = Observer.load_from_registry(observer, quantization_args=weights) + scale, zero_point = observer(tensor) + + assert isinstance(observer, MovingAverageMSEObserver) + assert round(scale.item(), 4) == expected_scale + assert round(zero_point.item(), 4) == expected_zero_point + + +def test_mse_observer_symmetric_scale_range(): + tensor = torch.rand(4, 4) + tensor *= 127 + + num_bits = 8 + weights = QuantizationArgs(num_bits=num_bits, symmetric=True) + + observer = weights.get_observer() + observer = Observer.load_from_registry(observer, quantization_args=weights) + scale, zero_point = observer(tensor) + + # if symmetric, max symmetric_range = abs(-128) / 255 + assert round(scale.item(), 4) <= 1.0039 + assert round(zero_point.item(), 4) == 0 diff --git a/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py 
b/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py index 5421af4cf..1a229a6aa 100644 --- a/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py +++ b/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py @@ -13,11 +13,9 @@ setup_modifier_factory, ) from tests.llmcompressor.pytorch.helpers import LinearNet -from tests.testing_utils import requires_torch @pytest.mark.unit -@requires_torch class TestInvalidLayerwiseRecipesRaiseExceptions(unittest.TestCase): def setUp(self): setup_modifier_factory() @@ -45,7 +43,6 @@ def test_invalid_layerwise_recipes_raise_exceptions(self, sparsity, targets): @pytest.mark.unit -@requires_torch class TestSuccessfulLayerwiseRecipe(unittest.TestCase): def setUp(self): setup_modifier_factory() @@ -66,7 +63,6 @@ def test_successful_layerwise_recipe(self): @pytest.mark.unit -@requires_torch class TestCreateDefaultQuantModifier(unittest.TestCase): def setUp(self): setup_modifier_factory() @@ -91,7 +87,6 @@ def test_create_default_quant_modifier(self): @pytest.mark.unit -@requires_torch class TestSetQuantIfModifierAlreadyExists(unittest.TestCase): def setUp(self): setup_modifier_factory() diff --git a/tests/llmcompressor/pytorch/modifiers/pruning/wanda/test_pytorch.py b/tests/llmcompressor/pytorch/modifiers/pruning/wanda/test_pytorch.py index 4fac2b12f..b2050c179 100644 --- a/tests/llmcompressor/pytorch/modifiers/pruning/wanda/test_pytorch.py +++ b/tests/llmcompressor/pytorch/modifiers/pruning/wanda/test_pytorch.py @@ -4,11 +4,9 @@ from llmcompressor.modifiers.factory import ModifierFactory from tests.llmcompressor.modifiers.conf import setup_modifier_factory -from tests.testing_utils import requires_torch @pytest.mark.unit -@requires_torch class TestWandaPytorchIsRegistered(unittest.TestCase): def setUp(self): self.kwargs = dict( diff --git a/tests/llmcompressor/pytorch/modifiers/smoothquant/test_pytorch.py b/tests/llmcompressor/pytorch/modifiers/smoothquant/test_pytorch.py index 
c521a1361..7977c4546 100644 --- a/tests/llmcompressor/pytorch/modifiers/smoothquant/test_pytorch.py +++ b/tests/llmcompressor/pytorch/modifiers/smoothquant/test_pytorch.py @@ -6,11 +6,9 @@ from llmcompressor.core import State from llmcompressor.modifiers.smoothquant import SmoothQuantModifier from tests.llmcompressor.pytorch.helpers import LinearNet -from tests.testing_utils import requires_torch @pytest.mark.unit -@requires_torch class TestSmoothQuantMapping(unittest.TestCase): def setUp(self): self.model = LinearNet() diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml index 44ad696b4..6ee1d31d5 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml @@ -15,5 +15,4 @@ test_stage: output_activations: null targets: ["Linear"] GPTQModifier: - block_size: 128 - sequential_update: False \ No newline at end of file + block_size: 128 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml index def4b362f..468259a9c 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml @@ -15,5 +15,4 @@ test_stage: output_activations: null targets: ["Linear"] GPTQModifier: - block_size: 128 - sequential_update: False \ No newline at end of file + block_size: 128 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml index 0d386df13..f36ac0595 100644 --- 
a/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml @@ -13,6 +13,4 @@ test_stage: output_activations: null targets: ["Linear"] GPTQModifier: - block_size: 128 - sequential_update: False - \ No newline at end of file + block_size: 128 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml index f3c2db93c..6df9cd8af 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml @@ -2,7 +2,6 @@ test_stage: quant_modifiers: GPTQModifier: block_size: 128 - sequential_update: False ignore: ["lm_head", "model.layers.0.mlp.down_proj"] config_groups: group_0: diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml index 31ba456bd..02387f6c9 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml @@ -14,5 +14,4 @@ test_stage: output_activations: null targets: ["Linear"] GPTQModifier: - block_size: 128 - sequential_update: False \ No newline at end of file + block_size: 128 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml index 34e0a77e0..67aa5df3f 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml @@ -14,5 +14,4 @@ test_stage: targets: ["Linear", "Embedding"] GPTQModifier: block_size: 128 - sequential_update: False targets: ["re:model.layers.\\d+$"] \ No newline at 
end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml index 6159646ed..d516616bf 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml @@ -1,3 +1,4 @@ cadence: "commit" test_type: "regression" -model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" \ No newline at end of file +model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" +empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml index 844cf457d..7e9bc3f2f 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml @@ -1,3 +1,4 @@ cadence: "commit" test_type: "regression" -model_stub: "nm-testing/tinyllama-w4a16-compressed" \ No newline at end of file +model_stub: "nm-testing/tinyllama-w4a16-compressed" +empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml index 367d3fd4f..af1e5df8b 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml @@ -1,3 +1,4 @@ cadence: "commit" test_type: "regression" -model_stub: "nm-testing/tinyllama-w8a16-dense" \ No newline at end of file +model_stub: "nm-testing/tinyllama-w8a16-dense" +empty_model: 
"TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml index 583edba18..086a67ed6 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml @@ -1,3 +1,4 @@ cadence: "commit" test_type: "regression" -model_stub: "nm-testing/tinyllama-w8a8-compressed" \ No newline at end of file +model_stub: "nm-testing/tinyllama-w8a8-compressed" +empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/test_quantization.py b/tests/llmcompressor/transformers/compression/test_quantization.py index a96cb4e02..c0f0d2c02 100644 --- a/tests/llmcompressor/transformers/compression/test_quantization.py +++ b/tests/llmcompressor/transformers/compression/test_quantization.py @@ -8,18 +8,17 @@ from compressed_tensors.quantization.utils import is_module_quantized from parameterized import parameterized_class from torch.utils.data import DataLoader -from transformers import AutoTokenizer, DefaultDataCollator +from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator from llmcompressor.pytorch.utils import tensors_to_device -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot from llmcompressor.transformers.finetune.data import TextGenerationDataset from llmcompressor.transformers.finetune.data.data_args import DataTrainingArguments -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/compression/configs" -@requires_torch @requires_gpu @pytest.mark.integration 
@parameterized_class(parse_params(CONFIGS_DIRECTORY)) @@ -37,15 +36,16 @@ class TestQuantizationMatches(unittest.TestCase): def setUpClass(cls): cls.test_dir = tempfile.mkdtemp() - cls.model = SparseAutoModelForCausalLM.from_pretrained( + cls.model = AutoModelForCausalLM.from_pretrained( cls.model_stub, torch_dtype=cls.weight_dtype, device_map="cuda:0" ) - cls._run_oneshot( + model = cls._run_oneshot( cls.model, cls.new_recipe, cls.dataset, os.path.join(cls.test_dir, cls.output), ) + cls.session_model = model @classmethod def tearDownClass(cls): @@ -68,10 +68,14 @@ def _run_oneshot(model, recipe, dataset, output_dir): num_calibration_samples=num_calibration_samples, recipe=recipe, pad_to_max_length=pad_to_max_length, - clear_sparse_session=True, + clear_sparse_session=False, splits={"calibration": "train_gen[:5%]"}, save_compressed=False, ) + from llmcompressor.pytorch.model_load.helpers import get_session_model + + # note: get_session_model() is None outside of function scope + return get_session_model() def _get_quant_info(self, model): quant_info_weights = {} @@ -96,7 +100,7 @@ def _get_quant_info(self, model): return quant_info_weights, quant_info_inputs def test_quantization_reload(self): - model_reloaded = SparseAutoModelForCausalLM.from_pretrained( + model_reloaded = AutoModelForCausalLM.from_pretrained( os.path.join(self.test_dir, self.output), torch_dtype="auto", device_map="cuda:0", diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 97070377b..0c2a0ab0e 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -3,43 +3,63 @@ import unittest import torch +from compressed_tensors import QUANTIZATION_CONFIG_NAME +from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.quantization import QuantizationStatus from parameterized 
import parameterized_class -from transformers import AutoTokenizer +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from llmcompressor.transformers import SparseAutoModelForCausalLM -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs" -@requires_torch @requires_gpu @parameterized_class(parse_params(CONFIG_DIR)) class TestQuantizationMatches(unittest.TestCase): model_stub = None + empty_model = None @classmethod def setUpClass(cls): cls.test_dir = tempfile.mkdtemp() - cls.compressed_model = SparseAutoModelForCausalLM.from_pretrained( - cls.model_stub, torch_dtype="auto", device_map="auto", run_compressed=True + # TODO: Give option on HFQuantizer to run run_compressed True/False + # currently hardcoded to True + cls.compressed_model = AutoModelForCausalLM.from_pretrained( + cls.model_stub, + torch_dtype="auto", + device_map="auto", + # run_compressed=True, # TODO: Give option on HFQuantizer ) - cls.uncompressed_model = SparseAutoModelForCausalLM.from_pretrained( - cls.model_stub, torch_dtype="auto", device_map="auto", run_compressed=False + # TODO: Use ModelCompressor until decompression is supported through + # HFQuant/run_compressed can be turned off. 
+ cls.uncompressed_model = AutoModelForCausalLM.from_pretrained( + cls.empty_model, + torch_dtype=cls.compressed_model.dtype, + device_map=cls.compressed_model.device, ) + config = AutoConfig.from_pretrained(cls.model_stub) + compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) + cls.compressor = ModelCompressor.from_compression_config(compression_config) + cls.compressor.quantization_config.quantization_status = ( + QuantizationStatus.FROZEN + ) + cls.compressor.decompress( + model_path=cls.model_stub, model=cls.uncompressed_model + ) + cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub) - cls.device = cls.compressed_model.device def test_compressed_matches_uncompressed(self): SAMPLE_INPUT = [ "I love 4-bit quantization because", - "What is the capital of Paris?", + "What is the capital of France?", "def fibonacci(n):", ] inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to( - self.device + self.compressed_model.device ) compressed_output = self.tokenizer.batch_decode( self.compressed_model.generate(**inputs, max_length=50) diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py index 3415858af..a602c4828 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py @@ -8,7 +8,6 @@ from llmcompressor.transformers.finetune.data.data_args import DataTrainingArguments from llmcompressor.transformers.finetune.runner import StageRunner from llmcompressor.transformers.finetune.training_args import TrainingArguments -from tests.testing_utils import requires_torch @pytest.mark.unit @@ -283,7 +282,6 @@ def test_split_loading(self, split_def): self.assertIsInstance(train_dataset[0], dict) -@requires_torch @pytest.mark.unit class TestTokenizationDataset(unittest.TestCase): @pytest.fixture(autouse=True) diff --git 
a/tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml b/tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml index 781d359d4..2b5999c3d 100644 --- a/tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml +++ b/tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml @@ -1,5 +1,5 @@ cadence: "nightly" test_type: "regression" -model: "Xenova/llama2.c-stories15M" +model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0" dataset_config_name: wikitext-2-raw-v1 dataset: wikitext \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py index bcc3c8e85..e7c8e7b9a 100644 --- a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py +++ b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py @@ -10,7 +10,7 @@ import pytest from parameterized import parameterized_class -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_custom" GPU_CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_custom/gpu" @@ -109,9 +109,9 @@ def create_mock_file(self, extension, content, path, filename): def tearDown(self): shutil.rmtree(self.output) + self.monkeypatch.undo() -@requires_torch @pytest.mark.integration @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestOneshotCustomDatasetSmall(TestFinetuneNoRecipeCustomDataset): @@ -122,14 +122,20 @@ class TestOneshotCustomDatasetSmall(TestFinetuneNoRecipeCustomDataset): def setUp(self): import torch - self.device = "cuda:0" if torch.cuda.is_available() else "cpu" + self.monkeypatch = pytest.MonkeyPatch() + + if torch.cuda.is_available(): + self.device = "cuda:0" + 
self.monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0") + else: + self.device = "cpu" + self.output = "./oneshot_output" def test_oneshot_then_finetune_small(self): self._test_finetune_wout_recipe_custom_dataset() -@requires_torch @requires_gpu @pytest.mark.integration @parameterized_class(parse_params(GPU_CONFIGS_DIRECTORY)) @@ -140,15 +146,17 @@ class TestOneshotCustomDatasetGPU(TestFinetuneNoRecipeCustomDataset): def setUp(self): import torch + from transformers import AutoModelForCausalLM - from llmcompressor.transformers import SparseAutoModelForCausalLM - + self.monkeypatch = pytest.MonkeyPatch() self.device = "cuda:0" self.output = "./oneshot_output" + # tearDown calls undo() on this patcher, so it must not be replaced after setenv + self.monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0") - self.model = SparseAutoModelForCausalLM.from_pretrained( + self.model = AutoModelForCausalLM.from_pretrained( self.model, device_map=self.device, torch_dtype=torch.bfloat16 ) def test_oneshot_then_finetune_gpu(self): self._test_finetune_wout_recipe_custom_dataset() diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_oneshot_with_modifier.py b/tests/llmcompressor/transformers/finetune/test_finetune_oneshot_with_modifier.py index 47ef85244..ec517e2d6 100644 --- a/tests/llmcompressor/transformers/finetune/test_finetune_oneshot_with_modifier.py +++ b/tests/llmcompressor/transformers/finetune/test_finetune_oneshot_with_modifier.py @@ -5,13 +5,12 @@ import pytest from parameterized import parameterized_class -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" @pytest.mark.integration -@requires_torch @requires_gpu @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestOneshotWithModifierObject(unittest.TestCase): diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py
b/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py index 0087c7c2d..7facd088e 100644 --- a/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py +++ b/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py @@ -4,13 +4,12 @@ import pytest from parameterized import parameterized_class -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" @pytest.mark.integration -@requires_torch @requires_gpu @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestFinetuneWithoutRecipe(unittest.TestCase): diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py index 66ce22535..870503496 100644 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py +++ b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py @@ -3,11 +3,11 @@ import unittest import pytest -from compressed_tensors.compressors.model_compressor import ModelCompressor +from compressed_tensors.compressors import ModelCompressor from parameterized import parameterized_class from transformers import AutoConfig -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_oneshot_configs" GPU_CONFIGS_DIRECTORY = ( @@ -19,9 +19,9 @@ class TestOneshotAndFinetune(unittest.TestCase): def _test_oneshot_and_finetune(self): from llmcompressor.transformers import apply - splits = {"train": "train[:50%]", "calibration": "train[50%:60%]"} + splits = {"train": "train[:30%]", "calibration": "train[30%:40%]"} if self.dataset == "ultrachat-200k": - splits = {"train": "train_gen[:50%]", "calibration": "train_gen[50%:60%]"} + 
splits = {"train": "train_gen[:30%]", "calibration": "train_gen[30%:40%]"} apply( model=self.model, @@ -56,7 +56,6 @@ def tearDown(self): shutil.rmtree(self.output) -@requires_torch @pytest.mark.integration @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestOneshotAndFinetuneSmall(TestOneshotAndFinetune): @@ -77,7 +76,6 @@ def test_oneshot_then_finetune_small(self): self._test_oneshot_and_finetune() -@requires_torch @requires_gpu @pytest.mark.integration @parameterized_class(parse_params(GPU_CONFIGS_DIRECTORY)) @@ -91,13 +89,12 @@ class TestOneshotAndFinetuneGPU(TestOneshotAndFinetune): def setUp(self): import torch - - from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoModelForCausalLM self.device = "cuda:0" self.output = "./finetune_output" - self.model = SparseAutoModelForCausalLM.from_pretrained( + self.model = AutoModelForCausalLM.from_pretrained( self.model, device_map=self.device, torch_dtype=torch.bfloat16 ) diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py index 1a09a0eea..509464a34 100644 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py +++ b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py @@ -4,13 +4,12 @@ import pytest from parameterized import parameterized_class -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_tokenizer" @pytest.mark.integration -@requires_torch @requires_gpu @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestOneshotAndFinetuneWithTokenizer(unittest.TestCase): @@ -20,12 +19,17 @@ class TestOneshotAndFinetuneWithTokenizer(unittest.TestCase): def setUp(self): self.output = "./finetune_output" + # 
finetune workflows in general seem to have trouble with multi-gpus + # use just one atm + self.monkeypatch = pytest.MonkeyPatch() def test_oneshot_and_finetune_with_tokenizer(self): from datasets import load_dataset - from transformers import AutoTokenizer + from transformers import AutoModelForCausalLM, AutoTokenizer - from llmcompressor.transformers import SparseAutoModelForCausalLM, compress + from llmcompressor.transformers import compress + + self.monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0") recipe_str = ( "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" @@ -33,9 +37,8 @@ def test_oneshot_and_finetune_with_tokenizer(self): tokenizer = AutoTokenizer.from_pretrained( self.model, ) - device = "cuda:0" - model_loaded = SparseAutoModelForCausalLM.from_pretrained( - self.model, device_map=device + model_loaded = AutoModelForCausalLM.from_pretrained( + self.model, device_map="auto" ) dataset_loaded = load_dataset( @@ -60,5 +63,12 @@ def test_oneshot_and_finetune_with_tokenizer(self): tokenizer=tokenizer, ) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + "cuda" + ) + output = model_loaded.generate(input_ids, max_new_tokens=100) + print(tokenizer.decode(output[0])) + def tearDown(self): shutil.rmtree(self.output) + self.monkeypatch.undo() diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py index db81aff05..e9c3d7c5c 100644 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py +++ b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py @@ -5,11 +5,8 @@ import pytest -from tests.testing_utils import requires_torch - @pytest.mark.unit -@requires_torch @pytest.mark.skipif( "CADENCE" in os.environ and (os.environ["CADENCE"] == "weekly" or os.environ["CADENCE"] == "nightly"), @@ -21,15 +18,13 @@ def setUp(self): self.output = Path("./finetune_output") def 
test_oneshot_then_finetune(self): + from transformers import AutoModelForCausalLM + from llmcompressor.core import create_session - from llmcompressor.transformers import ( - SparseAutoModelForCausalLM, - oneshot, - train, - ) + from llmcompressor.transformers import oneshot, train recipe_str = "tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml" - model = SparseAutoModelForCausalLM.from_pretrained( + model = AutoModelForCausalLM.from_pretrained( "Xenova/llama2.c-stories15M", device_map="auto" ) dataset = "open_platypus" @@ -52,18 +47,33 @@ def test_oneshot_then_finetune(self): recipe_str = ( "tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml" ) - model = SparseAutoModelForCausalLM.from_pretrained( + model = AutoModelForCausalLM.from_pretrained( self.output / "oneshot_out", device_map="auto" ) - distill_teacher = SparseAutoModelForCausalLM.from_pretrained( + distill_teacher = AutoModelForCausalLM.from_pretrained( "Xenova/llama2.c-stories15M", device_map="auto" ) dataset = "open_platypus" concatenate_data = False output_dir = self.output / "finetune_out" splits = "train[:50%]" - max_steps = 50 + max_steps = 25 + + with create_session(): + train( + model=model, + distill_teacher=distill_teacher, + dataset=dataset, + output_dir=output_dir, + num_calibration_samples=num_calibration_samples, + recipe=recipe_str, + concatenate_data=concatenate_data, + splits=splits, + max_steps=max_steps, + ) + # test reloading checkpoint and final model + model = AutoModelForCausalLM.from_pretrained(output_dir, device_map="auto") with create_session(): train( model=model, @@ -75,6 +85,7 @@ def test_oneshot_then_finetune(self): concatenate_data=concatenate_data, splits=splits, max_steps=max_steps, + resume_from_checkpoint=True, # use last checkpoint ) def tearDown(self): diff --git a/tests/llmcompressor/transformers/finetune/test_safetensors.py b/tests/llmcompressor/transformers/finetune/test_safetensors.py index 09d08b459..84c1bf1b2 100644 --- 
a/tests/llmcompressor/transformers/finetune/test_safetensors.py +++ b/tests/llmcompressor/transformers/finetune/test_safetensors.py @@ -6,13 +6,12 @@ import pytest from parameterized import parameterized_class -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" @pytest.mark.integration -@requires_torch @requires_gpu @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestSafetensors(unittest.TestCase): diff --git a/tests/llmcompressor/transformers/gptq/test_oneshot.py b/tests/llmcompressor/transformers/gptq/test_oneshot.py index 017679fa5..7f1a1ec99 100644 --- a/tests/llmcompressor/transformers/gptq/test_oneshot.py +++ b/tests/llmcompressor/transformers/gptq/test_oneshot.py @@ -3,18 +3,14 @@ from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme from parameterized import parameterized_class +from transformers import AutoModelForCausalLM from llmcompressor.modifiers.quantization.gptq import GPTQModifier -from llmcompressor.transformers.sparsification.sparse_model import ( - SparseAutoModelForCausalLM, -) -from tests.testing_utils import requires_torch recipe_str = """ quant_stage: quant_modifiers: GPTQModifier: - sequential_update: false ignore: ["lm_head"] config_groups: group_0: @@ -28,7 +24,6 @@ recipe_modifier_full = GPTQModifier( ignore=["lm_head"], - sequential_update=False, config_groups={ "group_0": QuantizationScheme( targets=["Linear"], weights=QuantizationArgs(num_bits=4, strategy="channel") @@ -36,20 +31,30 @@ }, ) +recipe_modifier_full_group = GPTQModifier( + ignore=["lm_head"], + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), + ) + }, +) + recipe_modifier_shorthand_a = GPTQModifier( - ignore=["lm_head"], sequential_update=False, targets="Linear", scheme="W4A16" 
+ ignore=["lm_head"], targets="Linear", scheme="W4A16" ) recipe_modifier_shorthand_b = GPTQModifier( - ignore=["lm_head"], sequential_update=False, scheme={"W4A16": ["Linear"]} + ignore=["lm_head"], scheme={"W4A16": ["Linear"]} ) -@requires_torch @parameterized_class( [ {"recipe": recipe_str}, {"recipe": recipe_modifier_full}, + {"recipe": recipe_modifier_full_group}, {"recipe": recipe_modifier_shorthand_a}, {"recipe": recipe_modifier_shorthand_b}, ] @@ -75,8 +80,9 @@ def test_oneshot_application(self): oneshot_device=self.device, num_calibration_samples=9, ) - - model_loaded = SparseAutoModelForCausalLM.from_pretrained(self.output) + model_loaded = AutoModelForCausalLM.from_pretrained( + self.output, device_map=self.device + ) # Check that the model is quantized # for compression_config - decompress() will attach a quantization_config diff --git a/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml b/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml index 2764ac033..906d0c8da 100644 --- a/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml +++ b/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml @@ -19,7 +19,6 @@ test_stage: ] preserve_sparsity_mask: True GPTQModifier: - sequential_update: False dampening_frac: 0.01 targets: [ "model.layers.0", diff --git a/tests/llmcompressor/transformers/obcq/recipes/quant.yaml b/tests/llmcompressor/transformers/obcq/recipes/quant.yaml index f5dd8a271..435503e50 100644 --- a/tests/llmcompressor/transformers/obcq/recipes/quant.yaml +++ b/tests/llmcompressor/transformers/obcq/recipes/quant.yaml @@ -4,7 +4,6 @@ test_stage: smoothing_strength: 0.6 GPTQModifier: block_size: 128 - sequential_update: False percdamp: 0.01 config_groups: group_0: diff --git a/tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml b/tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml index 
f0e98f0ed..05022fd80 100644 --- a/tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml +++ b/tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml @@ -15,7 +15,6 @@ test_stage: targets: ["Linear"] GPTQModifier: block_size: 128 - sequential_update: True SparseGPTModifier: sparsity: 0.5 block_size: 128 diff --git a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py index bd9142516..2f6c51ebb 100644 --- a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py +++ b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py @@ -6,7 +6,7 @@ import yaml from parameterized import parameterized_class -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs" GPU_CONFIGS_DIRECTORY = ( @@ -83,7 +83,6 @@ def tearDown(self): shutil.rmtree(self.output) -@requires_torch @pytest.mark.integration @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestConsecutiveRunsSmall(TestConsecutiveRuns): @@ -106,7 +105,6 @@ def test_consecutive_runs_small(self): # TODO: @Satrat and @dsikka, revisit if we want these nightly or weekly @requires_gpu -@requires_torch @pytest.mark.integration @parameterized_class(parse_params(GPU_CONFIGS_DIRECTORY)) class TestConsecutiveRunsGPU(TestConsecutiveRuns): @@ -118,9 +116,9 @@ class TestConsecutiveRunsGPU(TestConsecutiveRuns): device = None def setUp(self): - from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoModelForCausalLM - self.model = SparseAutoModelForCausalLM.from_pretrained( + self.model = AutoModelForCausalLM.from_pretrained( self.model, device_map=self.device ) diff --git a/tests/llmcompressor/transformers/obcq/test_mask_structure_preservation.py 
b/tests/llmcompressor/transformers/obcq/test_mask_structure_preservation.py index 957f19b3f..5095fe827 100644 --- a/tests/llmcompressor/transformers/obcq/test_mask_structure_preservation.py +++ b/tests/llmcompressor/transformers/obcq/test_mask_structure_preservation.py @@ -6,14 +6,13 @@ from parameterized import parameterized_class from llmcompressor.core import reset_session -from tests.testing_utils import parse_params, requires_torch +from tests.testing_utils import parse_params MASK_STRUCTURE_CONFIGS_DIRECTORY = ( "tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/mask_structure" ) -@requires_torch @pytest.mark.integration @parameterized_class(parse_params(MASK_STRUCTURE_CONFIGS_DIRECTORY)) class TestMaskStructurePreserved(unittest.TestCase): diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py index a24fbfca4..cb7f64943 100644 --- a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py +++ b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py @@ -4,7 +4,7 @@ import pytest from parameterized import parameterized_class -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/obcq/obcq_configs/completion" GPU_CONFIGS_DIRECTORY = ( @@ -99,7 +99,6 @@ def tearDown(self): shutil.rmtree(self.output) -@requires_torch @requires_gpu @pytest.mark.integration @parameterized_class(parse_params(CONFIGS_DIRECTORY)) @@ -121,7 +120,6 @@ def test_obcq_completion_small(self): self._test_oneshot_completion() -@requires_torch @requires_gpu @pytest.mark.integration @parameterized_class(parse_params(GPU_CONFIGS_DIRECTORY)) @@ -136,14 +134,13 @@ class TestOBCQCompletionGPU(TestOBCQCompletion): def setUp(self): import torch - - from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoModelForCausalLM 
self.model_name = None self.output = "./oneshot_output" self.model_name = self.model - self.model = SparseAutoModelForCausalLM.from_pretrained( + self.model = AutoModelForCausalLM.from_pretrained( self.model, device_map=self.device, torch_dtype=torch.bfloat16 ) diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py b/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py index 5d5a06fbc..997794ae9 100644 --- a/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py +++ b/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py @@ -3,16 +3,14 @@ import pytest from llmcompressor.utils.pytorch.module import get_no_split_params -from tests.testing_utils import requires_torch @pytest.mark.integration -@requires_torch class TestInferTargets(unittest.TestCase): def setUp(self): - from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoModelForCausalLM - model = SparseAutoModelForCausalLM.from_pretrained("Xenova/llama2.c-stories15M") + model = AutoModelForCausalLM.from_pretrained("Xenova/llama2.c-stories15M") self.modifiable_model = model self.targets = get_no_split_params(self.modifiable_model) diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py b/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py index 6b2729f6a..f9818391c 100644 --- a/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py +++ b/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py @@ -2,20 +2,16 @@ import pytest -from tests.testing_utils import requires_torch - @pytest.mark.integration -@requires_torch class TestLMHead(unittest.TestCase): def setUp(self): import torch - - from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoModelForCausalLM self.device = "cuda:0" if torch.cuda.is_available() else "cpu" - self.model = SparseAutoModelForCausalLM.from_pretrained( + self.model = AutoModelForCausalLM.from_pretrained( 
"Xenova/llama2.c-stories15M", device_map=self.device ) self.kwargs = { diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py b/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py index 7412920f6..0ef7f872d 100644 --- a/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py +++ b/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py @@ -5,13 +5,12 @@ import pytest from parameterized import parameterized_class -from tests.testing_utils import parse_params, requires_gpu, requires_torch +from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/obcq/obcq_configs/sparse" GPU_CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu" -@requires_torch @pytest.mark.integration @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestSparsities(unittest.TestCase): @@ -59,7 +58,6 @@ def tearDown(self): # TODO: @Satrat and @dsikka, revisit if we want these nightly or weekly @requires_gpu -@requires_torch @pytest.mark.integration @parameterized_class(parse_params(GPU_CONFIGS_DIRECTORY)) class TestSparsitiesGPU(unittest.TestCase): @@ -71,12 +69,11 @@ class TestSparsitiesGPU(unittest.TestCase): def setUp(self): import torch - - from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoModelForCausalLM self.output = "./oneshot_output" - self.model = SparseAutoModelForCausalLM.from_pretrained( + self.model = AutoModelForCausalLM.from_pretrained( self.model, device_map=self.device, torch_dtype=torch.bfloat16 ) diff --git a/tests/llmcompressor/transformers/obcq/test_sgpt_defaults.py b/tests/llmcompressor/transformers/obcq/test_sgpt_defaults.py index 8cdca786a..dd27ebc2e 100644 --- a/tests/llmcompressor/transformers/obcq/test_sgpt_defaults.py +++ b/tests/llmcompressor/transformers/obcq/test_sgpt_defaults.py @@ -2,11 +2,8 @@ import pytest -from tests.testing_utils import requires_torch - @pytest.mark.integration 
-@requires_torch class TestSGPTDefaults(unittest.TestCase): def test_sgpt_defaults(self): from llmcompressor.core.state import State diff --git a/tests/llmcompressor/transformers/oneshot/test_api_inputs.py b/tests/llmcompressor/transformers/oneshot/test_api_inputs.py index 0a28585b2..a64a218db 100644 --- a/tests/llmcompressor/transformers/oneshot/test_api_inputs.py +++ b/tests/llmcompressor/transformers/oneshot/test_api_inputs.py @@ -5,7 +5,7 @@ from parameterized import parameterized_class from tests.llmcompressor.transformers.oneshot.dataset_processing import get_data_utils -from tests.testing_utils import parse_params, requires_torch +from tests.testing_utils import parse_params CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/oneshot/oneshot_configs" @@ -15,7 +15,6 @@ @pytest.mark.smoke @pytest.mark.integration -@requires_torch @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestOneShotInputs(unittest.TestCase): model = None @@ -25,12 +24,10 @@ class TestOneShotInputs(unittest.TestCase): tokenize = None def setUp(self): - from transformers import AutoTokenizer - - from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoModelForCausalLM, AutoTokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.model) - self.model = SparseAutoModelForCausalLM.from_pretrained(self.model) + self.model = AutoModelForCausalLM.from_pretrained(self.model) self.output = "./oneshot_output" self.kwargs = {"dataset_config_name": self.dataset_config_name} diff --git a/tests/llmcompressor/transformers/oneshot/test_cli.py b/tests/llmcompressor/transformers/oneshot/test_cli.py index 15b1ba379..5780ca46f 100644 --- a/tests/llmcompressor/transformers/oneshot/test_cli.py +++ b/tests/llmcompressor/transformers/oneshot/test_cli.py @@ -4,14 +4,13 @@ import pytest from parameterized import parameterized_class -from tests.testing_utils import parse_params, requires_torch, run_cli_command +from tests.testing_utils import parse_params, 
run_cli_command CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/oneshot/oneshot_configs" @pytest.mark.smoke @pytest.mark.integration -@requires_torch @parameterized_class(parse_params(CONFIGS_DIRECTORY)) class TestOneShotCli(unittest.TestCase): model = None diff --git a/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py index 398085f2e..df9726647 100644 --- a/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py +++ b/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py @@ -1,20 +1,28 @@ +import logging import math import shutil import pytest import torch +from accelerate import cpu_offload +from accelerate.accelerator import get_state_dict_offloaded_model from compressed_tensors import QUANTIZATION_CONFIG_NAME from compressed_tensors.compressors import ModelCompressor from compressed_tensors.config import BitmaskConfig, DenseSparsityConfig from compressed_tensors.quantization import QuantizationStatus -from transformers import AutoConfig +from compressed_tensors.utils import get_offloaded_device, update_prefix_dict +from transformers import AutoConfig, AutoModelForCausalLM from llmcompressor.core import reset_session from llmcompressor.pytorch.utils.helpers import tensor_sparsity -from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers import oneshot from llmcompressor.transformers.compression.sparsity_config import ( SparsityConfigMetadata, ) +from llmcompressor.transformers.sparsification.compressed_tensors_utils import ( + modify_save_pretrained, + patch_tied_tensors_bug, +) @pytest.mark.parametrize( @@ -55,16 +63,26 @@ def test_sparse_model_reload(compressed, config, dtype, tmp_path): clear_sparse_session=False, ) - model = SparseAutoModelForCausalLM.from_pretrained( + # temporarily set the log level to error, to ignore printing out long missing + # 
and unexpected key error messages (these are EXPECTED for quantized models) + transformers_logger = logging.getLogger("transformers.modeling_utils") + restore_log_level = transformers_logger.getEffectiveLevel() + transformers_logger.setLevel(level=logging.ERROR) + + model = AutoModelForCausalLM.from_pretrained( tmp_path / "oneshot_out", torch_dtype=dtype ) + # restore transformers logging level now that model shell is loaded + transformers_logger.setLevel(level=restore_log_level) + # assert that sample layer has the intended sparsity assert math.isclose( tensor_sparsity(model.state_dict()[one_of_sparse_weights]), expected_sparsity, rel_tol=1e-3, ) + inferred_structure = SparsityConfigMetadata.infer_sparsity_structure() assert inferred_structure == "0:0" @@ -87,7 +105,7 @@ def test_sparse_model_reload(compressed, config, dtype, tmp_path): ] == SparsityConfigMetadata.infer_global_sparsity(model) assert sparsity_config["sparsity_structure"] == inferred_structure - dense_model = SparseAutoModelForCausalLM.from_pretrained( + dense_model = AutoModelForCausalLM.from_pretrained( tmp_path / "compress_out", torch_dtype="auto" ) @@ -111,7 +129,7 @@ def test_dense_model_save(tmp_path, skip_compression_stats, save_compressed): reset_session() model_path = "Xenova/llama2.c-stories15M" - model = SparseAutoModelForCausalLM.from_pretrained(model_path) + model = AutoModelForCausalLM.from_pretrained(model_path) inferred_global_sparsity = SparsityConfigMetadata.infer_global_sparsity(model) assert math.isclose(inferred_global_sparsity, 0.0, rel_tol=1e-3) @@ -139,10 +157,11 @@ def test_dense_model_save(tmp_path, skip_compression_stats, save_compressed): ["dense", torch.float32], ["dense", torch.float16], ["int_quantized", torch.float32], - # [True, "int_quantized", torch.float16], ], ) def test_quant_model_reload(format, dtype, tmp_path): + from llmcompressor.pytorch.model_load.helpers import get_session_model + recipe_str = ( 
"tests/llmcompressor/transformers/compression/recipes/new_quant_simple.yaml" ) @@ -153,52 +172,64 @@ def test_quant_model_reload(format, dtype, tmp_path): dataset = "open_platypus" concatenate_data = False num_calibration_samples = 64 - output_dir = tmp_path / "oneshot_out" splits = {"calibration": "train[:10%]"} + empty_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype) # create a quantized model oneshot( model=model_path, dataset=dataset, - output_dir=output_dir, num_calibration_samples=num_calibration_samples, recipe=recipe_str, concatenate_data=concatenate_data, splits=splits, oneshot_device=device, + clear_sparse_session=False, precision=dtype, ) - model = SparseAutoModelForCausalLM.from_pretrained( - tmp_path / "oneshot_out", torch_dtype=dtype - ) + # Fetch the oneshot model + model = get_session_model() + og_state_dict = model.state_dict() + path = tmp_path / "compressed" for _, module in model.named_modules(): if hasattr(module, "quantization_scheme"): assert module.weight.dtype == dtype assert module.quantization_status == QuantizationStatus.FROZEN + # Save to disk model.save_pretrained( - tmp_path / "compress_out", + path, quantization_format=format, save_compressed=True, ) - config = AutoConfig.from_pretrained(tmp_path / "compress_out") + # Verify config on disk + config = AutoConfig.from_pretrained(path) compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) quant_config = ModelCompressor.parse_quantization_config(compression_config) assert quant_config["format"] == format + # As HFQuantizer doesn't decompress the model, use the compressor to decompress + # the model instead + compressor = ModelCompressor.from_compression_config(compression_config) + compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN + compressor.decompress(model_path=path, model=empty_model) + + # eventually use this pathway once HFQuant Decompression works + """ dense_model = SparseAutoModelForCausalLM.from_pretrained( 
- tmp_path / "compress_out", torch_dtype="auto" + "compress_out", torch_dtype="auto", device_map=device ) - - og_state_dict = model.state_dict() - reconstructed_state_dict = dense_model.state_dict() + """ + # Verify the abs difference between the decompressed model + # and the original model + reconstructed_state_dict = empty_model.state_dict() assert len(og_state_dict) == len(reconstructed_state_dict) for key in og_state_dict.keys(): - dense_tensor = og_state_dict[key] - reconstructed_tensor = reconstructed_state_dict[key] + dense_tensor = og_state_dict[key].to(device) + reconstructed_tensor = reconstructed_state_dict[key].to(device) assert dense_tensor.dtype == reconstructed_tensor.dtype if key.endswith("weight") and format != "dense": # we don't expect an exact match for compressed @@ -206,5 +237,130 @@ def test_quant_model_reload(format, dtype, tmp_path): assert not torch.any(diff > 0.01).item() else: assert torch.equal(dense_tensor, reconstructed_tensor) - shutil.rmtree(tmp_path) + + +# technically only tie_word_embeddings=False is supported right now +# setting to True is discouraged +@pytest.mark.parametrize( + "offload,torch_dtype,tie_word_embeddings,device_map", + [ + # dtype + (False, torch.float16, False, "cpu"), + (False, torch.float16, True, "cpu"), + (False, torch.float32, False, "cpu"), + (False, torch.float32, True, "cpu"), + # offloading + (True, torch.float16, False, "cpu"), + (True, torch.float32, False, "cpu"), + # (True, torch.float16, True, "cpu"), # TODO: fails + # (True, torch.float32, True, "cpu"), # TODO: fails + ], +) +def test_model_reload(offload, torch_dtype, tie_word_embeddings, device_map, tmp_path): + model_path = "Xenova/llama2.c-stories15M" + save_path = tmp_path / "save_path" + + model = AutoModelForCausalLM.from_pretrained( + model_path, + tie_word_embeddings=tie_word_embeddings, + torch_dtype=torch_dtype, + device_map=device_map, + ) + if offload: + model = cpu_offload(model) + + patch_tied_tensors_bug(model) + 
modify_save_pretrained(model) + model.save_pretrained(save_path, safe_serialization=True) + + reloaded = AutoModelForCausalLM.from_pretrained( + save_path, torch_dtype="auto", device_map="cpu" + ) + + model_dict = get_state_dict_offloaded_model(model) + reloaded_dict = get_state_dict_offloaded_model(reloaded) + assert model_dict.keys() == reloaded_dict.keys() + for key in model_dict: + assert torch.equal(model_dict[key].cpu(), reloaded_dict[key].cpu()) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires gpu") +@pytest.mark.parametrize( + "offload,torch_dtype,tie_word_embeddings,device_map", + [ + (False, torch.float32, False, "cuda:0"), + (True, torch.float32, False, "cuda:0"), + (True, torch.float16, True, "cuda:0"), + (True, torch.float32, True, "cuda:0"), + ], +) +def test_model_reload_gpu( + offload, torch_dtype, tie_word_embeddings, device_map, tmp_path +): + test_model_reload(offload, torch_dtype, tie_word_embeddings, device_map, tmp_path) + + +@pytest.mark.parametrize( + "offload,torch_dtype,tie_word_embeddings,device_map", + [ + (False, torch.float16, False, "cpu"), + (False, torch.float32, False, "cpu"), + (True, torch.float32, False, "cpu"), + (False, torch.float16, True, "cpu"), + (False, torch.float32, True, "cpu"), + (True, torch.float16, True, "cpu"), + (True, torch.float32, True, "cpu"), + ], +) +def test_model_shared_tensors( + offload, torch_dtype, tie_word_embeddings, device_map, tmp_path +): + # load model + model = AutoModelForCausalLM.from_pretrained( + "Xenova/llama2.c-stories15M", + torch_dtype=torch_dtype, + tie_word_embeddings=tie_word_embeddings, + device_map=device_map, + ) + patch_tied_tensors_bug(model) + + if offload: + model = cpu_offload(model) + + # modify lm head + with torch.no_grad(): + if offload: + model.lm_head._hf_hook.pre_forward(model.lm_head) + + model.lm_head.weight += 1 + + if offload: + device = get_offloaded_device(model.lm_head) + update_prefix_dict(model.lm_head, "weight", 
model.lm_head.weight.to(device)) + model.lm_head._hf_hook.post_forward(model.lm_head, None) + + # check that embed_tokens is not modified + model_dict = get_state_dict_offloaded_model(model) + lm_head = model_dict["lm_head.weight"] + embed_tokens = model_dict["model.embed_tokens.weight"] + if tie_word_embeddings: + assert torch.equal(lm_head, embed_tokens) + else: + assert not torch.equal(lm_head, embed_tokens) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires gpu") +@pytest.mark.parametrize( + "offload,torch_dtype,tie_word_embeddings,device_map", + [ + (False, torch.float32, False, "cuda:0"), + (False, torch.float32, True, "cuda:0"), + ], +) +def test_model_shared_tensors_gpu( + offload, torch_dtype, tie_word_embeddings, device_map, tmp_path +): + test_model_shared_tensors( + offload, torch_dtype, tie_word_embeddings, device_map, tmp_path + ) diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 1f3a15326..07b166013 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -37,10 +37,6 @@ def is_gpu_available(): return False -def requires_torch(test_case): - return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case) - - def requires_gpu(test_case): return unittest.skipUnless(is_gpu_available(), "test requires GPU")(test_case) @@ -81,10 +77,13 @@ def _parse_configs_dir(current_config_dir): ), f"Config_directory {current_config_dir} is not a directory" for file in os.listdir(current_config_dir): - config = _load_yaml(os.path.join(current_config_dir, file)) + config_path = os.path.join(current_config_dir, file) + config = _load_yaml(config_path) if not config: continue + config["testconfig_path"] = config_path + cadence = os.environ.get("CADENCE", "commit") expected_cadence = config.get("cadence") @@ -111,6 +110,7 @@ def _parse_configs_dir(current_config_dir): _parse_configs_dir(config) else: _parse_configs_dir(configs_directory) + return config_dicts