diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
index 6c08d4acc..155fbf726 100644
--- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -5,8 +5,10 @@ from llmcompressor.transformers import oneshot
 
 # Select model and load it.
-MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(
+# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+model = SparseAutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
     torch_dtype="auto",
 )
diff --git a/tests/llmcompressor/test_kv_cache.py b/tests/llmcompressor/test_kv_cache.py
index 59520ae09..2acd3b2b6 100644
--- a/tests/llmcompressor/test_kv_cache.py
+++ b/tests/llmcompressor/test_kv_cache.py
@@ -4,13 +4,13 @@
 import pytest
 import torch
 from compressed_tensors.compressors import ModelCompressor
-from compressed_tensors.quantization.cache import KVCacheScaleType
+from compressed_tensors.quantization.lifecycle import KVCacheScaleType
 from compressed_tensors.quantization.utils.helpers import iter_named_quantizable_modules
 from datasets import load_dataset
-from transformers import AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor.core import reset_session
-from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
+from llmcompressor.transformers import oneshot
 
 MODEL_IDS = [
     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -83,7 +83,7 @@ def test_kv_cache_config_format(oneshot_fixture):
 
 def test_kv_cache_model_state_dict_attr(oneshot_fixture):
     for output_dir, _ in oneshot_fixture.items():
-        model = SparseAutoModelForCausalLM.from_pretrained(output_dir)
+        model = AutoModelForCausalLM.from_pretrained(output_dir)
 
         counts = 0
         for name, submodule in iter_named_quantizable_modules(
@@ -99,7 +99,7 @@
 def test_kv_cache_model_populate_kv_scales_only(tmp_path):
     MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
-    model = SparseAutoModelForCausalLM.from_pretrained(
+    model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="cuda:0" if torch.cuda.is_available() else "cpu",
         torch_dtype="auto",
@@ -187,7 +187,7 @@ def tokenize(sample):
 
     # check for vllm loading
     assert quant_config.quant_method == "compressed-tensors"
-    model = SparseAutoModelForCausalLM.from_pretrained(output_dir)
+    model = AutoModelForCausalLM.from_pretrained(output_dir)
 
     counts = 0
     for name, submodule in iter_named_quantizable_modules(
@@ -203,7 +203,7 @@ def tokenize(sample):
 def test_kv_cache_with_gptq(tmp_path):
     MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
-    model = SparseAutoModelForCausalLM.from_pretrained(
+    model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="cuda:0" if torch.cuda.is_available() else "cpu",
         torch_dtype="auto",
@@ -298,7 +298,7 @@ def tokenize(sample):
     assert scheme.dynamic == kv_cache_dynamic
     assert scheme.symmetric == kv_cache_symmetric
 
-    model = SparseAutoModelForCausalLM.from_pretrained(output_dir)
+    model = AutoModelForCausalLM.from_pretrained(output_dir)
 
     counts = 0
     for name, submodule in iter_named_quantizable_modules(
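
For context on the workflow these files exercise, here is a minimal sketch of an FP8 KV-cache oneshot run. It is not taken from the diff: the recipe shape (a QuantizationModifier with a kv_cache_scheme block), the "open_platypus" dataset name, the sample counts, and the "./tinyllama-fp8-kv-out" output path are all illustrative assumptions.

# Sketch (assumptions noted above): FP8 quantization of the KV cache via oneshot.
from transformers import AutoModelForCausalLM

from llmcompressor.transformers import oneshot

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)

# Assumed recipe shape: an FP8 tensor-wise scheme applied to the KV cache only.
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            kv_cache_scheme:
                num_bits: 8
                type: float
                strategy: tensor
                dynamic: false
                symmetric: true
"""

oneshot(
    model=model,
    dataset="open_platypus",              # illustrative registered dataset name
    recipe=recipe,
    max_seq_length=512,
    num_calibration_samples=16,
    output_dir="./tinyllama-fp8-kv-out",  # placeholder output path
)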
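
And a sketch of the reload-and-verify pattern the test changes move to: the stock AutoModelForCausalLM in place of SparseAutoModelForCausalLM, with KVCacheScaleType spelling the expected scale attribute names. Plain named_modules() stands in for the iter_named_quantizable_modules helper the tests use, the checkpoint path is the placeholder from the sketch above, and KVCacheScaleType.KEY/.VALUE resolving to "k_scale"/"v_scale" is an assumption.

# Sketch (not from the diff): check that attention modules of a reloaded
# compressed checkpoint carry KV-cache scale attributes.
import torch
from compressed_tensors.quantization.lifecycle import KVCacheScaleType
from transformers import AutoModelForCausalLM

output_dir = "./tinyllama-fp8-kv-out"  # placeholder: output of a prior oneshot run

model = AutoModelForCausalLM.from_pretrained(
    output_dir,
    device_map="cuda:0" if torch.cuda.is_available() else "cpu",
    torch_dtype="auto",
)

# Count attention modules that received KV-cache scale parameters.
counts = 0
for name, submodule in model.named_modules():
    if name.endswith("self_attn"):
        assert hasattr(submodule, KVCacheScaleType.KEY.value)    # assumed "k_scale"
        assert hasattr(submodule, KVCacheScaleType.VALUE.value)  # assumed "v_scale"
        counts += 1
assert counts > 0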