diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
index 6159646ed..d516616bf 100644
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
@@ -1,3 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
\ No newline at end of file
+model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
+empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
index 844cf457d..7e9bc3f2f 100644
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
@@ -1,3 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-w4a16-compressed"
\ No newline at end of file
+model_stub: "nm-testing/tinyllama-w4a16-compressed"
+empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml
index 367d3fd4f..af1e5df8b 100644
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml
@@ -1,3 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-w8a16-dense"
\ No newline at end of file
+model_stub: "nm-testing/tinyllama-w8a16-dense"
+empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
index 583edba18..086a67ed6 100644
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
@@ -1,3 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-w8a8-compressed"
\ No newline at end of file
+model_stub: "nm-testing/tinyllama-w8a8-compressed"
+empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py
index 3d2db48aa..33a31c332 100644
--- a/tests/llmcompressor/transformers/compression/test_run_compressed.py
+++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py
@@ -3,8 +3,11 @@
 import unittest
 
 import torch
+from compressed_tensors import QUANTIZATION_CONFIG_NAME
+from compressed_tensors.compressors import ModelCompressor
+from compressed_tensors.quantization import QuantizationStatus
 from parameterized import parameterized_class
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
 from tests.testing_utils import parse_params, requires_gpu, requires_torch
 
@@ -16,6 +19,7 @@
 @parameterized_class(parse_params(CONFIG_DIR))
 class TestQuantizationMatches(unittest.TestCase):
     model_stub = None
+    empty_model = None
 
     @classmethod
     def setUpClass(cls):
@@ -29,25 +33,34 @@ def setUpClass(cls):
             device_map="auto",
             # run_compressed=True, # TODO: Give option on HFQuantizer
         )
+        # TODO: Use ModelCompressor until decompression is supported through
+        # HFQuantizer and run_compressed can be turned off.
         cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.model_stub,
-            torch_dtype="auto",
-            device_map="auto",
-            # run_compressed=False, # TODO: Give option on HFQuantizer
+            cls.empty_model,
+            torch_dtype=cls.compressed_model.dtype,
+            device_map=cls.compressed_model.device,
+        )
+        config = AutoConfig.from_pretrained(cls.model_stub)
+        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
+        cls.compressor = ModelCompressor.from_compression_config(compression_config)
+        cls.compressor.quantization_config.quantization_status = (
+            QuantizationStatus.FROZEN
+        )
+        cls.compressor.decompress(
+            model_path=cls.model_stub, model=cls.uncompressed_model
         )
 
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub)
-        cls.device = cls.compressed_model.device
 
     def test_compressed_matches_uncompressed(self):
         SAMPLE_INPUT = [
             "I love 4-bit quantization because",
-            "What is the capital of Paris?",
+            "What is the capital of France?",
             "def fibonacci(n):",
         ]
 
         inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
-            self.device
+            self.compressed_model.device
         )
         compressed_output = self.tokenizer.batch_decode(
             self.compressed_model.generate(**inputs, max_length=50)
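
For reference, the decompression path wired into setUpClass above can be exercised standalone. A minimal sketch, using only the compressed-tensors and transformers calls that appear in this diff; the two model stubs are taken from the w4a16 YAML config, and the variable names are illustrative:

# Minimal sketch of the decompression flow from setUpClass. Assumes the
# compressed-tensors APIs shown in the diff; stubs come from the w4a16 config.
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from transformers import AutoConfig, AutoModelForCausalLM

compressed_stub = "nm-testing/tinyllama-w4a16-compressed"
empty_stub = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# Load the dense architecture that the compressed weights are unpacked into.
model = AutoModelForCausalLM.from_pretrained(empty_stub, torch_dtype="auto")

# Read the compression config off the compressed checkpoint's model config.
config = AutoConfig.from_pretrained(compressed_stub)
compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)

# Build a compressor from that config, mark quantization as frozen so the
# decompressed model is treated as already calibrated, then unpack weights.
compressor = ModelCompressor.from_compression_config(compression_config)
compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN
compressor.decompress(model_path=compressed_stub, model=model)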