[Test Fix] Fix/update test_run_compressed #970

Draft · wants to merge 11 commits into main
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w4a16-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w8a16-dense"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w8a8-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed
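(Not part of the diff — a minimal sketch of how configs like the ones above are typically consumed. It assumes the `parse_params` helper from `tests/testing_utils.py` loads every YAML file in a config directory into a dict of parameters that `parameterized_class` then maps onto class attributes; `parse_params_sketch` below is a hypothetical stand-in for that helper.)

```python
# Illustrative sketch only -- not part of this PR.
# parse_params_sketch is a hypothetical stand-in for tests.testing_utils.parse_params,
# assumed to return one dict per YAML config, e.g.:
#   {"cadence": "commit", "test_type": "regression",
#    "compressed_model_stub": "...", "uncompressed_model_stub": "..."}
import os
import unittest

import yaml
from parameterized import parameterized_class

CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs"


def parse_params_sketch(config_dir: str) -> list:
    """Load every YAML config in config_dir into a dict of test parameters."""
    configs = []
    for name in sorted(os.listdir(config_dir)):
        if name.endswith((".yml", ".yaml")):
            with open(os.path.join(config_dir, name)) as file:
                configs.append(yaml.safe_load(file))
    return configs


@parameterized_class(parse_params_sketch(CONFIG_DIR))
class ExampleConfigDrivenTest(unittest.TestCase):
    # Overwritten per config file by parameterized_class
    compressed_model_stub = None
    uncompressed_model_stub = None

    def test_stubs_are_set(self):
        assert self.compressed_model_stub is not None
        assert self.uncompressed_model_stub is not None
```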
105 changes: 105 additions & 0 deletions tests/llmcompressor/transformers/compression/test_decompress.py
@@ -0,0 +1,105 @@
import copy
import shutil
import tempfile
import unittest

import torch
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestQuantizationMatches(unittest.TestCase):
"""
Test decompression - given a skeleton model and path to the optimized model,
write the optimized model's safetensors to the skeleton model and decompress
Ex. write weight_scale to skeleton model and then fp4 -> fp16

"""

compressed_model_stub = None
skeleton_model_stub = None

SAMPLE_INPUTS = [
"I love 4-bit quantization because",
"What is the capital of France?",
"def fibonacci(n):",
]

    @classmethod
    def setUpClass(cls):
        cls.test_dir = tempfile.mkdtemp()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)

        cls.compressed_model = AutoModelForCausalLM.from_pretrained(
            cls.compressed_model_stub,
            torch_dtype="auto",
            device_map="auto",
        )

        cls.dense_model = AutoModelForCausalLM.from_pretrained(
            cls.skeleton_model_stub,
            torch_dtype=cls.compressed_model.dtype,
            device_map=cls.compressed_model.device,
        )

        assert not hasattr(
            cls.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

        cls.decompressed_model = None
        config = AutoConfig.from_pretrained(cls.compressed_model_stub)

        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
        cls.compressor = ModelCompressor.from_compression_config(compression_config)
        cls.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )

        # use the model_path to load the decompressed weights into dense_model
        dense_model = copy.deepcopy(cls.dense_model)

        # overwrite the weights of the dense model
        cls.compressor.decompress(
            model_path=cls.compressed_model_stub,
            model=cls.dense_model,
        )

        # cls.dense_model should be decompressed
        assert dense_model is not cls.dense_model

        cls.decompressed_model = cls.dense_model

        assert hasattr(
            cls.decompressed_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

def test_compressed_matches_uncompressed(self):
        for prompt in self.SAMPLE_INPUTS:
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True).to(
self.compressed_model.device
)
compressed_output = self.tokenizer.batch_decode(
self.compressed_model.generate(**inputs, max_length=50)
)
uncompressed_output = self.tokenizer.batch_decode(
self.decompressed_model.generate(**inputs, max_length=50)
)

assert compressed_output == uncompressed_output

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.test_dir)
        del cls.compressed_model
        del cls.dense_model
        del cls.decompressed_model
        torch.cuda.empty_cache()
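(Not part of the diff — the decompression path exercised by test_decompress.py above, sketched standalone. The stubs are taken from the decompression_configs added in this PR, and the API calls mirror the test code.)

```python
# Minimal sketch of the skeleton-model decompression flow -- not part of this PR.
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from transformers import AutoConfig, AutoModelForCausalLM

compressed_stub = "nm-testing/tinyllama-w4a16-compressed"
skeleton_stub = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# Load the dense "skeleton" model, which has no quantization parameters yet
skeleton = AutoModelForCausalLM.from_pretrained(
    skeleton_stub, torch_dtype="auto", device_map="auto"
)

# Rebuild the compressor from the compressed checkpoint's quantization config
config = AutoConfig.from_pretrained(compressed_stub)
compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
compressor = ModelCompressor.from_compression_config(compression_config)
compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN

# Write the compressed checkpoint's safetensors into the skeleton model,
# decompressing the weights in the process
compressor.decompress(model_path=compressed_stub, model=skeleton)

# The skeleton now carries quantization parameters such as weight_scale
assert hasattr(skeleton.model.layers[0].self_attn.q_proj, "weight_scale")
```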
154 changes: 120 additions & 34 deletions tests/llmcompressor/transformers/compression/test_run_compressed.py
@@ -3,53 +3,139 @@
import unittest

import torch
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs"
COMPRESSED_LINEAR_CONFIG_DIR = (
"tests/llmcompressor/transformers/compression/run_compressed_configs"
)


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestQuantizationMatches(unittest.TestCase):
model_stub = None
empty_model = None
@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
class TestRunCompressedDecompression(unittest.TestCase):
"""
Test the run_compressed input arg to AutoModelForCausalLM, where HFQuantizer is
responsible for decompressing if model is compressed.

Diagram flow https://tinyurl.com/2ynb6wbu

Given an optimized model that was saved (uncompressed),
and saved as run_compressed (compressed), decompress the compressed model
and check the outputs.

All modules should be linear, runs default foward calls

Test the run_compressed input arg to AutoModelForCausalLM, where HFQuantizer is
responsible for decompressing if model is compressed.

Diagram flow https://tinyurl.com/2ynb6wbu


"""

compressed_model_stub = None
uncompressed_model_stub = None

@classmethod
def setUpClass(cls):
cls.test_dir = tempfile.mkdtemp()

# TODO: Give option on HFQuantizer to run run_compressed True/False
# currently hardcoded to True
cls.compressed_model = AutoModelForCausalLM.from_pretrained(
cls.model_stub,
quantization_config = CompressedTensorsConfig(run_compressed=False)
cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
cls.compressed_model_stub,
torch_dtype="auto",
device_map="auto",
# run_compressed=True, # TODO: Give option on HFQuantizer
quantization_config=quantization_config,
)
# TODO: Use ModelCompressor until decompression is supported through
# HFQuant/run_compressed can be turned off.

cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
cls.empty_model,
torch_dtype=cls.compressed_model.dtype,
device_map=cls.compressed_model.device,
cls.uncompressed_model_stub,
torch_dtype=cls.decompressed_model.dtype,
device_map=cls.decompressed_model.device,
)

cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)

def test_compressed_matches_decompressed(self):
SAMPLE_INPUT = [
"I love 4-bit quantization because",
"What is the capital of France?",
"def fibonacci(n):",
]

inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
self.decompressed_model.device
)
decompressed_output = self.tokenizer.batch_decode(
self.decompressed_model.generate(**inputs, max_length=50)
)
config = AutoConfig.from_pretrained(cls.model_stub)
compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
cls.compressor = ModelCompressor.from_compression_config(compression_config)
cls.compressor.quantization_config.quantization_status = (
QuantizationStatus.FROZEN
uncompressed_output = self.tokenizer.batch_decode(
self.uncompressed_model.generate(**inputs, max_length=50)
)

for idx in range(len(SAMPLE_INPUT)):
assert decompressed_output[idx] == uncompressed_output[idx]

@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.test_dir)
del cls.decompressed_model
del cls.uncompressed_model
torch.cuda.empty_cache()


@requires_gpu
@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
class TestRunCompressedForward(unittest.TestCase):
"""
Given an optimized model that was saved (uncompressed),
and saved as run_compressed (compressed), do not decompressed the compressed model
and check the outputs.

All compressed model should have CompressedLinear, which has its custom forward call

"""

compressed_model_stub = None

@classmethod
def setUpClass(cls):
cls.test_dir = tempfile.mkdtemp()

# Should have CompressedLinear modules
cls.compressed_model = AutoModelForCausalLM.from_pretrained(
cls.compressed_model_stub,
torch_dtype="auto",
device_map="auto",
)
cls.compressor.decompress(
model_path=cls.model_stub, model=cls.uncompressed_model

# Should just be linear modules
quantization_config = CompressedTensorsConfig(run_compressed=False)
cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
cls.compressed_model_stub,
torch_dtype=cls.compressed_model.dtype,
device_map=cls.compressed_model.device,
quantization_config=quantization_config,
)

cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub)
cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)

def test_compressed_linear_modules_exist(self):
compressed_linear_counts = 0
for _, submodule in iter_named_leaf_modules(
self.compressed_model,
):
if isinstance(submodule, CompressedLinear):
compressed_linear_counts += 1

        # some Linear modules are not compressed - e.g. lm_head
assert compressed_linear_counts > 0

def test_compressed_matches_uncompressed(self):
SAMPLE_INPUT = [
@@ -59,21 +59,21 @@ def test_compressed_matches_uncompressed(self):
]

inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
self.compressed_model.device
self.decompressed_model.device
)
compressed_output = self.tokenizer.batch_decode(
self.compressed_model.generate(**inputs, max_length=50)
compressed_model_out = self.tokenizer.batch_decode(
self.decompressed_model.generate(**inputs, max_length=50)
)
uncompressed_output = self.tokenizer.batch_decode(
self.uncompressed_model.generate(**inputs, max_length=50)
decompressed_model_out = self.tokenizer.batch_decode(
self.decompressed_model.generate(**inputs, max_length=50)
)

for idx in range(len(SAMPLE_INPUT)):
assert compressed_output[idx] == uncompressed_output[idx]
assert compressed_model_out[idx] == decompressed_model_out[idx]

@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.test_dir)
del cls.decompressed_model
del cls.compressed_model
del cls.uncompressed_model
torch.cuda.empty_cache()
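(Not part of the diff — a side-by-side sketch of the two load paths the classes above compare: the default path keeps the checkpoint compressed and swaps in CompressedLinear modules, while CompressedTensorsConfig(run_compressed=False) asks HFQuantizer to decompress at load time. The stub is one of the compressed_model_stub values from the run_compressed_configs; adjust as needed.)

```python
# Sketch of the two load paths compared by the tests above -- not part of this PR.
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import CompressedTensorsConfig

stub = "nm-testing/tinyllama-w4a16-compressed"

# Path 1: keep the checkpoint compressed; forward passes go through CompressedLinear
compressed = AutoModelForCausalLM.from_pretrained(
    stub, torch_dtype="auto", device_map="auto"
)
n_compressed_linear = sum(
    isinstance(module, CompressedLinear)
    for _, module in iter_named_leaf_modules(compressed)
)
assert n_compressed_linear > 0  # some modules (e.g. lm_head) may stay plain Linear

# Path 2: decompress at load time; leaf modules end up as plain torch.nn.Linear
decompressed = AutoModelForCausalLM.from_pretrained(
    stub,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)
```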