From 4ad32a8352f4d56ec88b9035dbee6deecf6451f4 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Fri, 22 Nov 2024 13:58:47 -0500
Subject: [PATCH 1/2] correct calculations, rename function (#878)

Signed-off-by: Kyle Sayers
---
 .../modifiers/quantization/gptq/utils/gptq_wrapper.py | 7 ++-----
 src/llmcompressor/utils/metric_logging.py             | 8 +++++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 542f64bab..02eafb669 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -9,10 +9,7 @@
 from llmcompressor.observers import Observer
 from llmcompressor.pytorch.utils.helpers import tensor_sparsity
 from llmcompressor.utils import getattr_chain
-from llmcompressor.utils.metric_logging import (
-    get_GPU_memory_usage,
-    get_layer_size_bytes,
-)
+from llmcompressor.utils.metric_logging import get_GPU_memory_usage, get_layer_size_mb
 
 try:
     import transformers
@@ -353,5 +350,5 @@ def _log_metrics(self, start_tick: float, losses: torch.Tensor):
 
         patch.log(
             "METRIC",
-            f"Compressed layer size: {get_layer_size_bytes(self.layer)} MB",
+            f"Compressed layer size: {get_layer_size_mb(self.layer)} MB",
         )
diff --git a/src/llmcompressor/utils/metric_logging.py b/src/llmcompressor/utils/metric_logging.py
index d0b3bb11e..0b45a4670 100644
--- a/src/llmcompressor/utils/metric_logging.py
+++ b/src/llmcompressor/utils/metric_logging.py
@@ -3,6 +3,8 @@
 from loguru import logger
 from torch.nn import Module
 
+__all__ = ["get_GPU_memory_usage", "get_layer_size_mb"]
+
 
 def get_GPU_memory_usage() -> List[Tuple]:
     try:
@@ -23,7 +25,7 @@ def get_GPU_memory_usage() -> List[Tuple]:
             handle = pynvml.nvmlDeviceGetHandleByIndex(i)
             mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
             memory_usage_percentage = mem_info.used / mem_info.total
-            total_memory_gb = mem_info.total / (1024**3)
+            total_memory_gb = mem_info.total / (1e9)
             usage.append(
                 (memory_usage_percentage, total_memory_gb),
             )
@@ -35,7 +37,7 @@ def get_GPU_memory_usage() -> List[Tuple]:
         return []
 
 
-def get_layer_size_bytes(module: Module) -> float:
+def get_layer_size_mb(module: Module) -> float:
     param_size = 0
     buffer_size = 0
 
@@ -46,6 +48,6 @@ def get_layer_size_bytes(module: Module) -> float:
         buffer_size += buffer.nelement() * buffer.element_size()
 
     total_size = param_size + buffer_size
-    total_size_mb = total_size / (1024**2)  # Convert bytes to MB
+    total_size_mb = total_size / (1e6)  # Convert bytes to MB
 
     return total_size_mb

From 411ceecbc5df4071303d552b3cfec98790ba04e7 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 22 Nov 2024 19:15:27 -0500
Subject: [PATCH 2/2] make it known to the user that vllm should be used (#921)

Co-authored-by: George
---
 examples/quantization_kv_cache/llama3_fp8_kv_example.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
index c34d8c9c8..6c08d4acc 100644
--- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -1,4 +1,5 @@
 from datasets import load_dataset
+from loguru import logger
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor.transformers import oneshot
@@ -81,6 +82,11 @@ def process_and_tokenize(example):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
+logger.info(
+ "Running sample generation. ", + "Note: Inference with the quantized kv_cache is not supported. ", + "Please use vLLM for inference with the quantized kv_cache.", +) # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============")