
Commit

Merge branch 'main' into add-recipe-check-vllm-e2e
horheynm authored Nov 25, 2024
2 parents e41d025 + 411ceec commit 69ab73d
Showing 3 changed files with 13 additions and 8 deletions.
6 changes: 6 additions & 0 deletions examples/quantization_kv_cache/llama3_fp8_kv_example.py

@@ -1,4 +1,5 @@
 from datasets import load_dataset
+from loguru import logger
 from transformers import AutoModelForCausalLM, AutoTokenizer

 from llmcompressor.transformers import oneshot
@@ -81,6 +82,11 @@ def process_and_tokenize(example):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )

+logger.info(
+    "Running sample generation. ",
+    "Note: Inference with the quantized kv_cache is not supported. ",
+    "Please use vLLM for inference with the quantized kv_cache.",
+)
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
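The new log message directs users to vLLM for inference with the quantized kv_cache. A minimal sketch of what that might look like, not part of this commit: it assumes the example's compressed model was saved locally (the path below is a hypothetical placeholder for the example's SAVE_DIR) and that the installed vLLM build accepts kv_cache_dtype="fp8".

    from vllm import LLM, SamplingParams

    # Hypothetical output directory; substitute the SAVE_DIR produced by the example script.
    llm = LLM(model="./quantized-model-fp8-kv", kv_cache_dtype="fp8")
    outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=64))
    print(outputs[0].outputs[0].text)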
@@ -9,10 +9,7 @@
 from llmcompressor.observers import Observer
 from llmcompressor.pytorch.utils.helpers import tensor_sparsity
 from llmcompressor.utils import getattr_chain
-from llmcompressor.utils.metric_logging import (
-    get_GPU_memory_usage,
-    get_layer_size_bytes,
-)
+from llmcompressor.utils.metric_logging import get_GPU_memory_usage, get_layer_size_mb

 try:
     import transformers
@@ -353,5 +350,5 @@ def _log_metrics(self, start_tick: float, losses: torch.Tensor):

         patch.log(
             "METRIC",
-            f"Compressed layer size: {get_layer_size_bytes(self.layer)} MB",
+            f"Compressed layer size: {get_layer_size_mb(self.layer)} MB",
         )
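As a quick illustration of the renamed helper used in the METRIC line above (a sketch under assumptions, not taken from this commit; the toy layer is hypothetical):

    import torch
    from llmcompressor.utils.metric_logging import get_layer_size_mb

    # An fp16 Linear(4096, 4096) holds (4096*4096 + 4096) * 2 bytes ≈ 33.56e6 bytes,
    # so with the decimal divisor the helper reports roughly 33.56 (MB).
    layer = torch.nn.Linear(4096, 4096).half()
    print(f"Compressed layer size: {get_layer_size_mb(layer):.2f} MB")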
8 changes: 5 additions & 3 deletions src/llmcompressor/utils/metric_logging.py

@@ -3,6 +3,8 @@
 from loguru import logger
 from torch.nn import Module

+__all__ = ["get_GPU_memory_usage", "get_layer_size_mb"]
+

 def get_GPU_memory_usage() -> List[Tuple]:
     try:
@@ -23,7 +25,7 @@ def get_GPU_memory_usage() -> List[Tuple]:
             handle = pynvml.nvmlDeviceGetHandleByIndex(i)
             mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
             memory_usage_percentage = mem_info.used / mem_info.total
-            total_memory_gb = mem_info.total / (1024**3)
+            total_memory_gb = mem_info.total / (1e9)
             usage.append(
                 (memory_usage_percentage, total_memory_gb),
             )
@@ -35,7 +37,7 @@ def get_GPU_memory_usage() -> List[Tuple]:
         return []


-def get_layer_size_bytes(module: Module) -> float:
+def get_layer_size_mb(module: Module) -> float:
     param_size = 0
     buffer_size = 0

@@ -46,6 +48,6 @@ def get_layer_size_bytes(module: Module) -> float:
         buffer_size += buffer.nelement() * buffer.element_size()

     total_size = param_size + buffer_size
-    total_size_mb = total_size / (1024**2)  # Convert bytes to MB
+    total_size_mb = total_size / (1e6)  # Convert bytes to MB

     return total_size_mb
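For reference, the divisor changes above switch the reported units from binary (GiB/MiB) to decimal (GB/MB). A small sketch with a hypothetical value shows the numeric difference:

    # Hypothetical 80 GiB device: the old and new divisors report different numbers.
    mem_total_bytes = 80 * 1024**3          # 85_899_345_920 bytes
    print(mem_total_bytes / (1024**3))      # 80.0          (GiB, old divisor)
    print(mem_total_bytes / 1e9)            # 85.899345920  (GB, new divisor)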
