From 69d7f1d982e6bbf109f9084f83d1a82acfff0213 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Wed, 13 Dec 2023 10:35:35 -0500 Subject: [PATCH] Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/utils/quantization_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index aa1173b9ad3e3e..b94f794d9b48e6 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -381,9 +381,9 @@ class GPTQConfig(QuantizationConfigMixin): cache_block_outputs (`bool`, *optional*, defaults to `True`): Whether to cache block outputs to reuse as inputs for the succeeding block. modules_in_block_to_quantize (`List[List[str]]`, *optional*): - List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized. + List of lists of module names to quantize in the specified block. This argument is useful to exclude certain linear modules from being quantized. The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially. If not set, we will quantize all linear layers. - Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]` + Example: `modules_in_block_to_quantize=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]` """ def __init__(