From 4522ed78b438ec382df20f786dd48978ee0a79c9 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Wed, 26 Jun 2024 22:10:12 -0400
Subject: [PATCH] convert-hf : allow converting the weird BitNet 1.3B

Its FFN size is 5460, which is not a multiple of the Q1_3 block size.
The offending tensors are kept in F16, which makes the final model
5.01 bpw.
---
 convert-hf-to-gguf.py  | 16 ++++++++++------
 gguf-py/gguf/quants.py |  4 ++++
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index ee8350e43219e9..bf971e18078948 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -299,12 +299,16 @@ def write_tensors(self):
                 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                     # TODO: cleaner model-specific per-tensor types
                     # NOTE: Q1_3 is only relevant for BitNet 1.58b
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 and not any(
-                        self.match_model_tensor_name(new_name, key, None)
-                        for key in [
-                            gguf.MODEL_TENSOR.TOKEN_EMBD,
-                            gguf.MODEL_TENSOR.OUTPUT,
-                        ]
+                    if (
+                        self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3
+                        and gguf.can_quantize_to_q1_3(data)
+                        and not any(
+                            self.match_model_tensor_name(new_name, key, None)
+                            for key in [
+                                gguf.MODEL_TENSOR.TOKEN_EMBD,
+                                gguf.MODEL_TENSOR.OUTPUT,
+                            ]
+                        )
                     ):
                         data = gguf.quantize_q1_3(data)
                         assert data.dtype == np.uint8
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index c66b83b3f82832..c96e6a34361e4c 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -126,6 +126,10 @@ def quantize_q8_0(data: np.ndarray):
 __q1_3_block_size, __q1_3_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q1_3]
 
 
+def can_quantize_to_q1_3(n: np.ndarray) -> bool:
+    return n.shape[-1] % __q1_3_block_size == 0
+
+
 def __quantize_q1_3_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
     return (*s[:-1], s[-1] // __q1_3_block_size * __q1_3_type_size)
 
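
Note: a minimal, self-contained sketch of the divisibility gate this patch
adds, for reviewers who want to reproduce the check. The block size below is
an assumed stand-in for illustration; the converter reads the real value
from GGML_QUANT_SIZES[GGMLQuantizationType.Q1_3] in gguf-py.

    import numpy as np

    # Assumed Q1_3 block size, for illustration only; the real value
    # comes from gguf-py's GGML_QUANT_SIZES table.
    Q1_3_BLOCK_SIZE = 64

    def can_quantize_to_q1_3(n: np.ndarray) -> bool:
        # A tensor is quantizable only if each row splits into whole blocks.
        return n.shape[-1] % Q1_3_BLOCK_SIZE == 0

    # BitNet 1.3B's FFN size is 5460; a tensor whose rows have 5460
    # elements does not divide into whole blocks, so it stays in F16.
    # (The leading dimension is a dummy; only the last one matters here.)
    ffn_tensor = np.zeros((4, 5460), dtype=np.float32)
    print(can_quantize_to_q1_3(ffn_tensor))  # False (5460 % 64 == 20)

Because only the FFN tensors fail the check, the converter emits a mix of
Q1_3 and F16 tensors, which is where the 5.01 bpw figure in the commit
message comes from.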
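
Similarly, a quick worked example of the shape bookkeeping done by
__quantize_q1_3_shape_change in the second hunk: each row of elements
becomes row_size // block_size blocks of type_size bytes each. The block
and type sizes below are assumptions for illustration; the real pair comes
from GGML_QUANT_SIZES[GGMLQuantizationType.Q1_3].

    BLOCK_SIZE = 64  # assumed elements per Q1_3 block
    TYPE_SIZE = 13   # assumed bytes per Q1_3 block

    def quantize_q1_3_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
        # Leading dimensions are unchanged; the last dimension goes from
        # an element count to a byte count, one TYPE_SIZE chunk per block.
        return (*s[:-1], s[-1] // BLOCK_SIZE * TYPE_SIZE)

    # A row length that is a multiple of the block size maps cleanly:
    print(quantize_q1_3_shape_change((2048, 8192)))  # (2048, 1664)

This integer bookkeeping is also why can_quantize_to_q1_3 has to reject row
sizes like 5460: the floor division would not account for the trailing
partial block.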