diff --git a/examples/backends/bitblas_int4_demo.py b/examples/backends/bitblas_int4_demo.py
index 24571fd..3593b0a 100644
--- a/examples/backends/bitblas_int4_demo.py
+++ b/examples/backends/bitblas_int4_demo.py
@@ -20,18 +20,18 @@
 
 #Quantize
 #all 4-bit
-quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
+quant_config = BaseQuantizeConfig(nbits=4, group_size=64, axis=1)
 
 #Mixed 4-bit (bitblas) / 2-bit (ATEN)
 # quant_config = {
-# "self_attn.q_proj": BaseQuantizeConfig(nbits=2, group_size=32, quant_scale=False, quant_zero=False, axis=0),
-# "self_attn.k_proj": BaseQuantizeConfig(nbits=2, group_size=32, quant_scale=False, quant_zero=False, axis=0),
-# "self_attn.v_proj": BaseQuantizeConfig(nbits=2, group_size=32, quant_scale=False, quant_zero=False, axis=0),
-# "self_attn.o_proj": BaseQuantizeConfig(nbits=2, group_size=32, quant_scale=False, quant_zero=False, axis=0),
-
-# "mlp.gate_proj": BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1),
-# "mlp.up_proj": BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1),
-# "mlp.down_proj": BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1),
+# "self_attn.q_proj": BaseQuantizeConfig(nbits=2, group_size=32, axis=0),
+# "self_attn.k_proj": BaseQuantizeConfig(nbits=2, group_size=32, axis=0),
+# "self_attn.v_proj": BaseQuantizeConfig(nbits=2, group_size=32, axis=0),
+# "self_attn.o_proj": BaseQuantizeConfig(nbits=2, group_size=32, axis=0),
+
+# "mlp.gate_proj": BaseQuantizeConfig(nbits=4, group_size=64, axis=1),
+# "mlp.up_proj": BaseQuantizeConfig(nbits=4, group_size=64, axis=1),
+# "mlp.down_proj": BaseQuantizeConfig(nbits=4, group_size=64, axis=1),
 # }
 
 # HQQLinear.set_backend(HQQBackend.ATEN)
diff --git a/examples/backends/marlin_int4_demo.py b/examples/backends/marlin_int4_demo.py
index ce7a49f..39efacd 100755
--- a/examples/backends/marlin_int4_demo.py
+++ b/examples/backends/marlin_int4_demo.py
@@ -19,7 +19,7 @@
 model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=cache_path, torch_dtype=compute_dtype, attn_implementation="sdpa")
 
 #Quantize
-quant_config = BaseQuantizeConfig(nbits=4, group_size=None, quant_scale=False, quant_zero=False, axis=1)
+quant_config = BaseQuantizeConfig(nbits=4, group_size=None, axis=1)
 AutoHQQHFModel.setup_model(model)
 AutoHQQHFModel.quantize_model(model, quant_config=quant_config, compute_dtype=compute_dtype, device=device)
 HQQLinear.set_backend(HQQBackend.PYTORCH)
diff --git a/examples/backends/quantize_and_run.py b/examples/backends/quantize_and_run.py
index 7dcb15d..25cfe47 100644
--- a/examples/backends/quantize_and_run.py
+++ b/examples/backends/quantize_and_run.py
@@ -18,7 +18,7 @@
 model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=cache_dir, torch_dtype=compute_dtype, attn_implementation="sdpa")
 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
 
-quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
+quant_config = BaseQuantizeConfig(nbits=4, group_size=64, axis=1)
 AutoHQQHFModel.quantize_model(model, quant_config=quant_config, device=device, compute_dtype=compute_dtype)
 
 #Use optimized inference kernels
diff --git a/examples/backends/torchao_int4_demo.py b/examples/backends/torchao_int4_demo.py
index 54e343b..62be2e7 100755
--- a/examples/backends/torchao_int4_demo.py
+++ b/examples/backends/torchao_int4_demo.py
@@ -20,7 +20,7 @@
 model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=cache_path, torch_dtype=compute_dtype, attn_implementation="sdpa")
 
 #Quantize
-quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
+quant_config = BaseQuantizeConfig(nbits=4, group_size=64, axis=1)
 AutoHQQHFModel.setup_model(model)
 AutoHQQHFModel.quantize_model(model, quant_config=quant_config, compute_dtype=compute_dtype, device=device)
 HQQLinear.set_backend(HQQBackend.PYTORCH)
diff --git a/examples/backends/transformers_demo.py b/examples/backends/transformers_demo.py
new file mode 100644
index 0000000..65c4953
--- /dev/null
+++ b/examples/backends/transformers_demo.py
@@ -0,0 +1,36 @@
+#Works with multi-gpu as well, tested with BitBlas
+
+import torch, gc
+from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
+
+device = 'auto'
+dtype = torch.float16
+model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
+cache_dir = '.'
+
+quant_config = HqqConfig(nbits=4, group_size=64, axis=1)
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=dtype,
+    cache_dir=cache_dir,
+    device_map=device,
+    quantization_config=quant_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
+
+#Patching
+from hqq.utils.patching import *
+from hqq.core.quantize import *
+HQQLinear.set_backend(HQQBackend.PYTORCH)
+prepare_for_inference(model, backend='bitblas', verbose=True) #Takes a while
+
+#Import custom HF generator
+from hqq.utils.generation_hf import HFGenerator
+
+#Generate
+gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Quick test - slower inference
+#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup() #Takes a while - fastest
+
+out = gen.generate("Write an essay about large language models.", print_tokens=True)
diff --git a/examples/llama2_benchmark/quant_llama2_hqq_demo.py b/examples/llama2_benchmark/quant_llama2_hqq_demo.py
index a5db11d..5c7be24 100755
--- a/examples/llama2_benchmark/quant_llama2_hqq_demo.py
+++ b/examples/llama2_benchmark/quant_llama2_hqq_demo.py
@@ -18,11 +18,11 @@
 ######################################################################################
 from hqq.core.quantize import *
 
-#quant_config = BaseQuantizeConfig(nbits=8, group_size=128)
-quant_config = BaseQuantizeConfig(nbits=4, group_size=64)
-#quant_config = BaseQuantizeConfig(nbits=3, group_size=64)
-#quant_config = BaseQuantizeConfig(nbits=2, group_size=16)
-#quant_config = BaseQuantizeConfig(nbits=2, group_size=16, quant_scale=True) #scale is quantized to 8-bit/g=128
+#quant_config = BaseQuantizeConfig(nbits=8, group_size=128, axis=0)
+quant_config = BaseQuantizeConfig(nbits=4, group_size=64, axis=0)
+#quant_config = BaseQuantizeConfig(nbits=3, group_size=64, axis=0)
+#quant_config = BaseQuantizeConfig(nbits=2, group_size=16, axis=0)
+#quant_config = BaseQuantizeConfig(nbits=2, group_size=16, quant_scale=True, axis=0) #scale is quantized to 8-bit/g=128
 
 model.quantize_model(quant_config=quant_config)
 
diff --git a/examples/vllm/llama2_example.py b/examples/vllm/llama2_example.py
deleted file mode 100755
index e5a29a4..0000000
--- a/examples/vllm/llama2_example.py
+++ /dev/null
@@ -1,25 +0,0 @@
-model_id = 'meta-llama/Llama-2-7b-chat-hf'
-
-#Load VLLM un-quantized model
-from hqq.engine.vllm import HQQLLM
-model = HQQLLM(model=model_id)
-
-#Quantize the model
-from hqq.core.quantize import *
-model.quantize_model(BaseQuantizeConfig(nbits=4, group_size=64))
-
-#Optional: Save the model
-#model.save_quantized(model_id.split('/')[-1] + '_quantized')
-
-#Optional: Set backend
-HQQLinear.set_backend(HQQBackend.PYTORCH_COMPILE) #set backend
-
-#Generation
-from vllm.entrypoints.llm import SamplingParams
-sampling_params = SamplingParams(temperature=0.6, top_p=0.90, max_tokens=1000, repetition_penalty=1.2)
-
-prompt = "How can I build a car?"
-
-output = model.generate([prompt], sampling_params)[0]
-print(output.prompt)
-print(output.outputs[0].text)
\ No newline at end of file
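The example hunks above all drop the explicit `quant_scale=False, quant_zero=False` arguments and keep only `nbits`, `group_size`, and `axis`. Below is a minimal sketch of the shortened call, assuming those two flags default to `False` in the installed `hqq` version so the simplified configs behave like the old ones; the backend line mirrors the PYTORCH backend used in the updated demos.

```python
from hqq.core.quantize import BaseQuantizeConfig, HQQLinear, HQQBackend

# Shortened config used by the backend demos: 4-bit, group size 64, axis=1.
# quant_scale / quant_zero are omitted, assuming they default to False.
quant_config = BaseQuantizeConfig(nbits=4, group_size=64, axis=1)

# The llama2 benchmark script quantizes along axis=0 instead:
# quant_config = BaseQuantizeConfig(nbits=4, group_size=64, axis=0)

# Backend selection, as in the updated examples.
HQQLinear.set_backend(HQQBackend.PYTORCH)
```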