
Commit

Merge branch 'mvafin/support_awq' of https://github.com/mvafin/optimum-intel into mvafin/support_awq
eaidova committed Dec 20, 2024
2 parents b0e4860 + 5d8bcb7 commit 630d36a
55 changes: 55 additions & 0 deletions tests/openvino/utils_tests.py
@@ -228,6 +228,61 @@ def get_num_quantized_nodes(model):
    return num_fake_quantize, num_weight_nodes


@contextmanager
def mock_torch_cuda_is_available(to_patch):
    original_is_available = torch.cuda.is_available
    if to_patch:
        torch.cuda.is_available = lambda: True
    try:
        yield
    finally:
        if to_patch:
            torch.cuda.is_available = original_is_available


@contextmanager
def patch_awq_for_inference(to_patch):
    orig_gemm_forward = None
    if to_patch:
        # patch GEMM module to allow inference without CUDA GPU
        from awq.modules.linear.gemm import WQLinearMMFunction
        from awq.utils.packing_utils import dequantize_gemm

        def new_forward(
            ctx,
            x,
            qweight,
            qzeros,
            scales,
            w_bit=4,
            group_size=128,
            bias=None,
            out_features=0,
        ):
            ctx.out_features = out_features

            out_shape = x.shape[:-1] + (out_features,)
            x = x.to(torch.float16)

            out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size)
            out = torch.matmul(x, out)

            out = out + bias if bias is not None else out
            out = out.reshape(out_shape)

            if len(out.shape) == 2:
                out = out.unsqueeze(0)
            return out

        orig_gemm_forward = WQLinearMMFunction.forward
        WQLinearMMFunction.forward = new_forward
    try:
        yield
    finally:
        if orig_gemm_forward is not None:
            WQLinearMMFunction.forward = orig_gemm_forward


def compare_num_quantized_nodes_per_model(
    test_case: unittest.TestCase,
    models: List[Union[ov.Model, OVBaseModel]],
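Not part of the commit, but a minimal sketch of how the two helpers added above might be combined in a test that exercises an AWQ-quantized checkpoint on a machine without a CUDA GPU. The import path, model id, and the use of transformers' AutoModelForCausalLM are placeholders/assumptions, not something this diff shows:

# Illustrative sketch only -- not part of this commit.
# `mock_torch_cuda_is_available` and `patch_awq_for_inference` are the helpers
# added above in tests/openvino/utils_tests.py; the model id is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils_tests import mock_torch_cuda_is_available, patch_awq_for_inference

MODEL_ID = "some-org/some-awq-quantized-model"  # placeholder

with mock_torch_cuda_is_available(True):
    # AWQ loading paths check torch.cuda.is_available(); pretend a GPU exists.
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer("Hello", return_tensors="pt")

with patch_awq_for_inference(True):
    # WQLinearMMFunction.forward now dequantizes and runs a plain matmul on CPU.
    output_ids = model.generate(**inputs, max_new_tokens=8)

print(tokenizer.decode(output_ids[0]))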
