Add lowerings for PT2E QDQ ops
PiperOrigin-RevId: 697880225
chunnienc authored and copybara-github committed Dec 3, 2024
1 parent a055e62 commit 4d8d0f4
Showing 7 changed files with 361 additions and 13 deletions.
48 changes: 48 additions & 0 deletions ai_edge_torch/_convert/test/test_convert.py
@@ -21,10 +21,12 @@
import ai_edge_torch
from ai_edge_torch import config
from ai_edge_torch._convert import conversion_utils
from ai_edge_torch.quantize import pt2e_quantizer
from ai_edge_torch.testing import model_coverage
import numpy as np
import torch
from torch import nn
from torch.ao.quantization import quantize_pt2e
import torchvision

from absl.testing import absltest as googletest
@@ -506,6 +508,52 @@ def forward(self, x):
model_coverage.compare_tflite_torch(edge_model, torch_module, args)
)

def test_convert_resnet18_pt2e_per_layer(self):
# Step 1: export resnet18
args = (torch.randn(1, 3, 224, 224),)
m = torchvision.models.resnet18().eval()
m = torch._export.capture_pre_autograd_graph(m, args)

# Step 2: Insert observers or fake quantize modules
quantizer = pt2e_quantizer.PT2EQuantizer().set_global(
pt2e_quantizer.get_symmetric_quantization_config(is_per_channel=False)
)
m = quantize_pt2e.prepare_pt2e(m, quantizer)

# Step 3: Quantize the model
m = quantize_pt2e.convert_pt2e(m, fold_quantize=False)

# pylint: disable=broad-except
try:
ai_edge_torch.convert(m, args)
except Exception as err:
self.fail(f"PT2E conversion failed: {err}")
# pylint: enable=broad-except

def test_convert_resnet18_pt2e_per_channel(self):
# Step 1: export resnet18
args = (torch.randn(1, 3, 224, 224),)
m = torchvision.models.resnet18().eval()
m = torch._export.capture_pre_autograd_graph(m, args)

# Step 2: Insert observers or fake quantize modules
quantizer = pt2e_quantizer.PT2EQuantizer().set_global(
pt2e_quantizer.get_symmetric_quantization_config(is_per_channel=True)
)
m = quantize_pt2e.prepare_pt2e(m, quantizer)
# Step 3: Run the model on the example inputs to calibrate; otherwise
# per-channel quant may end up with scalar scale/zero_point values.
m(*args)
# Step 4: Quantize the model
m = quantize_pt2e.convert_pt2e(m, fold_quantize=False)

# pylint: disable=broad-except
try:
ai_edge_torch.convert(m, args)
except Exception as err:
self.fail(f"PT2E conversion failed: {err}")
# pylint: enable=broad-except


if __name__ == "__main__":
googletest.main()
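For orientation, here is a minimal sketch (not part of this commit; the helper name and the expected output are illustrative) of how to confirm that the graph produced by convert_pt2e with fold_quantize=False still contains the quantized_decomposed Q/DQ ops that this change adds lowerings for, using the converted module m from either test above:

def print_qdq_targets(m):
  # Hedged helper, not from the commit: list the quantized_decomposed op
  # targets left in the FX graph after convert_pt2e(..., fold_quantize=False).
  seen = set()
  for node in m.graph.nodes:
    if node.op == "call_function" and "quantized_decomposed" in str(node.target):
      seen.add(str(node.target))
  for target in sorted(seen):
    print(target)

With the per-channel config, the printed targets are expected to include quantize_per_tensor/dequantize_per_tensor variants for activations and quantize_per_channel/dequantize_per_channel for weights.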
38 changes: 38 additions & 0 deletions ai_edge_torch/odml_torch/export.py
@@ -258,6 +258,43 @@ def rewrite_arange(node: torch.fx.Node):
rewrite_arange(node)


# TODO(b/331481564) Make this a ai_edge_torch FX pass.
def _convert_q_dq_per_channel_args_to_list(
exported_program: torch.export.ExportedProgram,
):
"""Resolve tensor inputs to Q/DQ ops as static number list for lowering.
This pass makes the ExportedProgram in a non-executable state. This pass must
be run after all run_decompositions calls.
"""
placeholder_nodes = [
n for n in exported_program.graph.nodes if n.op == "placeholder"
]
export_flat_args = _torch_future.graph_module_flat_inputs(
exported_program, *exported_program.example_inputs
)

placeholder_tensor = {
n: tensor for n, tensor in zip(placeholder_nodes, export_flat_args)
}

graph_module = exported_program.graph_module
for node in graph_module.graph.nodes:
if node.target in (
torch.ops.quantized_decomposed.quantize_per_channel.default,
torch.ops.quantized_decomposed.quantize_per_tensor.tensor,
torch.ops.quantized_decomposed.dequantize_per_channel.default,
torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
):
input, scale_node, zero_point_node = node.args[:3]
scale = placeholder_tensor[scale_node]
zero_point = placeholder_tensor[zero_point_node]

scale = scale.detach().numpy().tolist()
zero_point = zero_point.detach().numpy().tolist()
node.args = (input, scale, zero_point, *node.args[3:])
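As an illustration (hypothetical values, not from the commit), the pass rewrites the scale and zero_point arguments of a Q/DQ node from placeholder-backed tensors into plain Python lists that the lowerings can embed directly in the IR:

import torch

# Hedged illustration with made-up values: the tensors backing the scale and
# zero_point placeholders of a per-channel dequantize node.
scale_tensor = torch.tensor([0.1, 0.2, 0.3], dtype=torch.float64)
zero_point_tensor = torch.tensor([0, 0, 0])

# Before the pass, node.args references the placeholder nodes, e.g.
#   (x, scale_placeholder, zp_placeholder, 0, -128, 127, torch.int8)
# After the pass, the same node carries static lists instead:
args_after = (
    "x",  # stands in for the original input node
    scale_tensor.detach().numpy().tolist(),       # [0.1, 0.2, 0.3]
    zero_point_tensor.detach().numpy().tolist(),  # [0, 0, 0]
    0, -128, 127, torch.int8,
)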


def exported_program_to_mlir(
exported_program: torch.export.ExportedProgram,
) -> MlirLowered:
@@ -270,6 +307,7 @@ def exported_program_to_mlir(
exported_program = _torch_future.safe_run_decompositions(
exported_program, lowerings.decompositions()
)
_convert_q_dq_per_channel_args_to_list(exported_program)

with export_utils.create_ir_context() as context, ir.Location.unknown():

15 changes: 2 additions & 13 deletions ai_edge_torch/odml_torch/export_utils.py
@@ -14,9 +14,9 @@
# ==============================================================================
"""Utilities for ODML Torch export."""

import functools
import re
from typing import Sequence, cast
from ai_edge_torch.odml_torch.lowerings import utils as lowering_utils
import jax._src.interpreters.mlir
from jax._src.lib.mlir import ir
from jax._src.lib.mlir.dialects import func
@@ -47,7 +47,6 @@ def create_ir_context():
# TODO(b/362798610) Build MLIR pybinding in ai-edge-torch release.
context = jax._src.interpreters.mlir.make_ir_context()
context.allow_unregistered_dialects = True

return context


@@ -135,17 +134,7 @@ def build_ir_attr(val):
return ir.StringAttr.get(str(val))


def torch_dtype_to_ir_element_type(dtype):
ty_get = {
torch.double: ir.F64Type.get,
torch.float32: ir.F32Type.get,
torch.half: ir.F16Type.get,
torch.long: functools.partial(ir.IntegerType.get_signless, 64),
torch.int32: functools.partial(ir.IntegerType.get_signless, 32),
torch.int16: functools.partial(ir.IntegerType.get_signless, 16),
torch.bool: functools.partial(ir.IntegerType.get_signless, 1),
}[dtype]
return ty_get()
torch_dtype_to_ir_element_type = lowering_utils.torch_dtype_to_ir_element_type


def ir_element_type_to_torch_dtype(ty):
1 change: 1 addition & 0 deletions ai_edge_torch/odml_torch/lowerings/__init__.py
@@ -17,6 +17,7 @@
from . import _convolution
from . import _jax_lowerings
from . import _layer_norm
from . import _quantized_decomposed
from . import context
from . import registry
from . import utils
174 changes: 174 additions & 0 deletions ai_edge_torch/odml_torch/lowerings/_quantized_decomposed.py
@@ -0,0 +1,174 @@
# Copyright 2024 The AI Edge Torch Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Lowerings for PT2E torch.ops.quantized_decomposed ops."""
from typing import Union, cast

from ai_edge_torch.odml_torch.lowerings import context
from ai_edge_torch.odml_torch.lowerings import utils
from jax._src.lib.mlir import ir
from jax._src.lib.mlir.dialects import hlo as stablehlo
import torch
import torch.ao.quantization.fx._decomposed
import torch.utils._pytree as pytree

from . import registry

lower = registry.lower
LoweringContext = context.LoweringContext


def _uniform_quantized_type(
stored_type: str | ir.Type,
expressed_type: str | ir.Type,
*,
scale: float | list[float] | tuple[float, ...],
zero_point: float | list[float] | tuple[float, ...],
storage_type_min: int | None = None,
storage_type_max: int | None = None,
channel_axis: int | None = None,
channel_axis_size: int | None = None,
):
"""Polyfill for quant.UniformQuantizedType."""
if storage_type_min is not None and storage_type_max is not None:
storage_min_max = f"<{storage_type_min}:{storage_type_max}>"
else:
storage_min_max = ""

if channel_axis is not None:
# Per-channel quantization
# https://mlir.llvm.org/docs/Dialects/QuantDialect/#per-channel-quantization
assert isinstance(scale, (list, tuple))
assert isinstance(zero_point, (list, tuple))

if len(scale) == 1:
scale *= channel_axis_size
if len(zero_point) == 1:
zero_point *= channel_axis_size

assert len(scale) == len(zero_point) == channel_axis_size
scale_zp_strs = []
for s, zp in zip(scale, zero_point):
scale_zp_strs.append(f"{s}:{zp}")
scale_zp = "{" + ",".join(scale_zp_strs) + "}"
return ir.Type.parse(
f"!quant.uniform<{stored_type}{storage_min_max}:{expressed_type}:{channel_axis},{scale_zp}>"
)
else:
# Per-layer quantization
# https://mlir.llvm.org/docs/Dialects/QuantDialect/#per-layer-quantization
scale = pytree.tree_flatten([scale])[0][-1]
zero_point = pytree.tree_flatten([zero_point])[0][-1]
scale_zp = f"{scale}:{zero_point}"
return ir.Type.parse(
f"!quant.uniform<{stored_type}{storage_min_max}:{expressed_type},{scale_zp}>"
)
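For reference, a hedged sketch (example values are assumed, not taken from the commit) of the quant dialect type text this polyfill assembles; create_ir_context() allows unregistered dialects, which is what lets ir.Type.parse accept the quant dialect text:

from ai_edge_torch.odml_torch import export_utils

with export_utils.create_ir_context():
  # Per-layer (per-tensor) with storage bounds; assembled text:
  #   !quant.uniform<i8<-128:127>:f32,0.1:0>
  per_layer_qty = _uniform_quantized_type(
      "i8", "f32", scale=0.1, zero_point=0,
      storage_type_min=-128, storage_type_max=127,
  )
  # Per-channel on axis 0, one scale/zero_point pair per channel; assembled text:
  #   !quant.uniform<i8:f32:0,{0.1:0,0.2:0}>
  per_channel_qty = _uniform_quantized_type(
      "i8", "f32", scale=[0.1, 0.2], zero_point=[0, 0],
      channel_axis=0, channel_axis_size=2,
  )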


# The quant dialect is not registered in the Python MLIR pybinding used by
# odml-torch. Therefore, stablehlo.uniform_quantize/uniform_dequantize ops and
# quant types are represented as stablehlo.custom_call to pass MLIR
# verification and VHLO serialization before reaching the converter.
# TODO(b/362798610) Build MLIR pybinding in ai-edge-torch release.


# Schema:
# - quantized_decomposed::quantize_per_tensor(Tensor input, float scale,
# int zero_point, int quant_min, int quant_max,
# ScalarType dtype) -> Tensor
# - quantized_decomposed::quantize_per_tensor.tensor(Tensor input,
# Tensor scale, Tensor zero_point, int quant_min, int quant_max,
# ScalarType dtype) -> Tensor
#
# Tensor-valued scale and zero_point args are converted to static lists before
# lowering (see _convert_q_dq_per_channel_args_to_list in export.py).
@lower(torch.ops.quantized_decomposed.quantize_per_tensor)
def _quantize_per_tensor(
lctx: LoweringContext,
input: ir.Value,
scale: Union[float, list[float]],
zero_point: Union[float, list[float]],
quant_min: int,
quant_max: int,
dtype: torch.dtype,
):
input_ty = cast(ir.RankedTensorType, input.type)
qty = _uniform_quantized_type(
utils.torch_dtype_to_ir_element_type(dtype),
input_ty.element_type,
scale=scale,
zero_point=zero_point,
storage_type_min=quant_min,
storage_type_max=quant_max,
)
return stablehlo.custom_call(
call_target_name="odml_torch.uniform_quantize",
inputs=[input],
result=[input_ty],
backend_config=ir.StringAttr.get(
str(ir.RankedTensorType.get(input_ty.shape, qty))
),
)


# Schema:
# - quantized_decomposed::quantize_per_channel(Tensor input, Tensor scales,
# Tensor zero_points, int axis, int quant_min, int quant_max,
# ScalarType dtype) -> Tensor
#
# Tensor-valued scale and zero_point args are converted to static lists before
# lowering (see _convert_q_dq_per_channel_args_to_list in export.py).
@lower(torch.ops.quantized_decomposed.quantize_per_channel)
def _quantize_per_channel(
lctx: LoweringContext,
input: ir.Value,
scale: list[float],
zero_point: list[float],
axis: int,
quant_min: int,
quant_max: int,
dtype: torch.dtype,
):
input_ty = cast(ir.RankedTensorType, input.type)
qty = _uniform_quantized_type(
utils.torch_dtype_to_ir_element_type(dtype),
input_ty.element_type,
scale=scale,
zero_point=zero_point,
channel_axis=axis,
channel_axis_size=input_ty.shape[axis],
storage_type_min=quant_min,
storage_type_max=quant_max,
)
return stablehlo.custom_call(
call_target_name="odml_torch.uniform_quantize",
inputs=[input],
result=[input_ty],
backend_config=ir.StringAttr.get(
str(ir.RankedTensorType.get(input_ty.shape, qty))
),
)


@lower(torch.ops.quantized_decomposed.dequantize_per_tensor)
@lower(torch.ops.quantized_decomposed.dequantize_per_channel)
def _dequantize(lctx: LoweringContext, input: ir.Value, *args, **kwargs):
result_meta = lctx.node.meta.get("tensor_meta")
result_elty = utils.torch_dtype_to_ir_element_type(result_meta.dtype)

return stablehlo.custom_call(
call_target_name="odml_torch.uniform_dequantize",
inputs=[input],
result=[ir.RankedTensorType.get(result_meta.shape, result_elty)],
)
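To tie the pieces together, a hedged sketch (assumed shapes and values, not from the commit) of how a per-channel weight quantization ends up encoded: the custom_call keeps the float tensor type as its result, while the quantized tensor type travels only as the backend_config string:

from ai_edge_torch.odml_torch import export_utils
from jax._src.lib.mlir import ir

with export_utils.create_ir_context():
  # Assumed weight shape [2, 3, 3, 3], quantized per-channel along axis 0.
  qty = _uniform_quantized_type(
      "i8", "f32", scale=[0.1, 0.2], zero_point=[0, 0],
      channel_axis=0, channel_axis_size=2,
      storage_type_min=-127, storage_type_max=127,
  )
  backend_config = str(ir.RankedTensorType.get([2, 3, 3, 3], qty))
  # Expected to read roughly:
  #   tensor<2x3x3x3x!quant.uniform<i8<-127:127>:f32:0,{0.1:0,0.2:0}>>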
16 changes: 16 additions & 0 deletions ai_edge_torch/odml_torch/lowerings/utils.py
@@ -14,13 +14,29 @@
# ==============================================================================
"""Utilities for building MLIR lowerings."""

import functools
import numbers
from typing import Any
from typing import Optional

from jax._src.lib.mlir import ir
from jax._src.lib.mlir.dialects import hlo as stablehlo
import numpy as np
import torch


def torch_dtype_to_ir_element_type(dtype):
ty_get = {
torch.double: ir.F64Type.get,
torch.float32: ir.F32Type.get,
torch.half: ir.F16Type.get,
torch.long: functools.partial(ir.IntegerType.get_signless, 64),
torch.int32: functools.partial(ir.IntegerType.get_signless, 32),
torch.int16: functools.partial(ir.IntegerType.get_signless, 16),
torch.int8: functools.partial(ir.IntegerType.get_signless, 8),
torch.bool: functools.partial(ir.IntegerType.get_signless, 1),
}[dtype]
return ty_get()
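A minimal usage sketch for the helper above (it assumes the odml_torch modules shown are importable and that element types are created under an active MLIR context, such as the one returned by export_utils.create_ir_context()):

import torch
from ai_edge_torch.odml_torch import export_utils
from ai_edge_torch.odml_torch.lowerings import utils as lowering_utils

with export_utils.create_ir_context():
  int8_ty = lowering_utils.torch_dtype_to_ir_element_type(torch.int8)     # i8
  fp32_ty = lowering_utils.torch_dtype_to_ir_element_type(torch.float32)  # f32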


def splat(val, ty, shape=tuple(), *, loc: Optional[Any] = None):
(Diff for one additional changed file is not shown here.)
