Merge branch 'main' into lint_tutorials
jainapurva authored Jan 8, 2025
2 parents b6a0a83 + 070345d commit 784ccea
Showing 14 changed files with 1,270 additions and 448 deletions.
308 changes: 198 additions & 110 deletions examples/sam2_amg_server/compile_export_utils.py

Large diffs are not rendered by default.

512 changes: 316 additions & 196 deletions examples/sam2_amg_server/server.py

Large diffs are not rendered by default.

44 changes: 40 additions & 4 deletions torchao/_models/llama/generate.py
@@ -266,6 +266,7 @@ def main(
"checkpoints/meta-Transformer/Transformer-2-7b-chat-hf/model.pth"
),
quantization: Optional[str] = None,
min_sqnr: Optional[float] = None,
sparsity: Optional[str] = None,
kv_cache_quantization: bool = False,
cache_size: Optional[int] = None,
@@ -706,27 +707,31 @@ def ffn_or_attn_only(mod, fqn):
manual=True,
qtensor_class_list=torchao.quantization.DEFAULT_INT4_AUTOQUANT_CLASS_LIST,
example_input=inputs,
min_sqnr=min_sqnr,
)
elif "autoquant-float8" == quantization:
model = autoquant(
model,
manual=True,
qtensor_class_list=torchao.quantization.OTHER_AUTOQUANT_CLASS_LIST,
example_input=inputs,
min_sqnr=min_sqnr,
)
elif "autoquant-fp" == quantization:
model = autoquant(
model,
manual=True,
qtensor_class_list=torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST,
example_input=inputs,
min_sqnr=min_sqnr,
)
elif "autoquant-sparse" == quantization:
model = autoquant(
model,
manual=True,
qtensor_class_list=torchao.quantization.DEFAULT_SPARSE_AUTOQUANT_CLASS_LIST,
example_input=inputs,
min_sqnr=min_sqnr,
)
elif "autoquant-gemlite-int4" == quantization:
import os
@@ -742,6 +747,7 @@ def ffn_or_attn_only(mod, fqn):
manual=True,
qtensor_class_list=torchao.quantization.GEMLITE_INT4_AUTOQUANT_CLASS_LIST,
example_input=inputs,
min_sqnr=min_sqnr,
)
elif "autoquant-all" == quantization:
try:
@@ -761,9 +767,12 @@ def ffn_or_attn_only(mod, fqn):
manual=True,
qtensor_class_list=torchao.quantization.ALL_AUTOQUANT_CLASS_LIST,
example_input=inputs,
min_sqnr=min_sqnr,
)
else:
model = autoquant(model, manual=True, example_input=inputs)
model = autoquant(
model, manual=True, example_input=inputs, min_sqnr=min_sqnr
)

generate(
model,
@@ -1015,12 +1024,30 @@ def callback(x):
f.close()

if output_json_path:
headers = ["name", "dtype", "device", "arch", "metric", "actual", "target"]
headers = [
"name",
"dtype",
"min_sqnr",
"device",
"arch",
"metric",
"actual",
"target",
]
name = checkpoint_path.parent.name
arch = get_arch_name()
dtype = quantization or "noquant"
memory_result = [name, dtype, device, arch, "mem/s", bandwidth, None]
performance_result = [name, dtype, device, arch, "tok/s", tokpersec, None]
memory_result = [name, dtype, min_sqnr, device, arch, "mem/s", bandwidth, None]
performance_result = [
name,
dtype,
min_sqnr,
device,
arch,
"tok/s",
tokpersec,
None,
]
write_json_result = (
write_json_result_local if output_json_local else write_json_result_ossci
)
@@ -1073,6 +1100,14 @@ def callback(x):
+ "embed-int8wo, marlin_qqq, gemlite-<pack_bitwidth>-<nbits>-<groupsize>, int8adq-int4w-symm"
),
)
parser.add_argument(
"--min_sqnr",
type=float,
default=None,
help=(
    "min SQNR (dB) for quantizing vs. not quantizing a layer, used in autoquant options"
),
)
parser.add_argument(
"-s",
"--sparsity",
@@ -1148,6 +1183,7 @@ def callback(x):
args.temperature,
args.checkpoint_path,
args.quantization,
args.min_sqnr,
args.sparsity,
args.kv_cache_quantization,
args.cache_size,
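For context, here is a minimal sketch of how the new flag flows through autoquant in manual mode, mirroring the pattern in the diff above (the toy model, input, and the 40.0 dB threshold are illustrative, not defaults from this commit):

import torch
from torchao.quantization import autoquant

# Toy stand-in for the model being benchmarked (autoquant profiles kernels,
# so in practice this runs on a CUDA device).
model = torch.nn.Sequential(torch.nn.Linear(64, 64)).cuda().to(torch.bfloat16)
example_input = torch.randn(1, 64, device="cuda", dtype=torch.bfloat16)

# Quantized candidates whose output falls below min_sqnr (in dB, relative
# to the float output) are rejected, leaving the layer unquantized if none pass.
model = autoquant(model, manual=True, example_input=example_input, min_sqnr=40.0)
model(example_input)        # profile candidate kernels on a real input
model.finalize_autoquant()  # commit to the winning configuration

On the command line this corresponds to something like: python generate.py --quantization autoquant --min_sqnr 40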
33 changes: 30 additions & 3 deletions torchao/_models/sam/eval_combo.py
@@ -284,6 +284,7 @@ def run(
use_compile="False",
use_compile_decoder=False,
compress=None,
min_sqnr=None,
num_workers=0,
use_rel_pos=True,
pad_input_image_batch=True,
@@ -457,31 +458,38 @@ def mlp_only(mod, name):
example_input=example_input,
manual=True,
qtensor_class_list=torchao.quantization.DEFAULT_INT4_AUTOQUANT_CLASS_LIST,
min_sqnr=min_sqnr,
)
elif "autoquant-float8" == compress:
autoquant(
predictor.model.image_encoder,
example_input=example_input,
manual=True,
qtensor_class_list=torchao.quantization.OTHER_AUTOQUANT_CLASS_LIST,
min_sqnr=min_sqnr,
)
elif "autoquant-sparse" == compress:
autoquant(
predictor.model.image_encoder,
example_input=example_input,
manual=True,
qtensor_class_list=torchao.quantization.DEFAULT_SPARSE_AUTOQUANT_CLASS_LIST,
min_sqnr=min_sqnr,
)
elif "autoquant-all" == compress:
autoquant(
predictor.model.image_encoder,
example_input=example_input,
manual=True,
qtensor_class_list=torchao.quantization.ALL_AUTOQUANT_CLASS_LIST,
min_sqnr=min_sqnr,
)
else:
autoquant(
predictor.model.image_encoder, example_input=example_input, manual=True
predictor.model.image_encoder,
example_input=example_input,
manual=True,
min_sqnr=min_sqnr,
)
predictor.model.image_encoder(example_input)
predictor.model.image_encoder.finalize_autoquant()
@@ -630,20 +638,39 @@ def mlp_only(mod, name):
f.write(vals + "\n")

if output_json_path:
headers = ["name", "dtype", "device", "arch", "metric", "actual", "target"]
headers = [
"name",
"dtype",
"min_sqnr",
"device",
"arch",
"metric",
"actual",
"target",
]
name = sam_model_type
arch = get_arch_name()
dtype = compress or "noquant"
memory_result = [
name,
dtype,
min_sqnr,
device,
arch,
"memory(MiB)",
max_memory_allocated_bytes,
None,
]
performance_result = [name, dtype, device, arch, "img_s(avg)", img_s, None]
performance_result = [
name,
dtype,
min_sqnr,
device,
arch,
"img_s(avg)",
img_s,
None,
]
write_json_result = (
write_json_result_local if output_json_local else write_json_result_ossci
)
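For reference, the SQNR that min_sqnr thresholds against compares a layer's quantized output to its float output; a sketch of the usual definition (an illustrative helper, not code from this commit):

import torch

def sqnr_db(ref: torch.Tensor, quantized: torch.Tensor) -> float:
    # Signal-to-quantization-noise ratio in decibels; higher means the
    # quantized output tracks the float reference more closely.
    signal = torch.linalg.vector_norm(ref.float())
    noise = torch.linalg.vector_norm(ref.float() - quantized.float())
    return (20 * torch.log10(signal / noise)).item()

A min_sqnr of 40, for example, only accepts a quantized kernel whose output noise is at least a factor of 100 below the signal.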
6 changes: 4 additions & 2 deletions torchao/_models/utils.py
@@ -30,6 +30,7 @@ def write_json_result_ossci(output_json_path, headers, row):
"name": "TorchAO benchmark",
"mode": "inference",
"dtype": mapping_headers["dtype"],
"min_sqnr": mapping_headers["min_sqnr"],
"extra_info": {
"device": mapping_headers["device"],
"arch": mapping_headers["arch"],
@@ -38,7 +39,7 @@ def write_json_result_ossci(output_json_path, headers, row):
"model": {
"name": mapping_headers["name"],
"type": "model",
"origins": ["torchao/_models"],
"origins": ["torchao"],
},
"metric": {
"name": mapping_headers["metric"],
@@ -79,6 +80,7 @@ def write_json_result_local(output_json_path, headers, row):
"name": "TorchAO benchmark",
"mode": "inference",
"dtype": mapping_headers["dtype"],
"min_sqnr": mapping_headers["min_sqnr"],
"extra_info": {
"device": mapping_headers["device"],
"arch": mapping_headers["arch"],
@@ -87,7 +89,7 @@ def write_json_result_local(output_json_path, headers, row):
"model": {
"name": mapping_headers["name"],
"type": "model",
"origins": ["torchao/_models"],
"origins": ["torchao"],
},
"metric": {
"name": mapping_headers["metric"],
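Putting the two changes in this file together, each result row is paired with the headers and emitted as one benchmark record; a sketch of the resulting shape (row values are illustrative, and the metric field names are assumptions based on the surrounding code rather than lines visible in this diff):

headers = ["name", "dtype", "min_sqnr", "device", "arch", "metric", "actual", "target"]
row = ["llama-2-7b", "autoquant", 40.0, "cuda", "A100", "tok/s", 123.4, None]
mapping_headers = dict(zip(headers, row))
record = {
    "benchmark": {
        "name": "TorchAO benchmark",
        "mode": "inference",
        "dtype": mapping_headers["dtype"],
        "min_sqnr": mapping_headers["min_sqnr"],
        "extra_info": {
            "device": mapping_headers["device"],
            "arch": mapping_headers["arch"],
        },
    },
    "model": {
        "name": mapping_headers["name"],
        "type": "model",
        "origins": ["torchao"],
    },
    "metric": {
        "name": mapping_headers["metric"],
        "benchmark_values": [mapping_headers["actual"]],
        "target_value": mapping_headers["target"],
    },
}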
13 changes: 8 additions & 5 deletions torchao/csrc/cuda/sparse_marlin/marlin_kernel_nm.cu
@@ -401,10 +401,13 @@ __global__ void Marlin_24(
meta_ptr[i] += m_gl_rd_delta_o;
}
// Only fetch scales if this tile starts a new group
if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) {
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
s_gl_rd += s_gl_rd_delta;
if constexpr (group_blocks != -1) {
if (pipe % (group_blocks / thread_k_blocks) == 0) {
int4 *sh_s_stage = sh_s + s_sh_stage * pipe;
if (s_sh_wr_pred)
cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
s_gl_rd += s_gl_rd_delta;
}
}
}
// Insert a fence even when we are winding down the pipeline to ensure that
@@ -429,7 +432,7 @@ __global__ void Marlin_24(
// however, this does not seem to be a significant bottleneck, while some
// theoretically better attempts have led to bad instruction ordering by
// the compiler and correspondingly a noticeable drop in performance.
if (group_blocks != -1) {
if constexpr (group_blocks != -1) {
int4* sh_s_stage =
sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
(pipe / (group_blocks / thread_k_blocks)));
