Merge branch 'pytorch:main' into add_coat_optimizer
MirMustafaAli authored Dec 21, 2024
2 parents f9d0aa1 + eab345c commit 6eba1d1
Showing 89 changed files with 4,841 additions and 1,479 deletions.
33 changes: 28 additions & 5 deletions .github/workflows/dashboard_perf_test.yml
@@ -6,15 +6,15 @@ on:
- ciflow/benchmark/*
workflow_dispatch:
schedule:
- cron: 0 7 * * 0-6
- cron: 0 7 * * *

jobs:
benchmark:
runs-on: linux.aws.a100
strategy:
matrix:
torch-spec:
- '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu124'
- '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124'
steps:
- uses: actions/checkout@v3

@@ -31,14 +31,37 @@ jobs:
${CONDA_RUN} pip install ${{ matrix.torch-spec }}
${CONDA_RUN} pip install -r dev-requirements.txt
${CONDA_RUN} pip install .
# SAM 2.1
${CONDA_RUN} pip install -r examples/sam2_amg_server/requirements.txt
# llama3
export CHECKPOINT_PATH=checkpoints
export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
${CONDA_RUN} python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf --hf_token ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
${CONDA_RUN} python scripts/download.py --repo_id ${MODEL_REPO} --hf_token ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
${CONDA_RUN} python scripts/convert_hf_checkpoint.py --checkpoint_dir "${CHECKPOINT_PATH}/${MODEL_REPO}"
mkdir -p ${{ runner.temp }}/benchmark-results
${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --output_json_path ${{ runner.temp }}/benchmark-results/benchmark-results.json
# llama3 - compile baseline
${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
# llama3 - autoquant
${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
# skipping SAM because of https://hud.pytorch.org/pr/pytorch/ao/1407
# # SAM
# ${CONDA_RUN} pip install git+https://github.com/pytorch-labs/segment-anything-fast.git@main
# # SAM compile baseline
# ${CONDA_RUN} sh torchao/_models/sam/setup.sh
# ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
# ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
# SAM 2.1
# ${CONDA_RUN} sh scripts/download_sam2_ckpts.sh ${CHECKPOINT_PATH}/sam2
# cd examples/sam2_amg_server
# hydra.errors.MissingConfigException: Cannot find primary config 'configs/sam2.1/sam2.1_hiera_l.yaml'. Check that it's in your config search path.
# ${CONDA_RUN} python server.py ${CHECKPOINT_PATH}/sam2 large --port 4000 --host localhost --fast --benchmark --dry --output_json_path ${{ runner.temp }}/benchmark-results/sam2-benchmark-results.json
# ${CONDA_RUN} python server.py ${CHECKPOINT_PATH}/sam2 large --port 4000 --host localhost --fast --use_autoquant --benchmark --dry --output_json_path ${{ runner.temp }}/benchmark-results/sam2-benchmark-results.json
- name: Upload the benchmark results to OSS benchmark database for the dashboard
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
24 changes: 12 additions & 12 deletions .github/workflows/ruff_linter.yml
@@ -22,7 +22,7 @@ jobs:
permissions:
contents: write
pull-requests: write

strategy:
matrix:
python-version: ["3.9"]
@@ -33,43 +33,43 @@
PR_URL=${{ github.event.inputs.pr_url }}
PR_NUMBER=$(echo $PR_URL | grep -oE '[0-9]+$')
echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV
- uses: actions/checkout@v3
if: github.event_name == 'workflow_dispatch'
with:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}

- name: Checkout PR branch
if: github.event_name == 'workflow_dispatch'
run: |
gh pr checkout ${{ env.PR_NUMBER }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- uses: actions/checkout@v3
if: github.event_name != 'workflow_dispatch'
with:
fetch-depth: 0

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.6.8
- name: Regular lint check
if: github.event_name != 'workflow_dispatch'
run: |
ruff check .
# --isolated is used to skip the allowlist at all so this applies to all files
# please be careful when using this: large changes mean everyone needs to rebase
# if you do be sure to update .pre-commit-config.yaml
ruff check --isolated --select F821,F823,W191
ruff check --select F,I
ruff check
ruff format --check || {
echo "Ruff check failed, please try again after running 'ruff format'."
exit 1
@@ -80,11 +80,11 @@ jobs:
run: |
git config --global user.name 'github-actions[bot]'
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
# Apply fixes
ruff check --select F,I --fix
ruff check --fix
ruff format .
# Commit and push if there are changes
if [[ -n "$(git status --porcelain)" ]]; then
git add .
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -21,3 +21,9 @@ repos:
- F,I
# Run the formatter.
- id: ruff-format
# Run isolated checks.
- id: ruff
alias: ruff-isolated
args:
- --isolated
- --select=F821,F823,W191
9 changes: 8 additions & 1 deletion dev-requirements.txt
@@ -9,11 +9,18 @@ sentencepiece # for gpt-fast tokenizer
expecttest

# For prototype features and benchmarks
bitsandbytes #needed for testing triton quant / dequant ops for 8-bit optimizers
bitsandbytes # needed for testing triton quant / dequant ops for 8-bit optimizers
matplotlib
pandas
fire # QOL for commandline scripts
tabulate # QOL for printing tables to stdout
tiktoken
blobfile
lm_eval
# sam
diskcache
pycocotools
tqdm

# Custom CUDA Extensions
ninja
38 changes: 31 additions & 7 deletions examples/sam2_amg_server/server.py
@@ -28,6 +28,11 @@
import asyncio
from contextlib import asynccontextmanager
import contextlib
from torchao._models.utils import (
get_arch_name,
write_json_result_ossci,
write_json_result_local,
)

from torch._inductor import config as inductorconfig
inductorconfig.triton.unique_kernel_names = True
@@ -269,8 +274,10 @@ def benchmark_fn(func, inp, mask_generator, warmup=3, runs=10):
t = time.time()
for _ in range(runs):
func(inp, mask_generator)
print(f"Benchmark took {(time.time() - t)/runs}s per iteration.")
max_memory_allocated()
avg_time_per_run = (time.time() - t)/runs
print(f"Benchmark took {avg_time_per_run}s per iteration.")
max_memory_allocated_bytes, max_memory_allocated_percentage = max_memory_allocated()
return avg_time_per_run, max_memory_allocated_bytes, max_memory_allocated_percentage


def max_memory_allocated():
@@ -279,6 +286,7 @@ def max_memory_allocated():
max_memory_allocated_percentage = int(100 * (max_memory_allocated_bytes / total_memory))
max_memory_allocated_bytes = max_memory_allocated_bytes >> 20
print(f"max_memory_allocated_bytes: {max_memory_allocated_bytes}MiB or {max_memory_allocated_percentage}%")
return max_memory_allocated_bytes, max_memory_allocated_percentage


def unittest_fn(masks, ref_masks, order_by_area=False, verbose=False):
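The new benchmark_fn/max_memory_allocated contract above boils down to: warm up, time the runs, and report peak CUDA memory in MiB and as a percentage of device memory, returning the numbers instead of only printing them. A minimal standalone sketch of that pattern (helper names here are illustrative and assume a CUDA-enabled PyTorch build):

import time

import torch

def peak_cuda_memory(device=0):
    # Peak allocated bytes since the last reset, as MiB and as % of device memory.
    total_memory = torch.cuda.get_device_properties(device).total_memory
    peak_bytes = torch.cuda.max_memory_allocated(device)
    return peak_bytes >> 20, int(100 * (peak_bytes / total_memory))

def benchmark(func, *args, warmup=3, runs=10):
    for _ in range(warmup):
        func(*args)
    torch.cuda.reset_peak_memory_stats()
    t = time.time()
    for _ in range(runs):
        func(*args)
    avg_time_per_run = (time.time() - t) / runs
    peak_mib, peak_percentage = peak_cuda_memory()
    return avg_time_per_run, peak_mib, peak_percentage

Returning the measurements is what lets the output_json_path block added to main() further down reuse them.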
@@ -527,10 +535,10 @@ def set_furious(mask_generator):
mask_generator.predictor.model.sam_mask_decoder._src_dtype = torch.float16

def set_autoquant(mask_generator):
import torchao
from torchao import autoquant
from torchao.quantization import DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST
# NOTE: Not baseline feature
mask_generator.predictor.model.image_encoder = autoquant(mask_generator.predictor.model.image_encoder, qtensor_class_list=DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, min_sqnr=40)
mask_generator.predictor.model.image_encoder = autoquant(mask_generator.predictor.model.image_encoder, qtensor_class_list=torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, min_sqnr=40)
mask_generator.predictor._transforms_device = mask_generator.predictor.device
torch.set_float32_matmul_precision('high')
# NOTE: this fails when we run
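set_autoquant above wraps only the image encoder; the same call pattern works on any module. A minimal sketch, assuming a CUDA build of torchao and using a toy model as a stand-in for the SAM 2 image encoder:

import torch
import torchao

# Toy stand-in for mask_generator.predictor.model.image_encoder.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).cuda().to(torch.bfloat16)

model = torchao.autoquant(
    torch.compile(model),
    qtensor_class_list=torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST,
    min_sqnr=40,  # drop quantization choices whose SQNR falls below 40 dB
)

# The first forward pass is expected to trigger autoquant's per-layer
# benchmarking and finalize the kernel choices.
model(torch.randn(16, 1024, device="cuda", dtype=torch.bfloat16))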
@@ -556,7 +564,9 @@ def main(checkpoint_path,
dry=False,
batch_size=1,
load_fast="",
save_fast=""):
save_fast="",
output_json_path=None,
output_json_local=False):
if verbose:
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
@@ -626,9 +636,9 @@ def main(checkpoint_path,
if benchmark:
print(f"batch size {batch_size} dog benchmark")
if batch_size == 1:
benchmark_fn(image_tensor_to_masks, image_tensor, mask_generator)
result = benchmark_fn(image_tensor_to_masks, image_tensor, mask_generator)
else:
benchmark_fn(image_tensors_to_masks, [image_tensor] * batch_size, mask_generator)
result = benchmark_fn(image_tensors_to_masks, [image_tensor] * batch_size, mask_generator)

for i, shapes in enumerate([example_shapes(), example_shapes_2()]):
print(f"batch size {batch_size} example shapes {i} benchmark")
@@ -644,6 +654,20 @@ def main(checkpoint_path,
print("len(random_images): ", len(random_images))
benchmark_fn(image_tensors_to_masks, random_images, mask_generator)

if output_json_path:
headers = ["name", "dtype", "device", "arch", "metric", "actual", "target"]
name = "sam2-" + model_type
arch = get_arch_name()
dtype = "autoquant" if use_autoquant else "noquant"
avg_time_per_run, max_memory_allocated_bytes, max_memory_allocated_percentage = result
memory_result = [name, dtype, device, arch, "memory(MiB)", max_memory_allocated_bytes, None]
memory_percent_result = [name, dtype, device, arch, "memory(%)", max_memory_allocated_percentage, None]
performance_result = [name, dtype, device, arch, "time_s(avg)", avg_time_per_run, None]
write_json_result = write_json_result_local if output_json_local else write_json_result_ossci
write_json_result(output_json_path, headers, memory_result)
write_json_result(output_json_path, headers, memory_percent_result)
write_json_result(output_json_path, headers, performance_result)

if profile is not None:
print(f"Saving profile under {profile}")
if batch_size == 1:
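The output_json_path block above emits one flat row per metric, keyed by the headers list. As an illustrative stand-in only (the real write_json_result_local and write_json_result_ossci live in torchao._models.utils and may format entries differently; the example values, including the arch string, are placeholders):

import json
import os

def append_json_row(output_json_path, headers, row):
    # Zip one row with the headers and append it to a JSON list on disk.
    entry = dict(zip(headers, row))
    results = []
    if os.path.exists(output_json_path):
        with open(output_json_path) as f:
            results = json.load(f)
    results.append(entry)
    with open(output_json_path, "w") as f:
        json.dump(results, f, indent=2)

headers = ["name", "dtype", "device", "arch", "metric", "actual", "target"]
append_json_row(
    "sam2-benchmark-results.json",
    headers,
    ["sam2-large", "autoquant", "cuda", "NVIDIA A100", "time_s(avg)", 1.23, None],
)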
1 change: 1 addition & 0 deletions ruff.toml
@@ -20,4 +20,5 @@ include = [
"test/prototype/low_bit_optim/**.py",
]

lint.select = ["F", "I"]
lint.ignore = ["E731"]
6 changes: 3 additions & 3 deletions scripts/convert_hf_checkpoint.py
@@ -32,7 +32,7 @@ def convert_hf_checkpoint(
model_map_json_safetensors = checkpoint_dir / 'model.safetensors.index.json'
model_map_json_pytorch = checkpoint_dir / "pytorch_model.bin.index.json"
model_map_json = None

try:
assert model_map_json_safetensors.is_file()
model_map_json = model_map_json_safetensors
@@ -46,7 +46,7 @@
print(f"Found pytorch index at {model_map_json_pytorch}")
except AssertionError:
print(f"{model_map_json_pytorch} not found")

if model_map_json is None: raise Exception("No model map found!")

with open(model_map_json) as json_map:
@@ -85,7 +85,7 @@ def permute(w, n_head):
else:
state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
merged_result.update(state_dict)

if config.tie_word_embeddings:
merged_result["lm_head.weight"] = merged_result["model.embed_tokens.weight"].clone()

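The try/assert chain in convert_hf_checkpoint above is a two-step fallback: prefer model.safetensors.index.json, fall back to pytorch_model.bin.index.json, and fail if neither exists. A minimal sketch of the same lookup without assertion-based control flow (function name is illustrative):

from pathlib import Path

def find_model_map_json(checkpoint_dir: Path) -> Path:
    candidates = [
        checkpoint_dir / "model.safetensors.index.json",
        checkpoint_dir / "pytorch_model.bin.index.json",
    ]
    for candidate in candidates:
        if candidate.is_file():
            print(f"Found index at {candidate}")
            return candidate
    raise FileNotFoundError(f"No model map found in {checkpoint_dir}")

Plain is_file() checks also keep the logic intact when Python runs with -O, which strips assert statements.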
68 changes: 68 additions & 0 deletions scripts/download_sam2_ckpts.sh
@@ -0,0 +1,68 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Use either wget or curl to download the checkpoints
if command -v wget &> /dev/null; then
CMD="wget -P"
elif command -v curl &> /dev/null; then
CMD="curl -L -O"
else
echo "Please install wget or curl to download the checkpoints."
exit 1
fi

# Define the URLs for SAM 2 checkpoints
# SAM2_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/072824"
# sam2_hiera_t_url="${SAM2_BASE_URL}/sam2_hiera_tiny.pt"
# sam2_hiera_s_url="${SAM2_BASE_URL}/sam2_hiera_small.pt"
# sam2_hiera_b_plus_url="${SAM2_BASE_URL}/sam2_hiera_base_plus.pt"
# sam2_hiera_l_url="${SAM2_BASE_URL}/sam2_hiera_large.pt"

# Download each of the four checkpoints using wget
# echo "Downloading sam2_hiera_tiny.pt checkpoint..."
# $CMD $sam2_hiera_t_url || { echo "Failed to download checkpoint from $sam2_hiera_t_url"; exit 1; }

# echo "Downloading sam2_hiera_small.pt checkpoint..."
# $CMD $sam2_hiera_s_url || { echo "Failed to download checkpoint from $sam2_hiera_s_url"; exit 1; }

# echo "Downloading sam2_hiera_base_plus.pt checkpoint..."
# $CMD $sam2_hiera_b_plus_url || { echo "Failed to download checkpoint from $sam2_hiera_b_plus_url"; exit 1; }

# echo "Downloading sam2_hiera_large.pt checkpoint..."
# $CMD $sam2_hiera_l_url || { echo "Failed to download checkpoint from $sam2_hiera_l_url"; exit 1; }

# Define the URLs for SAM 2.1 checkpoints
SAM2p1_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/092824"
sam2p1_hiera_t_url="${SAM2p1_BASE_URL}/sam2.1_hiera_tiny.pt"
sam2p1_hiera_s_url="${SAM2p1_BASE_URL}/sam2.1_hiera_small.pt"
sam2p1_hiera_b_plus_url="${SAM2p1_BASE_URL}/sam2.1_hiera_base_plus.pt"
sam2p1_hiera_l_url="${SAM2p1_BASE_URL}/sam2.1_hiera_large.pt"

# $1 is the directory to store the checkpoint
DEFAULT_DIR=test
if [ -z "$1" ]; then
DIR_NAME=$DEFAULT_DIR
else
# Use provided directory name
DIR_NAME=$1
fi

# SAM 2.1 checkpoints
echo "Downloading sam2.1_hiera_tiny.pt checkpoint..."
$CMD $DIR_NAME $sam2p1_hiera_t_url || { echo "Failed to download checkpoint from $sam2p1_hiera_t_url"; exit 1; }

echo "Downloading sam2.1_hiera_small.pt checkpoint..."
$CMD $DIR_NAME $sam2p1_hiera_s_url || { echo "Failed to download checkpoint from $sam2p1_hiera_s_url"; exit 1; }

echo "Downloading sam2.1_hiera_base_plus.pt checkpoint..."
$CMD $DIR_NAME $sam2p1_hiera_b_plus_url || { echo "Failed to download checkpoint from $sam2p1_hiera_b_plus_url"; exit 1; }

echo "Downloading sam2.1_hiera_large.pt checkpoint..."
$CMD $DIR_NAME $sam2p1_hiera_l_url || { echo "Failed to download checkpoint from $sam2p1_hiera_l_url"; exit 1; }

echo "All checkpoints are downloaded successfully."
6 changes: 6 additions & 0 deletions scripts/run_ruff_fix.sh
@@ -0,0 +1,6 @@
ruff check . --fix
# --isolated is used to skip the allowlist at all so this applies to all files
# please be careful when using this: large changes mean everyone needs to rebase
ruff check --isolated --select F821,F823,W191 --fix
ruff check --select F,I --fix
ruff format .