From a60980573f50f4dcd502232708c9d20af8cfbb67 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 4 Dec 2024 16:20:47 -0800
Subject: [PATCH 1/9] wip: ROCm float8 (e4m3fnuz) benchmark setup for Meta-Llama-3.1-8B

---
 benchmark_requirements.txt                  |  59 ++++++++++
 scripts/convert_hf_checkpoint.py            |   4 +-
 scripts/prepare.sh                          |  12 +-
 torchao/_models/llama/benchmark_results.txt |  14 +++
 torchao/_models/llama/benchmarks.sh         | 118 ++++++++++----------
 torchao/_models/llama/generate.py           |   4 +-
 6 files changed, 142 insertions(+), 69 deletions(-)
 create mode 100644 benchmark_requirements.txt

diff --git a/benchmark_requirements.txt b/benchmark_requirements.txt
new file mode 100644
index 0000000000..20c2c964ee
--- /dev/null
+++ b/benchmark_requirements.txt
@@ -0,0 +1,59 @@
+Package             Version                    Editable project location
+------------------- -------------------------- -------------------------
+attrs               24.2.0
+black               24.8.0
+blobfile            3.0.0
+certifi             2024.8.30
+cffi                1.17.1
+charset-normalizer  3.4.0
+click               8.1.7
+expecttest          0.2.1
+filelock            3.16.1
+fsspec              2024.10.0
+huggingface-hub     0.26.2
+hypothesis          6.115.6
+idna                3.10
+Jinja2              3.1.4
+lxml                5.3.0
+markdown-it-py      2.2.0
+MarkupSafe          2.1.5
+mdurl               0.1.0
+mkl_fft             1.3.11
+mkl_random          1.2.8
+mkl-service         2.4.0
+mpmath              1.3.0
+mypy                1.11.2
+mypy-extensions     1.0.0
+networkx            3.4.2
+numpy               2.0.1
+packaging           24.1
+pathspec            0.10.3
+pillow              11.0.0
+pip                 24.3.1
+platformdirs        3.10.0
+protobuf            4.25.3
+psutil              5.9.0
+pycparser           2.21
+pycryptodomex       3.21.0
+Pygments            2.15.1
+pytorch-triton-rocm 3.1.0+cf34004b8a
+PyYAML              6.0.2
+regex               2024.11.6
+requests            2.32.3
+rich                13.7.1
+safetensors         0.4.5
+setuptools          75.1.0
+six                 1.16.0
+sortedcontainers    2.4.0
+sympy               1.13.1
+tiktoken            0.8.0
+tokenizers          0.20.3
+torch               2.6.0.dev20241122+rocm6.2
+torchao             0.7.0+git9bb1b230          /data/users/jessecai/ao
+torchaudio          2.5.0.dev20241125+rocm6.2
+torchvision         0.20.0.dev20241125+rocm6.2
+tqdm                4.67.1
+transformers        4.46.3
+typing_extensions   4.11.0
+urllib3             2.2.3
+wheel               0.44.0
diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index 11d425ceb2..7cf018b5fb 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -86,8 +86,8 @@ def permute(w, n_head):
            state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
            merged_result.update(state_dict)
     
-    if config.tie_word_embeddings:
-        merged_result["lm_head.weight"] = merged_result["model.embed_tokens.weight"].clone()
+    # if config.tie_word_embeddings:
+        # merged_result["lm_head.weight"] = merged_result["model.embed_tokens.weight"].clone()
 
     final_result = {}
     for key, value in merged_result.items():
diff --git a/scripts/prepare.sh b/scripts/prepare.sh
index db426e3b11..04c0558861 100644
--- a/scripts/prepare.sh
+++ b/scripts/prepare.sh
@@ -1,8 +1,8 @@
-python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf
-python scripts/download.py --repo_id meta-llama/Meta-Llama-3-8B
+#python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf
+#python scripts/download.py --repo_id meta-llama/Meta-Llama-3-8B
 python scripts/download.py --repo_id meta-llama/Meta-Llama-3.1-8B
-python scripts/download.py --repo_id meta-llama/Llama-3.2-3B
-python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf
-python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3-8B
+#python scripts/download.py --repo_id meta-llama/Llama-3.2-3B
+#python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf
+#python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3-8B
 python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3.1-8B
-python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-3.2-3B
+#python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-3.2-3B
diff --git a/torchao/_models/llama/benchmark_results.txt b/torchao/_models/llama/benchmark_results.txt
index d59c5f552e..1b85545d38 100644
--- a/torchao/_models/llama/benchmark_results.txt
+++ b/torchao/_models/llama/benchmark_results.txt
@@ -50,3 +50,17 @@ OTHER BENCHMARKS
 20240910010056, tok/s= 47.85, mem/s= 213.24 GB/s, peak_mem=11.85 GB, model_size= 4.46 GB quant: uintx-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
 20240910010647, tok/s= 34.83, mem/s= 261.42 GB/s, peak_mem=14.99 GB, model_size= 7.51 GB quant: uintx-2-8, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-2-8 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
 20240910110958, tok/s=223.95, mem/s= 682.88 GB/s, peak_mem= 5.59 GB, model_size= 3.05 GB quant: sparse-marlin, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
+
+20241125122729, tok/s=160.77, mem/s=2413.14 GB/s, peak_mem=16.60 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125123143, tok/s=213.99, mem/s=1609.22 GB/s, peak_mem=10.80 GB, model_size= 7.52 GB quant: int8wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125123322, tok/s=131.13, mem/s= 553.61 GB/s, peak_mem= 6.81 GB, model_size= 4.22 GB quant: int4wo-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125123409, tok/s=168.82, mem/s=1267.56 GB/s, peak_mem=11.83 GB, model_size= 7.51 GB quant: float8wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125123544, tok/s=141.95, mem/s=1065.36 GB/s, peak_mem=12.98 GB, model_size= 7.51 GB quant: float8dq-tensor, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8dq-tensor --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125123640, tok/s=142.23, mem/s=1067.49 GB/s, peak_mem=12.98 GB, model_size= 7.51 GB quant: float8dq-wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8dq-wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+
+20241125130306, tok/s=163.37, mem/s=2452.20 GB/s, peak_mem=16.60 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125130348, tok/s=214.74, mem/s=1614.87 GB/s, peak_mem=10.80 GB, model_size= 7.52 GB quant: int8wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125130423, tok/s=131.46, mem/s= 555.03 GB/s, peak_mem= 6.57 GB, model_size= 4.22 GB quant: int4wo-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125130601, tok/s=173.17, mem/s=1300.17 GB/s, peak_mem=11.83 GB, model_size= 7.51 GB quant: float8wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125130727, tok/s=138.31, mem/s=1038.01 GB/s, peak_mem=12.98 GB, model_size= 7.51 GB quant: float8dq-tensor, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8dq-tensor --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
+20241125130822, tok/s=140.46, mem/s=1054.20 GB/s, peak_mem=12.98 GB, model_size= 7.51 GB quant: float8dq-wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8dq-wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 
\ No newline at end of file
diff --git a/torchao/_models/llama/benchmarks.sh b/torchao/_models/llama/benchmarks.sh
index 63733c736d..c1a5545121 100644
--- a/torchao/_models/llama/benchmarks.sh
+++ b/torchao/_models/llama/benchmarks.sh
@@ -1,21 +1,21 @@
 export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
 
 # README BENCHMARKS
-export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt
+#export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt
 
-export MODEL_REPO=meta-llama/Meta-Llama-3-8B
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt
+#export MODEL_REPO=meta-llama/Meta-Llama-3-8B
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt
 
 export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
@@ -29,53 +29,53 @@ python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --co
 # OTHER BENCHMARKS
 
 # kv cache quantization
-export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization --linear_causal_mask
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization --linear_causal_mask
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization --linear_causal_mask
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization --linear_causal_mask
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization --linear_causal_mask
+#export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization --linear_causal_mask
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization --linear_causal_mask
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization --linear_causal_mask
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization --linear_causal_mask
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization --linear_causal_mask
 
-export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
+#export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
 
-export MODEL_REPO=meta-llama/Meta-Llama-3-8B
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
+#export MODEL_REPO=meta-llama/Meta-Llama-3-8B
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
+## python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
 
-# Different Batch Size Benchmarks
-export MODEL_REPO=meta-llama/Meta-Llama-3-8B
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 1
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 32
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 128
+## Different Batch Size Benchmarks
+#export MODEL_REPO=meta-llama/Meta-Llama-3-8B
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 1
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 32
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 128
 
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 1
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 32
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 128
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 1
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 32
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 128
 
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 1
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 32
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 128
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 1
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 32
+#python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 128
diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py
index 862f5d186d..11d19789bd 100644
--- a/torchao/_models/llama/generate.py
+++ b/torchao/_models/llama/generate.py
@@ -298,7 +298,7 @@ def main(
             group_size = int(_quant_args[2])
             quantize_(model, uintx_weight_only(dtype, group_size, use_hqq=use_hqq))
         elif "float8wo" in quantization:
-            quantize_(model, float8_weight_only())
+            quantize_(model, float8_weight_only(weight_dtype=torch.float8_e4m3fnuz))
         elif "float8dq" in quantization:
             granularity = str(quantization.split("-")[-1])
             if granularity=="tensor":
@@ -307,7 +307,7 @@ def main(
                 granularity = PerRow()
             else:
                 granularity = PerTensor()
-            quantize_(model, float8_dynamic_activation_float8_weight(granularity=granularity))
+            quantize_(model, float8_dynamic_activation_float8_weight(granularity=granularity, weight_dtype=torch.float8_e4m3fnuz, activation_dtype=torch.float8_e4m3fnuz))
         elif "autoquant_v2" in quantization:
             from torchao._models._eval import InputRecorder
             from torchao._models.llama.model import prepare_inputs_for_model

From aec9414f9ca5d36a1106e66496c895cfe9b1e7ef Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 4 Dec 2024 16:35:12 -0800
Subject: [PATCH 2/9] update benchmark requirements to pip freeze format

---
 benchmark_requirements.txt | 115 ++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 59 deletions(-)

diff --git a/benchmark_requirements.txt b/benchmark_requirements.txt
index 20c2c964ee..208b5c89ed 100644
--- a/benchmark_requirements.txt
+++ b/benchmark_requirements.txt
@@ -1,59 +1,56 @@
-Package             Version                    Editable project location
-------------------- -------------------------- -------------------------
-attrs               24.2.0
-black               24.8.0
-blobfile            3.0.0
-certifi             2024.8.30
-cffi                1.17.1
-charset-normalizer  3.4.0
-click               8.1.7
-expecttest          0.2.1
-filelock            3.16.1
-fsspec              2024.10.0
-huggingface-hub     0.26.2
-hypothesis          6.115.6
-idna                3.10
-Jinja2              3.1.4
-lxml                5.3.0
-markdown-it-py      2.2.0
-MarkupSafe          2.1.5
-mdurl               0.1.0
-mkl_fft             1.3.11
-mkl_random          1.2.8
-mkl-service         2.4.0
-mpmath              1.3.0
-mypy                1.11.2
-mypy-extensions     1.0.0
-networkx            3.4.2
-numpy               2.0.1
-packaging           24.1
-pathspec            0.10.3
-pillow              11.0.0
-pip                 24.3.1
-platformdirs        3.10.0
-protobuf            4.25.3
-psutil              5.9.0
-pycparser           2.21
-pycryptodomex       3.21.0
-Pygments            2.15.1
-pytorch-triton-rocm 3.1.0+cf34004b8a
-PyYAML              6.0.2
-regex               2024.11.6
-requests            2.32.3
-rich                13.7.1
-safetensors         0.4.5
-setuptools          75.1.0
-six                 1.16.0
-sortedcontainers    2.4.0
-sympy               1.13.1
-tiktoken            0.8.0
-tokenizers          0.20.3
-torch               2.6.0.dev20241122+rocm6.2
-torchao             0.7.0+git9bb1b230          /data/users/jessecai/ao
-torchaudio          2.5.0.dev20241125+rocm6.2
-torchvision         0.20.0.dev20241125+rocm6.2
-tqdm                4.67.1
-transformers        4.46.3
-typing_extensions   4.11.0
-urllib3             2.2.3
-wheel               0.44.0
+attrs @ file:///croot/attrs_1729089401488/work
+black @ file:///croot/black_1725573853246/work
+blobfile==3.0.0
+certifi==2024.8.30
+cffi @ file:///croot/cffi_1726856441404/work
+charset-normalizer==3.4.0
+click @ file:///work/perseverance-python-buildout/croot/click_1698845879718/work
+expecttest==0.2.1
+filelock==3.16.1
+fsspec==2024.10.0
+huggingface-hub==0.26.2
+hypothesis @ file:///croot/hypothesis_1730479536060/work
+idna==3.10
+Jinja2==3.1.4
+lxml==5.3.0
+markdown-it-py @ file:///work/perseverance-python-buildout/croot/markdown-it-py_1698846045803/work
+MarkupSafe==2.1.5
+mdurl @ file:///work/perseverance-python-buildout/croot/mdurl_1698845653285/work
+mkl-service==2.4.0
+mkl_fft @ file:///io/mkl313/mkl_fft_1730824109137/work
+mkl_random @ file:///io/mkl313/mkl_random_1730823916628/work
+mpmath==1.3.0
+mypy @ file:///croot/mypy-split_1725573876658/work
+mypy-extensions @ file:///work/perseverance-python-buildout/croot/mypy_extensions_1698863276135/work
+networkx==3.4.2
+numpy @ file:///croot/numpy_and_numpy_base_1725470312869/work/dist/numpy-2.0.1-cp312-cp312-linux_x86_64.whl#sha256=e2374991344fa2241a2153ef3d550d3a1cd2d50cb1f1d51eddc82870abc88021
+packaging @ file:///croot/packaging_1720101850331/work
+pathspec @ file:///work/perseverance-python-buildout/croot/pathspec_1698805478393/work
+pillow==11.0.0
+platformdirs @ file:///work/perseverance-python-buildout/croot/platformdirs_1701732573265/work
+protobuf==4.25.3
+psutil @ file:///work/perseverance-python-buildout/croot/psutil_1698863411559/work
+pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
+pycryptodomex==3.21.0
+Pygments @ file:///work/perseverance-python-buildout/croot/pygments_1698846270603/work
+pytorch-triton-rocm==3.1.0+cf34004b8a
+PyYAML @ file:///croot/pyyaml_1728657952215/work
+regex==2024.11.6
+requests==2.32.3
+rich @ file:///croot/rich_1720637495510/work
+safetensors==0.4.5
+setuptools==75.1.0
+six==1.16.0
+sortedcontainers @ file:///tmp/build/80754af9/sortedcontainers_1623949099177/work
+sympy==1.13.1
+tiktoken==0.8.0
+tokenizers==0.20.3
+torch==2.6.0.dev20241122+rocm6.2
+-e git+ssh://git@github.com/pytorch/ao.git@a60980573f50f4dcd502232708c9d20af8cfbb67#egg=torchao
+torchaudio==2.5.0.dev20241125+rocm6.2
+torchvision==0.20.0.dev20241125+rocm6.2
+tqdm==4.67.1
+transformers==4.46.3
+typing_extensions @ file:///croot/typing_extensions_1715268824938/work
+urllib3==2.2.3
+wheel==0.44.0

From 24ec5977a2d6db4a1b86e5424b7d276fc28be4d1 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 4 Dec 2024 16:39:25 -0800
Subject: [PATCH 3/9] pin benchmark requirements by version instead of local build paths

---
 benchmark_requirements.txt | 47 +++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/benchmark_requirements.txt b/benchmark_requirements.txt
index 208b5c89ed..99c44c76c7 100644
--- a/benchmark_requirements.txt
+++ b/benchmark_requirements.txt
@@ -1,56 +1,57 @@
-attrs @ file:///croot/attrs_1729089401488/work
-black @ file:///croot/black_1725573853246/work
+attrs==24.2.0
+black==24.8.0
 blobfile==3.0.0
 certifi==2024.8.30
-cffi @ file:///croot/cffi_1726856441404/work
+cffi==1.17.1
 charset-normalizer==3.4.0
-click @ file:///work/perseverance-python-buildout/croot/click_1698845879718/work
+click==8.1.7
 expecttest==0.2.1
 filelock==3.16.1
 fsspec==2024.10.0
 huggingface-hub==0.26.2
-hypothesis @ file:///croot/hypothesis_1730479536060/work
+hypothesis==6.115.6
 idna==3.10
 Jinja2==3.1.4
 lxml==5.3.0
-markdown-it-py @ file:///work/perseverance-python-buildout/croot/markdown-it-py_1698846045803/work
+markdown-it-py==2.2.0
 MarkupSafe==2.1.5
-mdurl @ file:///work/perseverance-python-buildout/croot/mdurl_1698845653285/work
+mdurl==0.1.0
+mkl_fft==1.3.11
+mkl_random==1.2.8
 mkl-service==2.4.0
-mkl_fft @ file:///io/mkl313/mkl_fft_1730824109137/work
-mkl_random @ file:///io/mkl313/mkl_random_1730823916628/work
 mpmath==1.3.0
-mypy @ file:///croot/mypy-split_1725573876658/work
-mypy-extensions @ file:///work/perseverance-python-buildout/croot/mypy_extensions_1698863276135/work
+mypy==1.11.2
+mypy-extensions==1.0.0
 networkx==3.4.2
-numpy @ file:///croot/numpy_and_numpy_base_1725470312869/work/dist/numpy-2.0.1-cp312-cp312-linux_x86_64.whl#sha256=e2374991344fa2241a2153ef3d550d3a1cd2d50cb1f1d51eddc82870abc88021
-packaging @ file:///croot/packaging_1720101850331/work
-pathspec @ file:///work/perseverance-python-buildout/croot/pathspec_1698805478393/work
+numpy==2.0.1
+packaging==24.1
+pathspec==0.10.3
 pillow==11.0.0
-platformdirs @ file:///work/perseverance-python-buildout/croot/platformdirs_1701732573265/work
+pip==24.3.1
+platformdirs==3.10.0
 protobuf==4.25.3
-psutil @ file:///work/perseverance-python-buildout/croot/psutil_1698863411559/work
-pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
+psutil==5.9.0
+pycparser==2.21
 pycryptodomex==3.21.0
-Pygments @ file:///work/perseverance-python-buildout/croot/pygments_1698846270603/work
+Pygments==2.15.1
 pytorch-triton-rocm==3.1.0+cf34004b8a
-PyYAML @ file:///croot/pyyaml_1728657952215/work
+PyYAML==6.0.2
 regex==2024.11.6
 requests==2.32.3
-rich @ file:///croot/rich_1720637495510/work
+rich==13.7.1
 safetensors==0.4.5
 setuptools==75.1.0
 six==1.16.0
-sortedcontainers @ file:///tmp/build/80754af9/sortedcontainers_1623949099177/work
+sortedcontainers==2.4.0
 sympy==1.13.1
 tiktoken==0.8.0
 tokenizers==0.20.3
 torch==2.6.0.dev20241122+rocm6.2
--e git+ssh://git@github.com/pytorch/ao.git@a60980573f50f4dcd502232708c9d20af8cfbb67#egg=torchao
+torchao==0.7.0+git9bb1b230
 torchaudio==2.5.0.dev20241125+rocm6.2
 torchvision==0.20.0.dev20241125+rocm6.2
 tqdm==4.67.1
 transformers==4.46.3
-typing_extensions @ file:///croot/typing_extensions_1715268824938/work
+typing_extensions==4.11.0
 urllib3==2.2.3
 wheel==0.44.0

From edad9f953c57d223da41ededace1cc03e4198c81 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 4 Dec 2024 16:46:31 -0800
Subject: [PATCH 4/9] move ROCm nightly PyTorch pins into pytorch_requirements.txt

---
 benchmark_requirements.txt |  5 -----
 pytorch_requirements.txt   | 10 ++++++++++
 2 files changed, 10 insertions(+), 5 deletions(-)
 create mode 100644 pytorch_requirements.txt

diff --git a/benchmark_requirements.txt b/benchmark_requirements.txt
index 99c44c76c7..9cfd06052f 100644
--- a/benchmark_requirements.txt
+++ b/benchmark_requirements.txt
@@ -34,7 +34,6 @@ psutil==5.9.0
 pycparser==2.21
 pycryptodomex==3.21.0
 Pygments==2.15.1
-pytorch-triton-rocm==3.1.0+cf34004b8a
 PyYAML==6.0.2
 regex==2024.11.6
 requests==2.32.3
@@ -46,10 +45,6 @@ sortedcontainers==2.4.0
 sympy==1.13.1
 tiktoken==0.8.0
 tokenizers==0.20.3
-torch==2.6.0.dev20241122+rocm6.2
-torchao==0.7.0+git9bb1b230
-torchaudio==2.5.0.dev20241125+rocm6.2
-torchvision==0.20.0.dev20241125+rocm6.2
 tqdm==4.67.1
 transformers==4.46.3
 typing_extensions==4.11.0
diff --git a/pytorch_requirements.txt b/pytorch_requirements.txt
new file mode 100644
index 0000000000..094ed0ebca
--- /dev/null
+++ b/pytorch_requirements.txt
@@ -0,0 +1,10 @@
+transformers==4.46.3
+typing_extensions==4.11.0
+urllib3==2.2.3
+wheel==0.44.0
+
+--index-url https://download.pytorch.org/whl/nightly/rocm6.2
+torch==2.6.0.dev20241122+rocm6.2
+pytorch-triton-rocm==3.1.0+cf34004b8a
+torchaudio==2.5.0.dev20241125+rocm6.2
+torchvision==0.20.0.dev20241125+rocm6.2

From 421709d5c3f5b14cfb01847139709b25a6fb30f3 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 4 Dec 2024 16:51:16 -0800
Subject: [PATCH 5/9] remove non-PyTorch pins duplicated in pytorch_requirements.txt

---
 pytorch_requirements.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/pytorch_requirements.txt b/pytorch_requirements.txt
index 094ed0ebca..f06e208742 100644
--- a/pytorch_requirements.txt
+++ b/pytorch_requirements.txt
@@ -1,8 +1,3 @@
-transformers==4.46.3
-typing_extensions==4.11.0
-urllib3==2.2.3
-wheel==0.44.0
-
 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
 torch==2.6.0.dev20241122+rocm6.2
 pytorch-triton-rocm==3.1.0+cf34004b8a

From e374ac04f61ea6cccb324edd5249dce29e6befac Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 4 Dec 2024 16:54:07 -0800
Subject: [PATCH 6/9] wip: add unpinned safetensors and requests to benchmark requirements

---
 benchmark_requirements.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/benchmark_requirements.txt b/benchmark_requirements.txt
index 9cfd06052f..1841f561cd 100644
--- a/benchmark_requirements.txt
+++ b/benchmark_requirements.txt
@@ -50,3 +50,6 @@ transformers==4.46.3
 typing_extensions==4.11.0
 urllib3==2.2.3
 wheel==0.44.0
+
+safetensors
+requests

From f66b270798298f3f4b576f3c9d4bb154b9b0ddaf Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 4 Dec 2024 16:55:47 -0800
Subject: [PATCH 7/9] unpin mkl_fft and drop duplicate safetensors/requests entries

---
 benchmark_requirements.txt | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/benchmark_requirements.txt b/benchmark_requirements.txt
index 1841f561cd..cc2d8f1388 100644
--- a/benchmark_requirements.txt
+++ b/benchmark_requirements.txt
@@ -16,7 +16,7 @@ lxml==5.3.0
 markdown-it-py==2.2.0
 MarkupSafe==2.1.5
 mdurl==0.1.0
-mkl_fft==1.3.11
+mkl_fft
 mkl_random==1.2.8
 mkl-service==2.4.0
 mpmath==1.3.0
@@ -50,6 +50,3 @@ transformers==4.46.3
 typing_extensions==4.11.0
 urllib3==2.2.3
 wheel==0.44.0
-
-safetensors
-requests

From e12bf4523dbf2d01c344c5c1df8f3367d03e51ba Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 4 Dec 2024 16:57:27 -0800
Subject: [PATCH 8/9] unpin mkl libs

---
 benchmark_requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark_requirements.txt b/benchmark_requirements.txt
index cc2d8f1388..9614ba29aa 100644
--- a/benchmark_requirements.txt
+++ b/benchmark_requirements.txt
@@ -17,8 +17,8 @@ markdown-it-py==2.2.0
 MarkupSafe==2.1.5
 mdurl==0.1.0
 mkl_fft
-mkl_random==1.2.8
-mkl-service==2.4.0
+mkl_random
+mkl-service
 mpmath==1.3.0
 mypy==1.11.2
 mypy-extensions==1.0.0

From 6a30ecd05bd8bdc218b2fc8ca6a56a30337b05a9 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 4 Dec 2024 16:58:20 -0800
Subject: [PATCH 9/9] remove mkl libs

---
 benchmark_requirements.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/benchmark_requirements.txt b/benchmark_requirements.txt
index 9614ba29aa..956ea83123 100644
--- a/benchmark_requirements.txt
+++ b/benchmark_requirements.txt
@@ -16,9 +16,6 @@ lxml==5.3.0
 markdown-it-py==2.2.0
 MarkupSafe==2.1.5
 mdurl==0.1.0
-mkl_fft
-mkl_random
-mkl-service
 mpmath==1.3.0
 mypy==1.11.2
 mypy-extensions==1.0.0