[wip] AMD Benchmarks #1383

Open · wants to merge 9 commits into main
benchmark_requirements.txt (new file, +49 lines)
@@ -0,0 +1,49 @@
attrs==24.2.0
black==24.8.0
blobfile==3.0.0
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.4.0
click==8.1.7
expecttest==0.2.1
filelock==3.16.1
fsspec==2024.10.0
huggingface-hub==0.26.2
hypothesis==6.115.6
idna==3.10
Jinja2==3.1.4
lxml==5.3.0
markdown-it-py==2.2.0
MarkupSafe==2.1.5
mdurl==0.1.0
mpmath==1.3.0
mypy==1.11.2
mypy-extensions==1.0.0
networkx==3.4.2
numpy==2.0.1
packaging==24.1
pathspec==0.10.3
pillow==11.0.0
pip==24.3.1
platformdirs==3.10.0
protobuf==4.25.3
psutil==5.9.0
pycparser==2.21
pycryptodomex==3.21.0
Pygments==2.15.1
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
rich==13.7.1
safetensors==0.4.5
setuptools==75.1.0
six==1.16.0
sortedcontainers==2.4.0
sympy==1.13.1
tiktoken==0.8.0
tokenizers==0.20.3
tqdm==4.67.1
transformers==4.46.3
typing_extensions==4.11.0
urllib3==2.2.3
wheel==0.44.0
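
All 49 pins install in one step once a virtualenv is active. A minimal setup sketch (the .venv path is an illustrative assumption, not part of this PR):

    python -m venv .venv
    source .venv/bin/activate
    pip install -r benchmark_requirements.txt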
pytorch_requirements.txt (new file, +5 lines)
@@ -0,0 +1,5 @@
--index-url https://download.pytorch.org/whl/nightly/rocm6.2
torch==2.6.0.dev20241122+rocm6.2
pytorch-triton-rocm==3.1.0+cf34004b8a
torchaudio==2.5.0.dev20241125+rocm6.2
torchvision==0.20.0.dev20241125+rocm6.2
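
The wheels above only resolve against the ROCm 6.2 nightly index named on the first line. A hedged install-and-verify sketch; the check uses torch.version.hip, which is set only on ROCm builds, where AMD GPUs still surface through the torch.cuda API:

    pip install -r pytorch_requirements.txt
    # Expect a +rocm6.2 version string, a HIP version, and True for GPU availability.
    python -c "import torch; print(torch.__version__, torch.version.hip, torch.cuda.is_available())"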
scripts/convert_hf_checkpoint.py (2 additions, 2 deletions)
@@ -86,8 +86,8 @@ def permute(w, n_head):
         state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
         merged_result.update(state_dict)
 
-    if config.tie_word_embeddings:
-        merged_result["lm_head.weight"] = merged_result["model.embed_tokens.weight"].clone()
+    # if config.tie_word_embeddings:
+    #     merged_result["lm_head.weight"] = merged_result["model.embed_tokens.weight"].clone()
 
     final_result = {}
     for key, value in merged_result.items():
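
The hunk above disables weight tying unconditionally. A more defensive sketch (a suggestion, not code from this PR): tie lm_head only when the config requests it and the checkpoint shipped no lm_head.weight of its own, so tied models such as Llama 3.2 3B keep working without commenting code in and out:

    # Hypothetical replacement for the commented-out block above.
    if getattr(config, "tie_word_embeddings", False) and "lm_head.weight" not in merged_result:
        merged_result["lm_head.weight"] = merged_result["model.embed_tokens.weight"].clone()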
scripts/prepare.sh (6 additions, 6 deletions)
@@ -1,8 +1,8 @@
-python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf
-python scripts/download.py --repo_id meta-llama/Meta-Llama-3-8B
+#python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf
+#python scripts/download.py --repo_id meta-llama/Meta-Llama-3-8B
 python scripts/download.py --repo_id meta-llama/Meta-Llama-3.1-8B
-python scripts/download.py --repo_id meta-llama/Llama-3.2-3B
-python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf
-python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3-8B
+#python scripts/download.py --repo_id meta-llama/Llama-3.2-3B
+#python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf
+#python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3-8B
 python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3.1-8B
-python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-3.2-3B
+#python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-3.2-3B
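
With this change, selecting models means commenting lines in and out. A loop-based sketch that may be easier to maintain (MODEL_REPOS and its contents are illustrative assumptions, not part of this PR):

    #!/bin/bash
    # Download and convert each benchmark model in turn.
    MODEL_REPOS=("meta-llama/Meta-Llama-3.1-8B" "meta-llama/Llama-3.2-3B")
    for repo in "${MODEL_REPOS[@]}"; do
        python scripts/download.py --repo_id "$repo"
        python scripts/convert_hf_checkpoint.py --checkpoint_dir "checkpoints/$repo"
    done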
torchao/_models/llama/benchmark_results.txt (14 additions)
@@ -50,3 +50,17 @@ OTHER BENCHMARKS
20240910010056, tok/s= 47.85, mem/s= 213.24 GB/s, peak_mem=11.85 GB, model_size= 4.46 GB quant: uintx-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
20240910010647, tok/s= 34.83, mem/s= 261.42 GB/s, peak_mem=14.99 GB, model_size= 7.51 GB quant: uintx-2-8, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-2-8 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
20240910110958, tok/s=223.95, mem/s= 682.88 GB/s, peak_mem= 5.59 GB, model_size= 3.05 GB quant: sparse-marlin, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8

20241125122729, tok/s=160.77, mem/s=2413.14 GB/s, peak_mem=16.60 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125123143, tok/s=213.99, mem/s=1609.22 GB/s, peak_mem=10.80 GB, model_size= 7.52 GB quant: int8wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125123322, tok/s=131.13, mem/s= 553.61 GB/s, peak_mem= 6.81 GB, model_size= 4.22 GB quant: int4wo-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125123409, tok/s=168.82, mem/s=1267.56 GB/s, peak_mem=11.83 GB, model_size= 7.51 GB quant: float8wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125123544, tok/s=141.95, mem/s=1065.36 GB/s, peak_mem=12.98 GB, model_size= 7.51 GB quant: float8dq-tensor, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8dq-tensor --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125123640, tok/s=142.23, mem/s=1067.49 GB/s, peak_mem=12.98 GB, model_size= 7.51 GB quant: float8dq-wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8dq-wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8

20241125130306, tok/s=163.37, mem/s=2452.20 GB/s, peak_mem=16.60 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125130348, tok/s=214.74, mem/s=1614.87 GB/s, peak_mem=10.80 GB, model_size= 7.52 GB quant: int8wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125130423, tok/s=131.46, mem/s= 555.03 GB/s, peak_mem= 6.57 GB, model_size= 4.22 GB quant: int4wo-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125130601, tok/s=173.17, mem/s=1300.17 GB/s, peak_mem=11.83 GB, model_size= 7.51 GB quant: float8wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125130727, tok/s=138.31, mem/s=1038.01 GB/s, peak_mem=12.98 GB, model_size= 7.51 GB quant: float8dq-tensor, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8dq-tensor --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241125130822, tok/s=140.46, mem/s=1054.20 GB/s, peak_mem=12.98 GB, model_size= 7.51 GB quant: float8dq-wo, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization float8dq-wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
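
A note on reading these rows: mem/s appears to be derived as model_size * tok/s, i.e. each decoded token streams the full weight set once, so the column is an achieved-bandwidth estimate rather than an independent measurement. A quick sanity check against the first bf16 row (the formula is inferred from the numbers, not documented in this diff):

    # 15.01 GB of weights read once per generated token.
    model_size_gb, tok_per_s = 15.01, 160.77
    print(f"mem/s ~= {model_size_gb * tok_per_s:.2f} GB/s")  # 2413.16, matching the logged 2413.14 up to rounding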