ort-llm.py
#
# Simple LLM benchmark using onnxruntime.
#
# Models are referred to by nicknames (e.g. llama3.2-1b) that are mapped to local paths via the
# MODELS dictionary a few lines down; paths are resolved relative to --root (default: $PWD/models/).
#
# prep:
# pip install transformers onnxruntime
#
# run:
# python ort-llm.py --model llama3.2-1b --provider webgpu --use-iob
# python ort-llm.py --model llama3.2-1b --fp32 --provider webgpu --use-iob
#
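# Expected on-disk layout per model, inferred from the session setup below (the exact file
# names are an assumption; adjust them to match your export):
#   models/<path>/                        tokenizer and config files loaded via transformers
#   models/<path>/onnx/model_q4f16.onnx   default (int4 weights, fp16 activations)
#   models/<path>/onnx/model_q4.onnx      used with --fp32
#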
import argparse
import json
import time
from transformers import AutoTokenizer, AutoConfig
import onnxruntime as ort
import numpy as np
import os
import sys
import shutil
MODELS = {
"llama3.2-1b": {"name": "llama3.2-1B", "path": "tjs/Llama-3.2-1B-Instruct"},
"llama3.2-1b-gqa": {"name": "llama3.2-1B-gqa", "path": "tjs/Llama-3.2-1B-Instruct-gqa"},
"llama3.2-3b": {"name": "llama3.2-3B", "path": "tjs/Llama-3.2-3B-Instruct"},
"phi3.5": {"name": "phi3.5", "path": "microsoft/Phi-3.5-mini-instruct-onnx-web"},
"phi3.5-gqa": {"name": "phi3.5-gqa", "path": "microsoft/Phi-3.5-mini-instruct-gqa"},
"phi3.5-128": {"name": "phi3.5-128", "path": "microsoft/Phi-3.5-mini-instruct-128"},
"phi3": {"name": "phi3", "path": "microsoft/Phi-3-mini-4k-instruct-onnx-web"},
"phi3-dml": {"name": "phi3", "path": "microsoft/Phi-3-mini-4k-instruct-onnx-directml"},
"gemma-2b": {"name": "gemma-2b", "path": "Xenova/gemma-2-2b-it-onnx-web"},
"smollm": {"name": "smollm", "path": "Xenova/smollm-360M-instruct-add-basics"},
"smollmv2": {"name": "smollmv2", "path": "tjs/SmolLM2-135M-Instruct-onnx-web"},
"tinyllama": {"name": "tinyllama", "path": "schmuell/TinyLlama-1.1B-Chat-v1.0"},
"qwen2.5-0.5b": {"name": "qwen2.5-0.5b", "path": "tjs/Qwen2.5-0.5B-Instruct-onnx-web"},
"foo": {"name": "foo", "path": "tjs/foo"},
}
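# To benchmark a model that is not listed above, add an entry mapping a nickname to a directory
# under --root, for example (hypothetical name and path):
#   "my-model": {"name": "my-model", "path": "local/My-Model-onnx-web"},
# and run: python ort-llm.py --model my-model --provider webgpu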
SUM = """
Summarize:
Constantinople, now known as Istanbul in modern Turkey, was a historically significant city that served as the capital of both the Roman/Byzantine
Empire and the Ottoman Empire. Its rich history spans over 2,500 years, with its strategic location at the crossroads between Europe and Asia contributing
to its prominence throughout various periods. The city was originally founded by Greek colonists from Megara as Byzantium around 657 BC. It became a
significant center of trade due to its position on the Bosphorus, controlling passage between the Black Sea and the Mediterranean. However, it gained
even greater importance after Emperor Constantine I (Constantinus) relocated his capital there in 324 AD, thus renaming it Constantinople.
The Byzantine Empire developed into a major hub of Christian culture and religion, with Hagia Sophia being one of its most iconic structures
built during the reign of Emperor Justinian I. The city flourished as an artistic and intellectual center until the 12th century when it faced
significant challenges from various invaders, including Arabs, Bulgarians, Crusaders, and Venetians.
In 1453, Constantinople fell to the Ottoman Empire after a protracted siege led by Sultan Mehmed II. The city was renamed Istanbul as part of the empire's
policy of Islamization, but it retained much of its Greek Byzantine culture and architecture under the new rule. Today, Istanbul is Turkey's largest city and
an important cultural, economic, and transportation hub. The historical significance of Constantinople/Istanbul lies in its architectural landmarks
such as Hagia Sophia, The Hippodrome (now Sultanahmet Square), the Chora Church, and many more that showcase a blend of Byzantine, Roman, and Ottoman influences.
"""
SUM_LONG = """
Summarize:
As of now, **NVIDIA Nsight Graphics** does not natively support **WebGPU**, because WebGPU is a relatively new API designed to
offer more modern graphics capabilities directly from web browsers, whereas Nsight focuses primarily on low-level graphics APIs like Vulkan, Direct3D, and OpenGL. However, there are still ways to profile **WebGPU shaders**, either by using WebGPU-specific tools or by leveraging alternative strategies in conjunction with Nsight.
### Here's how you can approach profiling WebGPU shaders:
### 1. **Browser's Developer Tools**
WebGPU runs inside web browsers, so the primary way to profile WebGPU shaders is through the browser's built-in developer tools.
- **Google Chrome** (and other Chromium-based browsers) have a WebGPU implementation. The **Performance tab** can give some general insight into GPU workload and execution times.
- However, this is not as detailed as using tools like Nsight for native APIs like Vulkan.
You can follow these steps for basic profiling:
- Open **Chrome DevTools** (F12 or right-click and choose Inspect).
- Go to the **Performance** tab.
- Start recording while running your WebGPU workload.
- Stop the recording to analyze the GPU time and function calls.
For **WebGPU shader debugging**, WebGPU currently doesn't provide as many sophisticated tools for low-level shader profiling as Vulkan or DirectX.
### 2. **Emulate WebGPU with Vulkan and Use Nsight**
To use **NVIDIA Nsight Graphics** or other advanced GPU profiling tools (which are generally more mature than WebGPU's current ecosystem), you can take an indirect approach:
- **WGPU** is a popular Rust-based implementation of WebGPU, which can compile WebGPU code to Vulkan, Metal, or Direct3D 12.
- By targeting **Vulkan** in a WebGPU project (using WGPU), you can capture the Vulkan API traces in Nsight Graphics.
- This way, you can benefit from Nsight's advanced GPU profiling and shader analysis tools by profiling the Vulkan backend that your WebGPU project uses.
Steps:
1. Set up **WGPU** (or another WebGPU-to-Vulkan translation layer).
2. Run your WebGPU code, but have it target the Vulkan backend.
3. Open **NVIDIA Nsight Graphics** and capture a frame from the Vulkan-based WebGPU app.
4. Analyze shader execution times, memory usage, and other GPU performance metrics using Nsight's tools.
This approach allows you to leverage Vulkan profiling tools on WebGPU projects indirectly.
### 3. **Shader Profiling with Spirv-Cross**
WebGPU shaders are written in **WGSL** or can be translated from HLSL/GLSL into SPIR-V using tools like **SPIRV-Cross**. If you're using SPIR-V shaders, Nsight Graphics can profile them once you've translated the WebGPU pipeline to Vulkan or other supported APIs.
- **SPIR-V** shader code can be compiled using tools like **glslang** or **SPIRV-Tools** to ensure it's compatible with Vulkan.
- Profiling SPIR-V shaders is supported by Nsight, so once you have your WebGPU shaders translated to SPIR-V, you can take advantage of Nsight's shader analysis capabilities.
### 4. **Monitor GPU Performance via Chrome's Internals**
Although Nsight Graphics doesn't support WebGPU directly, you can monitor GPU usage through **Chrome’s Task Manager** (Shift + Esc) to get rough insights into GPU memory usage and execution.
Additionally, Chrome flags like `--enable-gpu-benchmarking` and `--enable-webgpu` might give you more low-level insight into how WebGPU commands are being dispatched.
### 5. **Wait for WebGPU Toolchain Maturity**
As WebGPU matures, tools specifically designed to profile and debug WebGPU shaders will become more common. For example, upcoming features in **Google Chrome DevTools** and other WebGPU-focused browser tools could make shader profiling easier and more accessible without relying on Vulkan backends.
### Conclusion
1. **Direct Nsight support for WebGPU** is currently not available.
2. You can **use browser developer tools** (like Chrome's Performance tab) for high-level profiling.
3. **Convert WebGPU to Vulkan** via **WGPU** or similar projects to profile using Nsight.
4. Use **SPIR-V shaders** for more direct shader profiling via Nsight in Vulkan-based projects.
While the tools for WebGPU shader profiling are not as mature as those for Vulkan or DirectX, the combination of browser tools and Vulkan translation layers can provide insights for performance tuning.
"""
TASK = {
"easy": "Tell me about Constantinople.",
"sum": SUM,
"sum_long": SUM_LONG,
"lha": "Tell me about the lighthouse of alexandria with details.",
}
def get_args():
    parser = argparse.ArgumentParser(description='simple LLM benchmark using onnxruntime')
    parser.add_argument('--model', required=True, choices=MODELS.keys(), help='model nickname (key into MODELS)')
    parser.add_argument('--provider', default="cpu", choices=["cpu", "cuda", "webgpu", "dml"], help='execution provider')
    parser.add_argument('--max_tokens', type=int, default=9999, help='maximum number of new tokens to generate')
    parser.add_argument('--profile', help='enable ort profiling and write the trace to this file')
    parser.add_argument('--verbose', action='store_true', help='verbose ort logging')
    parser.add_argument('--fp32', action='store_true', help='use the fp32 model (model_q4.onnx); fp16 (model_q4f16.onnx) is the default')
    parser.add_argument('--use-iob', action='store_true', help='use io-bindings to keep the kv-cache on the device')
    parser.add_argument('--quiet', action='store_true', help='no output during the run')
    parser.add_argument('--values', help='dump first-step output values to this json file')
    parser.add_argument('--csv', help='append results to this csv file')
    parser.add_argument('--tag', default="test", help='tag recorded in the csv output')
    parser.add_argument('--platform', default="", help='platform recorded in the csv output')
    parser.add_argument('--task', default="easy", help='prompt to use (key into TASK)')
    parser.add_argument('--root', default="models", help='root directory of the models')
args = parser.parse_args()
args.model = MODELS.get(args.model, args.model)
return args
class LLM(object):
def __init__(self):
pass
def setup_session(self, model_path, ep, verbose, profile=False, fp32=False, use_iob=False):
self.use_iob = use_iob
self.ep = ep
self.profile = profile
config = AutoConfig.from_pretrained(model_path, local_files_only=True)
self.config = config
opt = ort.SessionOptions()
providers = ['CPUExecutionProvider']
if ep == "cuda":
providers = ['CUDAExecutionProvider']
elif ep == 'dml':
providers = ['DmlExecutionProvider']
elif ep == 'webgpu':
providers = ['WebGpuExecutionProvider']
if verbose:
opt.log_verbosity_level = 3
opt.log_severity_level = 1
if profile:
opt.enable_profiling = True
model_file = os.path.join(model_path, "onnx", "model_q4.onnx") if fp32 else os.path.join(model_path, "onnx", "model_q4f16.onnx")
self.sess = ort.InferenceSession(model_file, sess_options=opt, providers=providers)
self.device_id = int(self.sess.get_provider_options()[providers[0]]['device_id']) if ep in ['cuda', 'dml'] else 0
self.output_names = [o.name for o in self.sess.get_outputs()]
self.input_names = [i.name for i in self.sess.get_inputs()]
self.need_position_id = 'position_ids' in self.input_names
self.logits_idx = self.output_names.index('logits')
self.eos = self.config.eos_token_id
if not isinstance(self.eos, list):
self.eos = [self.eos]
return self.sess
def setup_feed(self):
self.feed = {}
        # not all model configs define head_dim; fall back to hidden_size / num_attention_heads
        try:
            dim_kv = self.config.head_dim
        except AttributeError:
            dim_kv = int(self.config.hidden_size / self.config.num_attention_heads)
kv_dims = [1, self.config.num_key_value_heads, 0, dim_kv]
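        # seed every past_key_values.* input with an empty (sequence length 0) tensor of the
        # matching dtype so the first run produces the initial kv-cache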
for i, input_meta in enumerate(self.sess.get_inputs()):
if input_meta.name.startswith("past"):
dtype = np.float32 if input_meta.type == "tensor(float)" else np.float16
self.feed[input_meta.name] = np.array([], dtype=dtype).reshape(kv_dims)
self.output_tokens = []
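        # with io-bindings the present.* outputs are kept on the device and re-bound as
        # past_key_values.* inputs on later steps, avoiding device-to-cpu copies of the kv-cache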
if self.use_iob:
self.iob = self.sess.io_binding()
self.iob.bind_output('logits')
for i, name in enumerate(self.output_names):
if name.startswith('present'):
self.iob.bind_output(name, device_type=self.ep, device_id=self.device_id)
for i, name in enumerate(self.input_names):
if name.startswith('past'):
v = self.feed[name]
self.iob.bind_cpu_input(name, v)
def update_kv_cache_iob(self):
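        # rebind the previous run's device-resident present.* outputs as the next run's
        # past_key_values.* inputs, then re-register the present.* outputs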
for i, name in enumerate(self.output_names):
if name.startswith('present'):
new_name = name.replace('present', 'past_key_values')
ort_output = self.iob.get_outputs()[i]
# print(f"name: {name} -> {new_name} {ort_output.device_name()}, {ort_output.shape()}, {ort_output.element_type()}")
self.iob.bind_ortvalue_input(new_name, ort_output)
self.iob.bind_output(name, device_type=self.ep, device_id=self.device_id)
def update_kv_cache(self, outputs):
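        # non-iob path: copy the present.* outputs of the last run into the feed as past_key_values.*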
for i, name in enumerate(self.output_names):
if name.startswith('present'):
new_name = name.replace('present', 'past_key_values')
self.feed[new_name] = outputs[i]
def generate(self, tokens, **kw_args):
keep_cache = kw_args.get("keep_cache", 0)
max_tokens = kw_args.get("max_tokens", 256) + len(tokens)
values = kw_args.get("values", None)
cb = kw_args.get("cb", None)
input_names = ['input_ids', 'attention_mask']
if keep_cache:
self.output_tokens.extend(tokens)
else:
self.setup_feed()
self.output_tokens = tokens.tolist()
feed = self.feed
feed['input_ids'] = tokens.reshape(1, len(tokens))
last_token = 0
seqlen = len(self.output_tokens)
if self.need_position_id:
input_names.append('position_ids')
if keep_cache:
feed['position_ids'] = np.arange(seqlen - len(tokens), seqlen, dtype=np.int64).reshape([1, len(tokens)])
else:
feed['position_ids'] = np.arange(seqlen, dtype=np.int64).reshape([1, seqlen])
first_token_time = 0
start_time = time.time()
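        # greedy decoding: run one step, take the argmax of the last logits row as the next token,
        # feed it back in, and repeat until an eos token is produced or max_tokens is reached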
while last_token not in self.eos and seqlen < max_tokens:
feed['attention_mask'] = np.empty([1, seqlen], dtype=np.int64)
feed['attention_mask'].fill(1)
if self.use_iob:
for name in input_names:
self.iob.bind_cpu_input(name, feed[name])
self.iob.bind_output('logits')
self.sess.run_with_iobinding(self.iob)
logits = self.iob.get_outputs()[0].numpy()
self.update_kv_cache_iob()
else:
outputs = self.sess.run([], feed)
logits = outputs[self.logits_idx]
self.update_kv_cache(outputs)
if not first_token_time:
first_token_time = time.time()
            if values:
                # dump the non-KV outputs of the first step to a json file and stop
                if self.use_iob:
                    outputs = [o.numpy() for o in self.iob.get_outputs()]
                j = {}
                for idx, name in enumerate(self.output_names):
                    if name.startswith('present.'):
                        continue
                    t = outputs[idx]
                    v = t.flatten().tolist()[:10000000]
                    j[name] = {
                        "dtype": str(t.dtype),
                        "shape": list(t.shape),
                        "output-avg": float(t.mean()),
                        "output-val": list(v),
                    }
                with open(values, "w") as f:
                    f.write(json.dumps(j, indent=2))
                end_time = time.time()
                return self.output_tokens, end_time - start_time, end_time - first_token_time, first_token_time - start_time, seqlen
last_token = np.argmax(logits[0, -1, :])
self.output_tokens.append(last_token)
feed['input_ids'] = np.array([last_token], dtype=np.int64).reshape(1, 1)
if self.need_position_id:
feed['position_ids'] = np.array([seqlen], dtype=np.int64).reshape(1, 1)
seqlen += 1
if cb:
cb(self.output_tokens, seqlen)
end_time = time.time()
return self.output_tokens, end_time - start_time, end_time - first_token_time, first_token_time - start_time, seqlen
def main():
args = get_args()
model_path = os.path.join(args.root, args.model['path'])
# if args.provider == "cpu":
# model_path = model_path + "-cpu"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False, local_files_only=True)
llm = LLM()
llm.setup_session(model_path, args.provider, args.verbose, args.profile, fp32=args.fp32, use_iob=args.use_iob)
llm.setup_feed()
query = TASK[args.task]
# warmup
message = [{"role": "user", "content": "hello"}]
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_dict=False, tokenize=False)
input_ids = tokenizer(prompt, return_tensors="np", return_attention_mask=False) # , return_tensor: false, padding: true, truncation: true })
input_ids = input_ids['input_ids'][0].astype(np.int64)
prompt_tokens = len(input_ids)
_ = llm.generate(input_ids, keep_cache=False, max_tokens=1)
# timed run
message = [{"role": "user", "content": query}]
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_dict=False, tokenize=False)
input_ids = tokenizer(prompt, return_tensors="np", return_attention_mask=False) # , return_tensor: false, padding: true, truncation: true })
input_ids = input_ids['input_ids'][0].astype(np.int64)
prompt_tokens = len(input_ids)
def cb(output_tokens, seqlen):
return print(tokenizer.decode(output_tokens[seqlen - 1:], skip_special_tokens=True), end='', flush=True)
cb1 = None if args.quiet else cb
output_tokens, took, gen_time, prompt_time, seqlen = llm.generate(input_ids, keep_cache=False, max_tokens=args.max_tokens, values=args.values, cb=cb1)
if args.profile:
trace_file = llm.sess.end_profiling()
shutil.move(trace_file, args.profile)
txt = tokenizer.decode(output_tokens[prompt_tokens:], skip_special_tokens=True)
if not args.quiet:
print(txt)
if gen_time == 0:
gen_time = 1
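    # e2e tps covers the full generation, prompt tps measures the prefill, and gen tps excludes
    # the time to first token (ttft)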
new_tokens_length = seqlen - prompt_tokens
e2e_tps = new_tokens_length / took
prompt_tps = prompt_tokens / prompt_time
gen_tps = new_tokens_length / gen_time
print(f'{new_tokens_length} tokens in {took:.1f}sec, e2e:{e2e_tps:.1f} tps, prompt: {prompt_tps:.1f} tps, gen: {gen_tps:.1f} tps, ttft: {prompt_time:.2f} sec')
    if args.csv:
        precision = "q4fp32" if args.fp32 else "q4fp16"
        # only write the header when creating a new file, so repeated runs can append rows
        write_header = not os.path.exists(args.csv)
        with open(args.csv, "a") as f:
            if write_header:
                f.write("name,took,token_per_sec,tokens,input_tokens,prompt_token_per_sec,gen_token_per_sec,ttft,task,precision,tag,platform,provider,generator,filter\n")
            f.write(f"{args.model['name']},{took:.1f},{e2e_tps:.1f},{new_tokens_length},{prompt_tokens},{prompt_tps:.1f},{gen_tps:.1f},{prompt_time:.2f},{args.task},{precision},{args.tag},{args.platform},{args.provider},ort-llm.py,1\n")
return 0
if __name__ == '__main__':
main()