fix fp16 engine building cycle #5

Merged (1 commit, Sep 14, 2023)
README.md (9 changes: 5 additions & 4 deletions)
@@ -71,7 +71,8 @@ TensorRT sometimes builds an FP32 engine even if we pass `--fp16` flag to `trtexec`
 To make sure that the engine is correct, we compare its size with the reference size: FP32 engine size or ONNX model size if `--compare-with-onnx` is passed.
 If the size of the built engine is too large, then it is incorrect, and we automatically rebuild it.

-The measurement script uses `1.5` as a default threshold on `reference size / current engine size` value.
-New engines will be generated until `reference size / current engine size` becomes higher than the threshold.
-This value can be changed using `--threshold` option.
-If you want to know the actual size ratio, use `--verbosity-level=1`.
+The measurement script uses `1.5` as a default threshold on `reference size / current engine size` value (this value can be changed using `--threshold` option).
+Latency server tries to build a correct engine for `--n-trials` times (20 by default) until `reference size / current engine size` becomes higher than the threshold.
+
+If `trtexec` has failed to create a correct engine for `n_trials` times, latency server returns `None` as model latency.
+If you want to know the actual `reference size / current engine size` ratio, use `--verbosity-level=1`.
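In code terms, the rebuild policy described above is just a bounded retry on the size ratio. Here is a minimal sketch of that policy (illustrative only; `build_engine` and `engine_size_from` are hypothetical stand-ins, and the real implementation is the `tools/server.py` change below):

```python
from typing import Callable, Optional


def build_fp16_engine(
    build_engine: Callable[[], str],  # runs `trtexec ... --fp16`, returns its stdout
    engine_size_from: Callable[[str], float],  # parses the engine size out of that stdout
    reference_size: float,  # FP32 engine size, or ONNX size with --compare-with-onnx
    threshold: float = 1.5,
    n_trials: int = 20,
) -> Optional[str]:
    """Rebuild until reference_size / engine_size exceeds the threshold."""
    for _ in range(n_trials):
        stdout = build_engine()
        if reference_size / engine_size_from(stdout) > threshold:
            return stdout  # engine is small enough to be a genuine FP16 build
    return None  # every attempt produced an FP32-sized engine
```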
tools/server.py (81 changes: 54 additions & 27 deletions)
@@ -21,55 +21,66 @@ def __init__(
         compare_with_onnx: bool = False,
         verbosity_level: int = 0,
         threshold: float = 1.5,
+        n_trials: int = 20,
     ):
         """
         Server ctor.

         Parameters
         ----------
         host : str
-            Host name or IP address. Default value is '0.0.0.0'.
+            Host name or IP address. Default value is ``0.0.0.0``.
         port : int
             Port. Default value is 15003.
         trtexec_path : str
             Path to trtexec binaries
         fp32 : bool
-            Whether to use a FP32 engine for inference instead of FP16.
+            Whether to use an FP32 engine for inference instead of an FP16 one.
             Default is False.
         warmup : int
             Run for N milliseconds to warmup before measuring performance.
             Default is 10000.
         iterations : int
-            Run at least 'iterations' inference iterations for latency measurement
+            Run at least ``iterations`` inference iterations for latency measurement
             Default is 10000.
         avgruns : int
-            Report performance measurements averaged over 'avgruns' consecutive iterations
+            Report performance measurements averaged over ``avgruns`` consecutive iterations
             Default is 100.
         compare_with_onnx : bool
             Whether to compare FP16 engine size with ONNX model size. If false, compares with FP32 engine size.
-            Not used when 'fp32' is False.
+            Not used when ``fp32`` is False.
             Default is False.
         threshold : float
-            Ratio of reference size (i.e. ONNX model size) to engine size to make sure we have a FP16 engine.
-            Not used when 'fp32' is False.
+            Ratio of reference size (i.e. ONNX model size) to engine size to make sure we have an FP16 engine.
+            Not used when ``fp32`` is False.
             Default is 1.5
         verbosity_level : int
             Verbosity level.
             Choices: 0 - stderr, 1 - show measurement results, 2 - both stderr and measurement results.
             Default is 0.
+        n_trials : int
+            Try to build an FP16 engine for ``n_trials`` times before returning ``None``.
+            Not used when ``fp32 == False``.
+            Default is 20.

         """
         super().__init__(host=host, port=port)
         self.trtexec_path = trtexec_path

-        self.fp32 = fp32
+        self._fp32 = fp32
+        self._fp16 = not self._fp32
         self.warmup = warmup
         self.iterations = iterations
         self.avgruns = avgruns
         self.compare_with_onnx = compare_with_onnx
         self.verbosity_level = verbosity_level
         self.threshold = threshold

+        if n_trials < 1:
+            raise ValueError(f"`n_trials` should be a positive number, got {n_trials}")
+
+        self.n_trials = n_trials
+
     @staticmethod
     def floats_from_str(str_data):
         floats = [float(x) for x in str_data.split(" ") if x.replace(".", "").isdigit()]
@@ -110,7 +121,7 @@ def get_engine_size(self, process_stdout, eps=1e-7):
         return engine_size + eps

     def run_subprocess(self, command):
-        print(f"Running '{command}'")
+        print(f"Running `{command}`")
         pipe = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, check=True)
         process_stdout, process_stderr = pipe.stdout.decode("utf-8"), pipe.stderr.decode("utf-8")

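A side note on the hunk above: because `run_subprocess` calls `subprocess.run` with `check=True`, a failing `trtexec` invocation raises instead of returning unusable output. A standalone sketch of that behaviour (not code from this PR; the command shown is hypothetical):

```python
import subprocess

try:
    # check=True makes subprocess.run raise CalledProcessError on a
    # non-zero exit code, mirroring run_subprocess in tools/server.py
    pipe = subprocess.run(
        "trtexec --onnx=missing.onnx",  # hypothetical failing command
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
        check=True,
    )
except subprocess.CalledProcessError as err:
    print(err.stderr.decode("utf-8"))  # surface trtexec's error output
```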
@@ -145,13 +156,13 @@ def measure_latency(
         if _has_quant_dequant_node(model):
             print("Found quant-dequant nodes, INT8 kernels usage is enabled.")

-            if self.fp32:
-                command_int8 = f"{base_command} --int8 {run_options}"
-            else:  # FP16
+            if self._fp16:
                 command_int8 = f"{base_command} --int8 --fp16 {run_options}"
+            else:  # FP32
+                command_int8 = f"{base_command} --int8 {run_options}"

             command_stdout, command_stderr = self.run_subprocess(command_int8)
-        elif self.fp32:
+        elif self._fp32:
             command_fp32 = f"{base_command} {run_options}"
             command_stdout, command_stderr = self.run_subprocess(command_fp32)
         else:  # FP16
@@ -167,20 +178,28 @@
             fp32_engine_size = self.get_engine_size(stdout_fp32)
             reference_size = fp32_engine_size

-            engine_size = reference_size
-
             # Sometimes trtexec creates engine in FP32 even though we demand FP16.
             # To make sure we get FP16 engine, we compare its size with reference size.
-            while reference_size / engine_size < self.threshold:  # Rebuild an engine if it is too large.
+            is_fp16_engine = False
+
+            for n_trial in range(self.n_trials):  # Rebuild an engine if it is too large.
                 command_stdout, command_stderr = self.run_subprocess(command_fp16)
                 engine_size = self.get_engine_size(command_stdout)
+                engine_ratio = reference_size / engine_size

                 if self.verbosity_level in (1, 2):
                     print(
-                        f"Reference_size / engine_size = {reference_size / engine_size}, threshold = {self.threshold}"
+                        f"Trial {n_trial + 1}, Reference_size / engine_size = {engine_ratio}, threshold = {self.threshold}"
                     )

-        result = self.parse_trtexec_stdout(command_stdout)
+                if engine_ratio > self.threshold:
+                    is_fp16_engine = True
+                    break
+
+        if self._fp16 and not is_fp16_engine:
+            result = {"latency": None}
+        else:
+            result = self.parse_trtexec_stdout(command_stdout)

         if self.verbosity_level in (1, 2):  # show measurements result
             print("=" * 10, " Results ", "=" * 10)
@@ -192,45 +211,52 @@ def parse():
 def parse():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--trtexec-path", type=str, default="/usr/src/tensorrt/bin/trtexec", help="Path to trtexec binaries"
+        "--trtexec-path", type=str, default="/usr/src/tensorrt/bin/trtexec", help="Path to trtexec binaries."
     )
     parser.add_argument(
-        "--host", type=str, default="0.0.0.0", help="Host name or IP address. Default value is '0.0.0.0'"
+        "--host", type=str, default="0.0.0.0", help="Host name or IP address. Default value is `0.0.0.0`."
     )
     parser.add_argument("--port", type=int, default=15003, help="Server port. Default is 15003")
     parser.add_argument(
-        "--fp32", action="store_true", help="Whether to build a FP32 engine. Builds FP16 engine by default."
+        "--fp32", action="store_true", help="Whether to build an FP32 engine. Builds FP16 engine by default."
     )
     parser.add_argument(
         "--warmup",
         type=int,
         default=10000,
-        help="Run for 'warmup' milliseconds to warmup before measuring performance. Default is 10000.",
+        help="Run for `warmup` milliseconds to warmup before measuring performance. Default is 10000.",
     )
     parser.add_argument(
         "--iterations",
         type=int,
         default=10000,
-        help="Run at least 'iterations' inference iterations for latency measurement. Default is 10000.",
+        help="Run at least `iterations` inference iterations for latency measurement. Default is 10000.",
     )
     parser.add_argument(
         "--avgruns",
         type=int,
         default=100,
-        help="Report performance measurements averaged over 'avgruns' consecutive iterations. Default is 100.",
+        help="Report performance measurements averaged over `avgruns` consecutive iterations. Default is 100.",
     )
     parser.add_argument(
         "--compare-with-onnx",
         action="store_true",
         help="Use ONNX model size as a reference to compare with FP16 engine size. "
-        "By default, uses FP32 engine size as a reference. Not used with '--fp32'",
+        "By default, uses FP32 engine size as a reference. Not used with `--fp32`.",
     )
     parser.add_argument(
         "--threshold",
         type=float,
         default=1.5,
-        help="Ratio of reference size (i.e. ONNX model size) to engine size to make sure we have a FP16 engine. "
-        "Not used with '--fp32'. Default is 1.5",
+        help="Ratio of reference size (i.e. ONNX model size) to engine size to make sure we have an FP16 engine. "
+        "Not used with `--fp32`. Default is 1.5.",
     )
+    parser.add_argument(
+        "--n-trials",
+        type=int,
+        default=20,
+        help="Try to build an FP16 engine for `n_trials` times before returning `None`. "
+        "Not used with `--fp32`. Default is 20.",
+    )
     parser.add_argument(
         "--verbosity-level",
@@ -258,6 +284,7 @@ def main():
         compare_with_onnx=args.compare_with_onnx,
         verbosity_level=args.verbosity_level,
         threshold=args.threshold,
+        n_trials=args.n_trials,
     )
     server.run()

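Taken together, the change also alters the server's failure contract: when every FP16 build attempt comes out FP32-sized, the `result` dict built in `measure_latency` is `{"latency": None}` rather than a measurement taken on the wrong engine. A hedged sketch of how a caller might handle that (the dict's other keys are not visible in this diff):

```python
from typing import Optional


def report_latency(result: dict) -> Optional[float]:
    """Handle the `result` dict built in measure_latency above.

    After this PR, `result == {"latency": None}` when all `n_trials`
    FP16 builds came out FP32-sized.
    """
    latency = result.get("latency")
    if latency is None:
        print("trtexec never produced a genuine FP16 engine; no latency to report")
    else:
        print(f"measured latency: {latency}")
    return latency


report_latency({"latency": None})  # prints the failure message
report_latency({"latency": 1.23})  # prints: measured latency: 1.23
```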