diff --git a/Dockerfile b/Dockerfile index 5fcfcf274d..d538fd3145 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,6 +41,10 @@ RUN cp /tmp/requirements.txt /tmp/req.bak \ COPY LICENSE CHANGELOG.md CODE_OF_CONDUCT.md CONTRIBUTING.md README.md versioneer.py setup.py setup.cfg runtests.sh MANIFEST.in ./ COPY tests ./tests COPY monai ./monai + +# TODO: remove this line and torch.patch for 24.11 +RUN patch -R -d /usr/local/lib/python3.10/dist-packages/torch/onnx/ < ./monai/torch.patch + RUN BUILD_MONAI=1 FORCE_CUDA=1 python setup.py develop \ && rm -rf build __pycache__ @@ -57,4 +61,6 @@ RUN apt-get update \ # append /opt/tools to runtime path for NGC CLI to be accessible from all file system locations ENV PATH=${PATH}:/opt/tools ENV POLYGRAPHY_AUTOINSTALL_DEPS=1 + + WORKDIR /opt/monai diff --git a/monai/networks/nets/vista3d.py b/monai/networks/nets/vista3d.py index 6313b7812d..6ecb664b85 100644 --- a/monai/networks/nets/vista3d.py +++ b/monai/networks/nets/vista3d.py @@ -641,7 +641,6 @@ def forward(self, src: torch.Tensor, class_vector: torch.Tensor): # [b,1,feat] @ [1,feat,dim], batch dimension become class_embedding batch dimension. masks_embedding = class_embedding.squeeze() @ src.view(b, c, h * w * d) masks_embedding = masks_embedding.view(b, -1, h, w, d).transpose(0, 1) - return masks_embedding, class_embedding diff --git a/monai/networks/trt_compiler.py b/monai/networks/trt_compiler.py index d2d05fae22..d96b712003 100644 --- a/monai/networks/trt_compiler.py +++ b/monai/networks/trt_compiler.py @@ -18,12 +18,12 @@ from collections import OrderedDict from pathlib import Path from types import MethodType -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Tuple, Union import torch from monai.apps.utils import get_logger -from monai.networks.utils import add_casts_around_norms, convert_to_onnx, convert_to_torchscript, get_profile_shapes +from monai.networks.utils import add_casts_around_norms, convert_to_onnx, get_profile_shapes from monai.utils.module import optional_import polygraphy, polygraphy_imported = optional_import("polygraphy") @@ -125,6 +125,7 @@ def __init__(self, plan_path, logger=None): self.output_names = [] self.dtypes = [] self.cur_profile = 0 + self.input_table = {} dtype_dict = trt_to_torch_dtype_dict() for idx in range(self.engine.num_io_tensors): binding = self.engine[idx] @@ -134,6 +135,9 @@ def __init__(self, plan_path, logger=None): self.output_names.append(binding) dtype = dtype_dict[self.engine.get_tensor_dtype(binding)] self.dtypes.append(dtype) + self.logger.info( + f"Loaded TensorRT engine: {self.plan_path}.\nInputs: {self.input_names}\nOutputs: {self.output_names}" + ) def allocate_buffers(self, device): """ @@ -163,7 +167,8 @@ def set_inputs(self, feed_dict, stream): last_profile = self.cur_profile def try_set_inputs(): - for binding, t in feed_dict.items(): + for binding in self.input_names: + t = feed_dict.get(self.input_table[binding], None) if t is not None: t = t.contiguous() shape = t.shape @@ -180,7 +185,8 @@ def try_set_inputs(): raise self.cur_profile = next_profile ctx.set_optimization_profile_async(self.cur_profile, stream) - + except Exception: + raise left = ctx.infer_shapes() assert len(left) == 0 @@ -217,6 +223,74 @@ def infer(self, stream, use_cuda_graph=False): return self.tensors +def make_tensor(d): + return d if isinstance(d, torch.Tensor) else torch.tensor(d).cuda() + + +def unroll_input(input_names, input_example): + # Simulate list/tuple unrolling during ONNX export + unrolled_input = {} + for name in input_names: + val = input_example[name] + if val is not None: + if isinstance(val, list) or isinstance(val, tuple): + for i in range(len(val)): + unrolled_input[f"{name}_{i}"] = make_tensor(val[i]) + else: + unrolled_input[name] = make_tensor(val) + return unrolled_input + + +def parse_groups( + ret: List[torch.Tensor], output_lists: List[List[int]] +) -> Tuple[Union[torch.Tensor, List[torch.Tensor]], ...]: + """ + Implements parsing of 'output_lists' arg of trt_compile(). + + Args: + ret: plain list of Tensors + + output_lists: list of output group sizes: to form some Lists/Tuples out of 'ret' List, this will be a list + of group dimensions, like [[], [5], [-1]] for returning Tensor, list of 5 items and dynamic list. + Format: [[group_n] | [], ...] + [] or group_n == 0 : next output from ret is a scalar + group_n > 0 : next output from ret is a list of group_n length + group_n == -1: next output is a dynamic list. This entry can be at any + position in output_lists, but can appear only once. + Returns: + Tuple of Union[torch.Tensor, List[torch.Tensor]], according to the grouping in output_lists + + """ + groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple() + cur = 0 + for l in range(len(output_lists)): + gl = output_lists[l] + assert len(gl) == 0 or len(gl) == 1 + if len(gl) == 0 or gl[0] == 0: + groups = (*groups, ret[cur]) + cur = cur + 1 + elif gl[0] > 0: + groups = (*groups, ret[cur : cur + gl[0]]) + cur = cur + gl[0] + elif gl[0] == -1: + rev_groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple() + rcur = len(ret) + for rl in range(len(output_lists) - 1, l, -1): + rgl = output_lists[rl] + assert len(rgl) == 0 or len(rgl) == 1 + if len(rgl) == 0 or rgl[0] == 0: + rcur = rcur - 1 + rev_groups = (*rev_groups, ret[rcur]) + elif rgl[0] > 0: + rcur = rcur - rgl[0] + rev_groups = (*rev_groups, ret[rcur : rcur + rgl[0]]) + else: + raise ValueError("Two -1 lists in output") + groups = (*groups, ret[cur:rcur], *rev_groups[::-1]) + break + return groups + + class TrtCompiler: """ This class implements: @@ -233,6 +307,7 @@ def __init__( method="onnx", input_names=None, output_names=None, + output_lists=None, export_args=None, build_args=None, input_profiles=None, @@ -240,6 +315,7 @@ def __init__( use_cuda_graph=False, timestamp=None, fallback=False, + forward_override=None, logger=None, ): """ @@ -255,6 +331,8 @@ def __init__( 'torch_trt' may not work for some nets. Also AMP must be turned off for it to work. input_names: Optional list of input names. If None, will be read from the function signature. output_names: Optional list of output names. Note: If not None, patched forward() will return a dictionary. + output_lists: Optional list of output group sizes: when forward() returns Lists/Tuples, this will be a list + of their dimensions, like [[], [5], [-1]] for returning Tensor, list of 5 items and dynamic list. export_args: Optional args to pass to export method. See onnx.export() and Torch-TensorRT docs for details. build_args: Optional args to pass to TRT builder. See polygraphy.Config for details. input_profiles: Optional list of profiles for TRT builder and ONNX export. @@ -279,6 +357,7 @@ def __init__( self.method = method self.return_dict = output_names is not None self.output_names = output_names or [] + self.output_lists = output_lists or [] self.profiles = input_profiles or [] self.dynamic_batchsize = dynamic_batchsize self.export_args = export_args or {} @@ -289,11 +368,19 @@ def __init__( self.disabled = False self.logger = logger or get_logger("monai.networks.trt_compiler") + self.argspec = inspect.getfullargspec(model.forward) # Normally we read input_names from forward() but can be overridden if input_names is None: - argspec = inspect.getfullargspec(model.forward) - input_names = argspec.args[1:] + input_names = self.argspec.args[1:] + self.defaults = {} + if self.argspec.defaults is not None: + for i in range(len(self.argspec.defaults)): + d = self.argspec.defaults[-i - 1] + if d is not None: + d = make_tensor(d) + self.defaults[self.argspec.args[-i - 1]] = d + self.input_names = input_names self.old_forward = model.forward @@ -314,9 +401,18 @@ def _load_engine(self): """ try: self.engine = TRTEngine(self.plan_path, self.logger) - self.input_names = self.engine.input_names + # Make sure we have names correct + input_table = {} + for name in self.engine.input_names: + if name.startswith("__") and name not in self.input_names: + orig_name = name[2:] + else: + orig_name = name + input_table[name] = orig_name + self.engine.input_table = input_table + self.logger.info(f"Engine loaded, inputs:{self.engine.input_table}") except Exception as e: - self.logger.debug(f"Exception while loading the engine:\n{e}") + self.logger.info(f"Exception while loading the engine:\n{e}") def forward(self, model, argv, kwargs): """ @@ -329,6 +425,11 @@ def forward(self, model, argv, kwargs): Returns: Passing through wrapped module's forward() return value(s) """ + args = self.defaults + args.update(kwargs) + if len(argv) > 0: + args.update(self._inputs_to_dict(argv)) + if self.engine is None and not self.disabled: # Restore original forward for export new_forward = model.forward @@ -336,11 +437,10 @@ def forward(self, model, argv, kwargs): try: self._load_engine() if self.engine is None: - build_args = kwargs.copy() - if len(argv) > 0: - build_args.update(self._inputs_to_dict(argv)) - self._build_and_save(model, build_args) - # This will reassign input_names from the engine + build_args = args.copy() + with torch.no_grad(): + self._build_and_save(model, build_args) + # This will reassign input_names from the engine self._load_engine() assert self.engine is not None except Exception as e: @@ -355,19 +455,16 @@ def forward(self, model, argv, kwargs): del param # Call empty_cache to release GPU memory torch.cuda.empty_cache() + # restore TRT hook model.forward = new_forward # Run the engine try: - if len(argv) > 0: - kwargs.update(self._inputs_to_dict(argv)) - argv = () - if self.engine is not None: # forward_trt is not thread safe as we do not use per-thread execution contexts with lock_sm: device = torch.cuda.current_device() stream = torch.cuda.Stream(device=device) - self.engine.set_inputs(kwargs, stream.cuda_stream) + self.engine.set_inputs(unroll_input(self.input_names, args), stream.cuda_stream) self.engine.allocate_buffers(device=device) # Need this to synchronize with Torch stream stream.wait_stream(torch.cuda.current_stream()) @@ -375,11 +472,13 @@ def forward(self, model, argv, kwargs): # if output_names is not None, return dictionary if not self.return_dict: ret = list(ret.values()) - if len(ret) == 1: + if self.output_lists: + ret = parse_groups(ret, self.output_lists) + elif len(ret) == 1: ret = ret[0] return ret except Exception as e: - if model is not None: + if self.fallback: self.logger.info(f"Exception: {e}\nFalling back to Pytorch ...") else: raise e @@ -391,16 +490,11 @@ def _onnx_to_trt(self, onnx_path): """ profiles = [] - if self.profiles: - for input_profile in self.profiles: - if isinstance(input_profile, Profile): - profiles.append(input_profile) - else: - p = Profile() - for name, dims in input_profile.items(): - assert len(dims) == 3 - p.add(name, min=dims[0], opt=dims[1], max=dims[2]) - profiles.append(p) + for profile in self.profiles: + p = Profile() + for id, val in profile.items(): + p.add(id, min=val[0], opt=val[1], max=val[2]) + profiles.append(p) build_args = self.build_args.copy() build_args["tf32"] = self.precision != "fp32" @@ -425,7 +519,7 @@ def _build_and_save(self, model, input_example): return export_args = self.export_args - + engine_bytes = None add_casts_around_norms(model) if self.method == "torch_trt": @@ -435,7 +529,6 @@ def _build_and_save(self, model, input_example): elif self.precision == "bf16": enabled_precisions.append(torch.bfloat16) inputs = list(input_example.values()) - ir_model = convert_to_torchscript(model, inputs=inputs, use_trace=True) def get_torch_trt_input(input_shape, dynamic_batchsize): min_input_shape, opt_input_shape, max_input_shape = get_profile_shapes(input_shape, dynamic_batchsize) @@ -445,12 +538,7 @@ def get_torch_trt_input(input_shape, dynamic_batchsize): tt_inputs = [get_torch_trt_input(i.shape, self.dynamic_batchsize) for i in inputs] engine_bytes = torch_tensorrt.convert_method_to_trt_engine( - ir_model, - "forward", - inputs=tt_inputs, - ir="torchscript", - enabled_precisions=enabled_precisions, - **export_args, + model, "forward", arg_inputs=tt_inputs, enabled_precisions=enabled_precisions, **export_args ) else: dbs = self.dynamic_batchsize @@ -459,33 +547,47 @@ def get_torch_trt_input(input_shape, dynamic_batchsize): raise ValueError("ERROR: Both dynamic_batchsize and input_profiles set for TrtCompiler!") if len(dbs) != 3: raise ValueError("dynamic_batchsize has to have len ==3 ") - profiles = {} + profile = {} for id, val in input_example.items(): - sh = val.shape[1:] - profiles[id] = [[dbs[0], *sh], [dbs[1], *sh], [dbs[2], *sh]] - self.profiles = [profiles] - if len(self.profiles) > 0: - export_args.update({"dynamic_axes": get_dynamic_axes(self.profiles)}) + def add_profile(id, val): + sh = val.shape + if len(sh) > 0: + sh = sh[1:] + profile[id] = [[dbs[0], *sh], [dbs[1], *sh], [dbs[2], *sh]] + + if isinstance(val, list) or isinstance(val, tuple): + for i in range(len(val)): + add_profile(f"{id}_{i}", val[i]) + elif isinstance(val, torch.Tensor): + add_profile(id, val) + self.profiles = [profile] + + self.dynamic_axes = get_dynamic_axes(self.profiles) + + if len(self.dynamic_axes) > 0: + export_args.update({"dynamic_axes": self.dynamic_axes}) # Use temporary directory for easy cleanup in case of external weights with tempfile.TemporaryDirectory() as tmpdir: - onnx_path = Path(tmpdir) / "model.onnx" + unrolled_input = unroll_input(self.input_names, input_example) + onnx_path = str(Path(tmpdir) / "model.onnx") self.logger.info( - f"Exporting to {onnx_path}:\n\toutput_names={self.output_names}\n\texport args: {export_args}" + f"Exporting to {onnx_path}:\nunrolled_inputs={list(unrolled_input.keys())}\n" + + f"output_names={self.output_names}\ninput_names={self.input_names}\nexport args: {export_args}" ) convert_to_onnx( model, input_example, - filename=str(onnx_path), - input_names=self.input_names, + filename=onnx_path, + input_names=list(unrolled_input.keys()), output_names=self.output_names, **export_args, ) self.logger.info("Export to ONNX successful.") - engine_bytes = self._onnx_to_trt(str(onnx_path)) - - open(self.plan_path, "wb").write(engine_bytes) + engine_bytes = self._onnx_to_trt(onnx_path) + if engine_bytes: + open(self.plan_path, "wb").write(engine_bytes) def trt_forward(self, *argv, **kwargs): @@ -542,9 +644,11 @@ def trt_compile( args["timestamp"] = timestamp def wrap(model, path): - wrapper = TrtCompiler(model, path + ".plan", logger=logger, **args) - model._trt_compiler = wrapper - model.forward = MethodType(trt_forward, model) + if not hasattr(model, "_trt_compiler"): + model.orig_forward = model.forward + wrapper = TrtCompiler(model, path + ".plan", logger=logger, **args) + model._trt_compiler = wrapper + model.forward = MethodType(trt_forward, model) def find_sub(parent, submodule): idx = submodule.find(".") diff --git a/monai/networks/utils.py b/monai/networks/utils.py index cfad0364c3..05627f9c00 100644 --- a/monai/networks/utils.py +++ b/monai/networks/utils.py @@ -632,7 +632,6 @@ def convert_to_onnx( use_trace: bool = True, do_constant_folding: bool = True, constant_size_threshold: int = 16 * 1024 * 1024 * 1024, - dynamo=False, **kwargs, ): """ @@ -673,6 +672,9 @@ def convert_to_onnx( # let torch.onnx.export to trace the model. mode_to_export = model torch_versioned_kwargs = kwargs + if "dynamo" in kwargs and kwargs["dynamo"] and verify: + torch_versioned_kwargs["verify"] = verify + verify = False else: if not pytorch_after(1, 10): if "example_outputs" not in kwargs: @@ -695,13 +697,13 @@ def convert_to_onnx( f = temp_file.name else: f = filename - + print(f"torch_versioned_kwargs={torch_versioned_kwargs}") torch.onnx.export( mode_to_export, onnx_inputs, f=f, input_names=input_names, - output_names=output_names, + output_names=output_names or None, dynamic_axes=dynamic_axes, opset_version=opset_version, do_constant_folding=do_constant_folding, @@ -715,6 +717,9 @@ def convert_to_onnx( fold_constants(onnx_model, size_threshold=constant_size_threshold) if verify: + if isinstance(inputs, dict): + inputs = list(inputs.values()) + if device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/monai/torch.patch b/monai/torch.patch new file mode 100644 index 0000000000..e53980968b --- /dev/null +++ b/monai/torch.patch @@ -0,0 +1,9 @@ +--- /usr/local/lib/python3.10/dist-packages/torch/onnx/symbolic_opset14.py 2024-10-31 06:09:21.139938791 +0000 ++++ /usr/local/lib/python3.10/dist-packages/torch/onnx/symbolic_opset14.bak 2024-10-31 06:01:50.207462739 +0000 +@@ -150,6 +150,7 @@ + ), "is_causal and attn_mask cannot be set at the same time" + assert not enable_gqa, "conversion of scaled_dot_product_attention not implemented if enable_gqa is True" + ++ scale = symbolic_helper._maybe_get_const(scale, "f") + if symbolic_helper._is_none(scale): + scale = _attention_scale(g, query) diff --git a/monai/utils/module.py b/monai/utils/module.py index d3f2ff09f2..1ad001fc87 100644 --- a/monai/utils/module.py +++ b/monai/utils/module.py @@ -649,7 +649,7 @@ def compute_capabilities_after(major: int, minor: int = 0, current_ver_string: s current_ver_string: if None, the current system GPU CUDA compute capability will be used. Returns: - True if the current system GPU CUDA compute capability is greater than or equal to the specified version. + True if the current system GPU CUDA compute capability is greater than the specified version. """ if current_ver_string is None: cuda_available = torch.cuda.is_available() @@ -667,11 +667,11 @@ def compute_capabilities_after(major: int, minor: int = 0, current_ver_string: s ver, has_ver = optional_import("packaging.version", name="parse") if has_ver: - return ver(".".join((f"{major}", f"{minor}"))) <= ver(f"{current_ver_string}") # type: ignore + return ver(".".join((f"{major}", f"{minor}"))) < ver(f"{current_ver_string}") # type: ignore parts = f"{current_ver_string}".split("+", 1)[0].split(".", 2) while len(parts) < 2: parts += ["0"] c_major, c_minor = parts[:2] c_mn = int(c_major), int(c_minor) mn = int(major), int(minor) - return c_mn > mn + return c_mn >= mn diff --git a/tests/test_trt_compile.py b/tests/test_trt_compile.py index 49404fdbbe..9716a4a715 100644 --- a/tests/test_trt_compile.py +++ b/tests/test_trt_compile.py @@ -19,17 +19,12 @@ from monai.handlers import TrtHandler from monai.networks import trt_compile -from monai.networks.nets import UNet, cell_sam_wrapper, vista3d132 +from monai.networks.nets import cell_sam_wrapper, vista3d132 from monai.utils import min_version, optional_import -from tests.utils import ( - SkipIfAtLeastPyTorchVersion, - SkipIfBeforeComputeCapabilityVersion, - skip_if_no_cuda, - skip_if_quick, - skip_if_windows, -) +from tests.utils import SkipIfBeforeComputeCapabilityVersion, skip_if_no_cuda, skip_if_quick, skip_if_windows trt, trt_imported = optional_import("tensorrt", "10.1.0", min_version) +torch_tensorrt, torch_trt_imported = optional_import("torch_tensorrt") polygraphy, polygraphy_imported = optional_import("polygraphy") build_sam_vit_b, has_sam = optional_import("segment_anything.build_sam", name="build_sam_vit_b") @@ -37,6 +32,19 @@ TEST_CASE_2 = ["fp16"] +class ListAdd(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: list[torch.Tensor], y: torch.Tensor, z: torch.Tensor, bs: float = 0.1): + y1 = y.clone() + x1 = x.copy() + z1 = z + y + for xi in x: + y1 = y1 + xi + bs + return x1, [y1, z1], y1 + z1 + + @skip_if_windows @skip_if_no_cuda @skip_if_quick @@ -53,7 +61,7 @@ def tearDown(self): if current_device != self.gpu_device: torch.cuda.set_device(self.gpu_device) - @SkipIfAtLeastPyTorchVersion((2, 4, 1)) + @unittest.skipUnless(torch_trt_imported, "torch_tensorrt is required") def test_handler(self): from ignite.engine import Engine @@ -74,29 +82,19 @@ def test_handler(self): net1.forward(torch.tensor([[0.0, 1.0], [1.0, 2.0]], device="cuda")) self.assertIsNotNone(net1._trt_compiler.engine) - @parameterized.expand([TEST_CASE_1, TEST_CASE_2]) - def test_unet_value(self, precision): - model = UNet( - spatial_dims=3, - in_channels=1, - out_channels=2, - channels=(2, 2, 4, 8, 4), - strides=(2, 2, 2, 2), - num_res_units=2, - norm="batch", - ).cuda() + def test_lists(self): + model = ListAdd().cuda() + with torch.no_grad(), tempfile.TemporaryDirectory() as tmpdir: - model.eval() - input_example = torch.randn(2, 1, 96, 96, 96).cuda() - output_example = model(input_example) - args: dict = {"builder_optimization_level": 1} - trt_compile( - model, - f"{tmpdir}/test_unet_trt_compile", - args={"precision": precision, "build_args": args, "dynamic_batchsize": [1, 4, 8]}, - ) + args = {"output_lists": [[-1], [2], []], "export_args": {"dynamo": False, "verbose": True}} + x = torch.randn(1, 16).to("cuda") + y = torch.randn(1, 16).to("cuda") + z = torch.randn(1, 16).to("cuda") + input_example = ([x, y, z], y.clone(), z.clone()) + output_example = model(*input_example) + trt_compile(model, f"{tmpdir}/test_lists", args=args) self.assertIsNone(model._trt_compiler.engine) - trt_output = model(input_example) + trt_output = model(*input_example) # Check that lazy TRT build succeeded self.assertIsNotNone(model._trt_compiler.engine) torch.testing.assert_close(trt_output, output_example, rtol=0.01, atol=0.01) @@ -109,11 +107,7 @@ def test_cell_sam_wrapper_value(self, precision): model.eval() input_example = torch.randn(1, 3, 128, 128).to("cuda") output_example = model(input_example) - trt_compile( - model, - f"{tmpdir}/test_cell_sam_wrapper_trt_compile", - args={"precision": precision, "dynamic_batchsize": [1, 1, 1]}, - ) + trt_compile(model, f"{tmpdir}/test_cell_sam_wrapper_trt_compile", args={"precision": precision}) self.assertIsNone(model._trt_compiler.engine) trt_output = model(input_example) # Check that lazy TRT build succeeded @@ -130,7 +124,7 @@ def test_vista3d(self, precision): model = trt_compile( model, f"{tmpdir}/test_vista3d_trt_compile", - args={"precision": precision, "dynamic_batchsize": [1, 1, 1]}, + args={"precision": precision, "dynamic_batchsize": [1, 2, 4]}, submodule=["image_encoder.encoder", "class_head"], ) self.assertIsNotNone(model.image_encoder.encoder._trt_compiler) diff --git a/tests/test_version_after.py b/tests/test_version_after.py index b6cb741382..34a5054974 100644 --- a/tests/test_version_after.py +++ b/tests/test_version_after.py @@ -38,7 +38,7 @@ TEST_CASES_SM = [ # (major, minor, sm, expected) - (6, 1, "6.1", True), + (6, 1, "6.1", False), (6, 1, "6.0", False), (6, 0, "8.6", True), (7, 0, "8", True),