From d669a74623f273f74213a88b5233964d1ab3ea08 Mon Sep 17 00:00:00 2001
From: Gaz Iqbal
Date: Fri, 23 Sep 2022 15:56:42 -0700
Subject: [PATCH] Detect.py supports running against a Triton container (#9228)

* update coco128-seg comments
* Enables detect.py to use Triton for inference

  Triton Inference Server is open-source inference serving software that
  streamlines AI inferencing: https://github.com/triton-inference-server/server

  The user can now provide a "--triton-url" argument to detect.py to use a
  local or remote Triton server for inference. For example, http://localhost:8000
  will use HTTP over port 8000 and grpc://localhost:8001 will use gRPC over
  port 8001. Note that it is not necessary to specify a weights file to use Triton.

  A Triton container can be created by first exporting the YOLOv5 model to a
  Triton-supported runtime; ONNX, TorchScript and TensorRT are supported by both
  Triton and the export.py script. The exported model can then be containerized
  via the OctoML CLI. See https://github.com/octoml/octo-cli#getting-started for
  a guide.

* added triton client to requirements
* fixed support for TFSavedModels in Triton
* reverted change
* Test CoreML update
  Signed-off-by: Glenn Jocher
* Update ci-testing.yml
  Signed-off-by: Glenn Jocher
* Use pathlib
  Signed-off-by: Glenn Jocher
* Refactor DetectMultiBackend to directly accept a Triton URL as --weights http://...
  Signed-off-by: Glenn Jocher
* Deploy category
  Signed-off-by: Glenn Jocher
* Update detect.py
  Signed-off-by: Glenn Jocher
* Update common.py
  Signed-off-by: Glenn Jocher
* Update common.py
  Signed-off-by: Glenn Jocher
* Update predict.py
  Signed-off-by: Glenn Jocher
* Update predict.py
  Signed-off-by: Glenn Jocher
* Update predict.py
  Signed-off-by: Glenn Jocher
* Update triton.py
  Signed-off-by: Glenn Jocher
* Update triton.py
  Signed-off-by: Glenn Jocher
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Add printout and requirements check
* Cleanup
  Signed-off-by: Glenn Jocher
* triton fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* fixed triton model query over grpc
* Update check_requirements('tritonclient[all]')
* group imports
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Fix likely remote URL bug
* update comment
* Update is_url()
* Fix 2x download attempt on http://path/to/model.pt
  Signed-off-by: Glenn Jocher

Co-authored-by: glennjocher
Co-authored-by: Gaz Iqbal
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
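Notes: a minimal sketch of the workflow described above, assuming a Triton server
running locally on the ports mentioned in the description and a model already loaded
in its model repository:

  # Export YOLOv5 to a Triton-supported runtime, e.g. ONNX
  python export.py --weights yolov5s.pt --include onnx
  # Serve the exported model with a Triton container (e.g. built via the OctoML CLI),
  # then point detect.py at the server instead of a local weights file
  python detect.py --weights http://localhost:8000   # HTTP on port 8000
  python detect.py --weights grpc://localhost:8001   # gRPC on port 8001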
 classify/predict.py |  2 +-
 detect.py           |  8 ++---
 models/common.py    | 44 +++++++++++++++--------
 requirements.txt    |  3 ++
 segment/predict.py  |  2 +-
 utils/downloads.py  |  4 +--
 utils/triton.py     | 85 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 126 insertions(+), 22 deletions(-)
 create mode 100644 utils/triton.py

diff --git a/classify/predict.py b/classify/predict.py
index 011e7b83f09b..d3bec8eea7ba 100644
--- a/classify/predict.py
+++ b/classify/predict.py
@@ -104,7 +104,7 @@ def run(
     seen, windows, dt = 0, [], (Profile(), Profile(), Profile())
     for path, im, im0s, vid_cap, s in dataset:
         with dt[0]:
-            im = torch.Tensor(im).to(device)
+            im = torch.Tensor(im).to(model.device)
             im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
             if len(im.shape) == 3:
                 im = im[None]  # expand for batch dim
diff --git a/detect.py b/detect.py
index 9036b26263e5..e442ed75f4c7 100644
--- a/detect.py
+++ b/detect.py
@@ -49,7 +49,7 @@
 
 @smart_inference_mode()
 def run(
-        weights=ROOT / 'yolov5s.pt',  # model.pt path(s)
+        weights=ROOT / 'yolov5s.pt',  # model path or triton URL
         source=ROOT / 'data/images',  # file/dir/URL/glob/screen/0(webcam)
         data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
         imgsz=(640, 640),  # inference size (height, width)
@@ -108,11 +108,11 @@ def run(
     vid_path, vid_writer = [None] * bs, [None] * bs
 
     # Run inference
-    model.warmup(imgsz=(1 if pt else bs, 3, *imgsz))  # warmup
+    model.warmup(imgsz=(1 if pt or model.triton else bs, 3, *imgsz))  # warmup
     seen, windows, dt = 0, [], (Profile(), Profile(), Profile())
     for path, im, im0s, vid_cap, s in dataset:
         with dt[0]:
-            im = torch.from_numpy(im).to(device)
+            im = torch.from_numpy(im).to(model.device)
             im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
             im /= 255  # 0 - 255 to 0.0 - 1.0
             if len(im.shape) == 3:
@@ -214,7 +214,7 @@ def run(
 
 def parse_opt():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'yolov5s.pt', help='model path(s)')
+    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'yolov5s.pt', help='model path or triton URL')
     parser.add_argument('--source', type=str, default=ROOT / 'data/images', help='file/dir/URL/glob/screen/0(webcam)')
     parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='(optional) dataset.yaml path')
     parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640], help='inference size h,w')
diff --git a/models/common.py b/models/common.py
index fac95a82fdb9..177704849d3d 100644
--- a/models/common.py
+++ b/models/common.py
@@ -10,6 +10,7 @@
 from collections import OrderedDict, namedtuple
 from copy import copy
 from pathlib import Path
+from urllib.parse import urlparse
 
 import cv2
 import numpy as np
@@ -327,11 +328,13 @@ def __init__(self, weights='yolov5s.pt', device=torch.device('cpu'), dnn=False,
 
         super().__init__()
         w = str(weights[0] if isinstance(weights, list) else weights)
-        pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle = self._model_type(w)  # type
-        w = attempt_download(w)  # download if not local
+        pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, triton = self._model_type(w)
         fp16 &= pt or jit or onnx or engine  # FP16
+        nhwc = coreml or saved_model or pb or tflite or edgetpu  # BHWC formats (vs torch BCWH)
         stride = 32  # default stride
         cuda = torch.cuda.is_available() and device.type != 'cpu'  # use CUDA
+        if not (pt or triton):
+            w = attempt_download(w)  # download if not local
 
         if pt:  # PyTorch
             model = attempt_load(weights if isinstance(weights, list) else w, device=device, inplace=True, fuse=fuse)
@@ -342,7 +345,7 @@ def __init__(self, weights='yolov5s.pt', device=torch.device('cpu'), dnn=False,
         elif jit:  # TorchScript
             LOGGER.info(f'Loading {w} for TorchScript inference...')
             extra_files = {'config.txt': ''}  # model metadata
-            model = torch.jit.load(w, _extra_files=extra_files)
+            model = torch.jit.load(w, _extra_files=extra_files, map_location=device)
             model.half() if fp16 else model.float()
             if extra_files['config.txt']:  # load metadata dict
                 d = json.loads(extra_files['config.txt'],
@@ -472,6 +475,12 @@ def gd_outputs(gd):
             predictor = pdi.create_predictor(config)
             input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
             output_names = predictor.get_output_names()
+        elif triton:  # NVIDIA Triton Inference Server
+            LOGGER.info(f'Using {w} as Triton Inference Server...')
+            check_requirements('tritonclient[all]')
+            from utils.triton import TritonRemoteModel
+            model = TritonRemoteModel(url=w)
+            nhwc = model.runtime.startswith("tensorflow")
         else:
             raise NotImplementedError(f'ERROR: {w} is not a supported format')
 
@@ -488,6 +497,8 @@ def forward(self, im, augment=False, visualize=False):
         b, ch, h, w = im.shape  # batch, channel, height, width
         if self.fp16 and im.dtype != torch.float16:
             im = im.half()  # to FP16
+        if self.nhwc:
+            im = im.permute(0, 2, 3, 1)  # torch BCHW to numpy BHWC shape(1,320,192,3)
 
         if self.pt:  # PyTorch
             y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im)
@@ -517,7 +528,7 @@ def forward(self, im, augment=False, visualize=False):
             self.context.execute_v2(list(self.binding_addrs.values()))
             y = [self.bindings[x].data for x in sorted(self.output_names)]
         elif self.coreml:  # CoreML
-            im = im.permute(0, 2, 3, 1).cpu().numpy()  # torch BCHW to numpy BHWC shape(1,320,192,3)
+            im = im.cpu().numpy()
             im = Image.fromarray((im[0] * 255).astype('uint8'))
             # im = im.resize((192, 320), Image.ANTIALIAS)
             y = self.model.predict({'image': im})  # coordinates are xywh normalized
@@ -532,8 +543,10 @@ def forward(self, im, augment=False, visualize=False):
             self.input_handle.copy_from_cpu(im)
             self.predictor.run()
             y = [self.predictor.get_output_handle(x).copy_to_cpu() for x in self.output_names]
+        elif self.triton:  # NVIDIA Triton Inference Server
+            y = self.model(im)
         else:  # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
-            im = im.permute(0, 2, 3, 1).cpu().numpy()  # torch BCHW to numpy BHWC shape(1,320,192,3)
+            im = im.cpu().numpy()
             if self.saved_model:  # SavedModel
                 y = self.model(im, training=False) if self.keras else self.model(im)
             elif self.pb:  # GraphDef
@@ -566,8 +579,8 @@ def from_numpy(self, x):
 
     def warmup(self, imgsz=(1, 3, 640, 640)):
         # Warmup model by running inference once
-        warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb
-        if any(warmup_types) and self.device.type != 'cpu':
+        warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton
+        if any(warmup_types) and (self.device.type != 'cpu' or self.triton):
             im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device)  # input
             for _ in range(2 if self.jit else 1):  #
                 self.forward(im)  # warmup
@@ -575,14 +588,17 @@ def warmup(self, imgsz=(1, 3, 640, 640)):
     @staticmethod
     def _model_type(p='path/to/model.pt'):
         # Return model type from model path, i.e. path='path/to/model.onnx' -> type=onnx
+        # types = [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle]
         from export import export_formats
-        sf = list(export_formats().Suffix) + ['.xml']  # export suffixes
-        check_suffix(p, sf)  # checks
-        p = Path(p).name  # eliminate trailing separators
-        pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, xml2 = (s in p for s in sf)
-        xml |= xml2  # *_openvino_model or *.xml
-        tflite &= not edgetpu  # *.tflite
-        return pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle
+        from utils.downloads import is_url
+        sf = list(export_formats().Suffix)  # export suffixes
+        if not is_url(p, check=False):
+            check_suffix(p, sf)  # checks
+        url = urlparse(p)  # if url may be Triton inference server
+        types = [s in Path(p).name for s in sf]
+        types[8] &= not types[9]  # tflite &= not edgetpu
+        triton = not any(types) and all([any(s in url.scheme for s in ["http", "grpc"]), url.netloc])
+        return types + [triton]
 
     @staticmethod
     def _load_metadata(f=Path('path/to/meta.yaml')):
diff --git a/requirements.txt b/requirements.txt
index 914da54e73fc..4d6ec3509efa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,6 +34,9 @@ seaborn>=0.11.0
 # tensorflowjs>=3.9.0  # TF.js export
 # openvino-dev  # OpenVINO export
 
+# Deploy --------------------------------------
+# tritonclient[all]~=2.24.0
+
 # Extras --------------------------------------
 ipython  # interactive notebook
 psutil  # system utilization
diff --git a/segment/predict.py b/segment/predict.py
index 43cebc706371..2e794c342de1 100644
--- a/segment/predict.py
+++ b/segment/predict.py
@@ -114,7 +114,7 @@ def run(
     seen, windows, dt = 0, [], (Profile(), Profile(), Profile())
     for path, im, im0s, vid_cap, s in dataset:
         with dt[0]:
-            im = torch.from_numpy(im).to(device)
+            im = torch.from_numpy(im).to(model.device)
             im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
             im /= 255  # 0 - 255 to 0.0 - 1.0
             if len(im.shape) == 3:
diff --git a/utils/downloads.py b/utils/downloads.py
index bd495068522d..433de84b51ca 100644
--- a/utils/downloads.py
+++ b/utils/downloads.py
@@ -16,13 +16,13 @@
 import torch
 
 
-def is_url(url, check_exists=True):
+def is_url(url, check=True):
     # Check if string is URL and check if URL exists
     try:
         url = str(url)
         result = urllib.parse.urlparse(url)
         assert all([result.scheme, result.netloc, result.path])  # check if is url
-        return (urllib.request.urlopen(url).getcode() == 200) if check_exists else True  # check if exists online
+        return (urllib.request.urlopen(url).getcode() == 200) if check else True  # check if exists online
     except (AssertionError, urllib.request.HTTPError):
         return False
 
diff --git a/utils/triton.py b/utils/triton.py
new file mode 100644
index 000000000000..a94ef0ad197d
--- /dev/null
+++ b/utils/triton.py
@@ -0,0 +1,85 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+""" Utils to interact with the Triton Inference Server
+"""
+
+import typing
+from urllib.parse import urlparse
+
+import torch
+
+
+class TritonRemoteModel:
+    """ A wrapper over a model served by the Triton Inference Server. It can
+    be configured to communicate over GRPC or HTTP. It accepts Torch Tensors
+    as input and returns them as outputs.
+    """
+
+    def __init__(self, url: str):
+        """
+        Keyword arguments:
+        url: Fully qualified address of the Triton server - for e.g. grpc://localhost:8000
+        """
+
+        parsed_url = urlparse(url)
+        if parsed_url.scheme == "grpc":
+            from tritonclient.grpc import InferenceServerClient, InferInput
+
+            self.client = InferenceServerClient(parsed_url.netloc)  # Triton GRPC client
+            model_repository = self.client.get_model_repository_index()
+            self.model_name = model_repository.models[0].name
+            self.metadata = self.client.get_model_metadata(self.model_name, as_json=True)
+
+            def create_input_placeholders() -> typing.List[InferInput]:
+                return [
+                    InferInput(i['name'], [int(s) for s in i["shape"]], i['datatype']) for i in self.metadata['inputs']]
+
+        else:
+            from tritonclient.http import InferenceServerClient, InferInput
+
+            self.client = InferenceServerClient(parsed_url.netloc)  # Triton HTTP client
+            model_repository = self.client.get_model_repository_index()
+            self.model_name = model_repository[0]['name']
+            self.metadata = self.client.get_model_metadata(self.model_name)
+
+            def create_input_placeholders() -> typing.List[InferInput]:
+                return [
+                    InferInput(i['name'], [int(s) for s in i["shape"]], i['datatype']) for i in self.metadata['inputs']]
+
+        self._create_input_placeholders_fn = create_input_placeholders
+
+    @property
+    def runtime(self):
+        """Returns the model runtime"""
+        return self.metadata.get("backend", self.metadata.get("platform"))
+
+    def __call__(self, *args, **kwargs) -> typing.Union[torch.Tensor, typing.Tuple[torch.Tensor, ...]]:
+        """ Invokes the model. Parameters can be provided via args or kwargs.
+        args, if provided, are assumed to match the order of inputs of the model.
+        kwargs are matched with the model input names.
+        """
+        inputs = self._create_inputs(*args, **kwargs)
+        response = self.client.infer(model_name=self.model_name, inputs=inputs)
+        result = []
+        for output in self.metadata['outputs']:
+            tensor = torch.as_tensor(response.as_numpy(output['name']))
+            result.append(tensor)
+        return result[0] if len(result) == 1 else result
+
+    def _create_inputs(self, *args, **kwargs):
+        args_len, kwargs_len = len(args), len(kwargs)
+        if not args_len and not kwargs_len:
+            raise RuntimeError("No inputs provided.")
+        if args_len and kwargs_len:
+            raise RuntimeError("Cannot specify args and kwargs at the same time")
+
+        placeholders = self._create_input_placeholders_fn()
+        if args_len:
+            if args_len != len(placeholders):
+                raise RuntimeError(f"Expected {len(placeholders)} inputs, got {args_len}.")
+            for input, value in zip(placeholders, args):
+                input.set_data_from_numpy(value.cpu().numpy())
+        else:
+            for input in placeholders:
+                value = kwargs[input.name]
+                input.set_data_from_numpy(value.cpu().numpy())
+        return placeholders
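
A minimal usage sketch for the new TritonRemoteModel wrapper (illustrative only: the
URL, port and input shape are assumptions, and a model must already be loaded in the
Triton model repository):

  import torch
  from utils.triton import TritonRemoteModel

  model = TritonRemoteModel(url='http://localhost:8000')  # or 'grpc://localhost:8001'
  print(model.runtime)              # backend/platform reported by Triton, e.g. 'onnxruntime'
  im = torch.zeros(1, 3, 640, 640)  # dummy input; shape and dtype must match the served model
  y = model(im)                     # torch.Tensor, or a list of Tensors for multi-output models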