diff --git a/perception/autoware_image_projection_based_fusion/include/autoware/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp b/perception/autoware_image_projection_based_fusion/include/autoware/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp index a89ee3d02cd8b..a1d12bcb56384 100644 --- a/perception/autoware_image_projection_based_fusion/include/autoware/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp +++ b/perception/autoware_image_projection_based_fusion/include/autoware/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -29,8 +30,8 @@ class PointPaintingTRT : public autoware::lidar_centerpoint::CenterPointTRT using autoware::lidar_centerpoint::CenterPointTRT::CenterPointTRT; explicit PointPaintingTRT( - const autoware::lidar_centerpoint::NetworkParam & encoder_param, - const autoware::lidar_centerpoint::NetworkParam & head_param, + const autoware::tensorrt_common::TrtCommonConfig & encoder_param, + const autoware::tensorrt_common::TrtCommonConfig & head_param, const autoware::lidar_centerpoint::DensificationParam & densification_param, const autoware::lidar_centerpoint::CenterPointConfig & config); diff --git a/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/node.cpp b/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/node.cpp index bcd62383319c1..ef03cf6c3a7cf 100644 --- a/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/node.cpp +++ b/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/node.cpp @@ -176,10 +176,10 @@ PointPaintingFusionNode::PointPaintingFusionNode(const rclcpp::NodeOptions & opt iou_bev_nms_.setParameters(p); } - autoware::lidar_centerpoint::NetworkParam encoder_param( - encoder_onnx_path, encoder_engine_path, trt_precision); - autoware::lidar_centerpoint::NetworkParam head_param( - head_onnx_path, head_engine_path, trt_precision); + autoware::tensorrt_common::TrtCommonConfig encoder_param( + encoder_onnx_path, trt_precision, encoder_engine_path); + autoware::tensorrt_common::TrtCommonConfig head_param( + head_onnx_path, trt_precision, head_engine_path); autoware::lidar_centerpoint::DensificationParam densification_param( densification_world_frame_id, densification_num_past_frames); autoware::lidar_centerpoint::CenterPointConfig config( diff --git a/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp b/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp index cc3eb17cc32ab..1fe860e68a967 100644 --- a/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp +++ b/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp @@ -27,8 +27,8 @@ namespace autoware::image_projection_based_fusion { PointPaintingTRT::PointPaintingTRT( - const autoware::lidar_centerpoint::NetworkParam & encoder_param, - const autoware::lidar_centerpoint::NetworkParam & head_param, + const autoware::tensorrt_common::TrtCommonConfig & encoder_param, + const autoware::tensorrt_common::TrtCommonConfig & head_param, const autoware::lidar_centerpoint::DensificationParam & densification_param, const autoware::lidar_centerpoint::CenterPointConfig & config) : autoware::lidar_centerpoint::CenterPointTRT( diff --git 
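Note on the constructor change above: the removed lidar_centerpoint NetworkParam took its arguments as (onnx_path, engine_path, trt_precision), whereas tensorrt_common::TrtCommonConfig is constructed as (onnx_path, precision, engine_path), so precision and engine path swap places at every call site. A minimal sketch of the new call in node.cpp, assuming TrtCommonConfig's remaining parameters keep their defaults:

// Old: autoware::lidar_centerpoint::NetworkParam(onnx_path, engine_path, trt_precision)
// New: the precision is now the second argument, the engine path the third.
autoware::tensorrt_common::TrtCommonConfig encoder_param(
  encoder_onnx_path, trt_precision, encoder_engine_path);
autoware::tensorrt_common::TrtCommonConfig head_param(
  head_onnx_path, trt_precision, head_engine_path);
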
a/perception/autoware_lidar_apollo_instance_segmentation/src/detector.cpp b/perception/autoware_lidar_apollo_instance_segmentation/src/detector.cpp index 60aa09c8e16d7..2e3a7a2243399 100644 --- a/perception/autoware_lidar_apollo_instance_segmentation/src/detector.cpp +++ b/perception/autoware_lidar_apollo_instance_segmentation/src/detector.cpp @@ -18,7 +18,6 @@ #include -#include #include #include @@ -51,12 +50,9 @@ LidarApolloInstanceSegmentation::LidarApolloInstanceSegmentation(rclcpp::Node * const auto precision = node_->declare_parameter("precision", "fp32"); trt_common_ = std::make_unique( - onnx_file, precision, nullptr, autoware::tensorrt_common::BatchConfig{1, 1, 1}, 1 << 30); - trt_common_->setup(); - - if (!trt_common_->isInitialized()) { - RCLCPP_ERROR_STREAM(node_->get_logger(), "failed to create tensorrt engine file."); - return; + tensorrt_common::TrtCommonConfig(onnx_file, precision)); + if (!trt_common_->setup()) { + throw std::runtime_error("Failed to setup TensorRT"); } if (node_->declare_parameter("build_only", false)) { @@ -65,11 +61,11 @@ LidarApolloInstanceSegmentation::LidarApolloInstanceSegmentation(rclcpp::Node * } // GPU memory allocation - const auto input_dims = trt_common_->getBindingDimensions(0); + const auto input_dims = trt_common_->getTensorShape(0); const auto input_size = std::accumulate(input_dims.d + 1, input_dims.d + input_dims.nbDims, 1, std::multiplies()); input_d_ = autoware::cuda_utils::make_unique(input_size); - const auto output_dims = trt_common_->getBindingDimensions(1); + const auto output_dims = trt_common_->getTensorShape(1); output_size_ = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); output_d_ = autoware::cuda_utils::make_unique(output_size_); @@ -127,10 +123,6 @@ bool LidarApolloInstanceSegmentation::detectDynamicObjects( const sensor_msgs::msg::PointCloud2 & input, tier4_perception_msgs::msg::DetectedObjectsWithFeature & output) { - if (!trt_common_->isInitialized()) { - return false; - } - // move up pointcloud z_offset in z axis sensor_msgs::msg::PointCloud2 transformed_cloud; transformCloud(input, transformed_cloud, z_offset_); @@ -169,7 +161,10 @@ bool LidarApolloInstanceSegmentation::detectDynamicObjects( std::vector buffers = {input_d_.get(), output_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); CHECK_CUDA_ERROR(cudaMemcpyAsync( output_h_.get(), output_d_.get(), sizeof(float) * output_size_, cudaMemcpyDeviceToHost, diff --git a/perception/autoware_lidar_centerpoint/CMakeLists.txt b/perception/autoware_lidar_centerpoint/CMakeLists.txt index 88244b3153349..322907c67c6ae 100644 --- a/perception/autoware_lidar_centerpoint/CMakeLists.txt +++ b/perception/autoware_lidar_centerpoint/CMakeLists.txt @@ -84,8 +84,6 @@ if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL) lib/detection_class_remapper.cpp lib/utils.cpp lib/ros_utils.cpp - lib/network/network_trt.cpp - lib/network/tensorrt_wrapper.cpp lib/postprocess/non_maximum_suppression.cpp lib/preprocess/pointcloud_densification.cpp lib/preprocess/voxel_generator.cpp diff --git a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/centerpoint_trt.hpp b/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/centerpoint_trt.hpp index 671ff54f8ce0d..4b531e34877d6 100644 --- a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/centerpoint_trt.hpp +++ 
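The detector.cpp hunks above replace the old isInitialized()/getBindingDimensions()/enqueueV2() flow with the updated TrtCommon interface: setup() now reports failure through its return value, tensor shapes come from getTensorShape(), and execution binds device buffers via setTensorsAddresses() before enqueueV3(). A condensed sketch of the new flow, assuming the buffer list is a std::vector<void *> ordered to match the engine's I/O tensors:

// Engine setup: failure now throws instead of being checked later with isInitialized().
if (!trt_common_->setup()) {
  throw std::runtime_error("Failed to setup TensorRT");
}

// Per-frame inference: bind device buffers in engine I/O order, then enqueue on the stream.
std::vector<void *> buffers = {input_d_.get(), output_d_.get()};
if (!trt_common_->setTensorsAddresses(buffers)) {
  return false;
}
trt_common_->enqueueV3(*stream_);
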
b/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/centerpoint_trt.hpp @@ -16,12 +16,13 @@ #define AUTOWARE__LIDAR_CENTERPOINT__CENTERPOINT_TRT_HPP_ #include "autoware/lidar_centerpoint/cuda_utils.hpp" -#include "autoware/lidar_centerpoint/network/network_trt.hpp" #include "autoware/lidar_centerpoint/postprocess/postprocess_kernel.hpp" #include "autoware/lidar_centerpoint/preprocess/voxel_generator.hpp" #include "pcl/point_cloud.h" #include "pcl/point_types.h" +#include + #include "sensor_msgs/msg/point_cloud2.hpp" #include @@ -31,31 +32,14 @@ namespace autoware::lidar_centerpoint { -class NetworkParam -{ -public: - NetworkParam(std::string onnx_path, std::string engine_path, std::string trt_precision) - : onnx_path_(std::move(onnx_path)), - engine_path_(std::move(engine_path)), - trt_precision_(std::move(trt_precision)) - { - } - - std::string onnx_path() const { return onnx_path_; } - std::string engine_path() const { return engine_path_; } - std::string trt_precision() const { return trt_precision_; } - -private: - std::string onnx_path_; - std::string engine_path_; - std::string trt_precision_; -}; +using autoware::tensorrt_common::ProfileDims; +using autoware::tensorrt_common::TrtCommonConfig; class CenterPointTRT { public: explicit CenterPointTRT( - const NetworkParam & encoder_param, const NetworkParam & head_param, + const TrtCommonConfig & encoder_param, const TrtCommonConfig & head_param, const DensificationParam & densification_param, const CenterPointConfig & config); virtual ~CenterPointTRT(); @@ -66,6 +50,7 @@ class CenterPointTRT protected: void initPtr(); + void initTrt(const TrtCommonConfig & encoder_param, const TrtCommonConfig & head_param); virtual bool preprocess( const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer); @@ -75,8 +60,8 @@ class CenterPointTRT void postProcess(std::vector & det_boxes3d); std::unique_ptr vg_ptr_{nullptr}; - std::unique_ptr encoder_trt_ptr_{nullptr}; - std::unique_ptr head_trt_ptr_{nullptr}; + std::unique_ptr encoder_trt_ptr_{nullptr}; + std::unique_ptr head_trt_ptr_{nullptr}; std::unique_ptr post_proc_ptr_{nullptr}; cudaStream_t stream_{nullptr}; diff --git a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/network_trt.hpp b/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/network_trt.hpp deleted file mode 100644 index 17778d77ed798..0000000000000 --- a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/network_trt.hpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2021 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef AUTOWARE__LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_ -#define AUTOWARE__LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_ - -#include "autoware/lidar_centerpoint/centerpoint_config.hpp" -#include "autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp" - -#include - -namespace autoware::lidar_centerpoint -{ -class VoxelEncoderTRT : public TensorRTWrapper -{ -public: - using TensorRTWrapper::TensorRTWrapper; - -protected: - bool setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) override; -}; - -class HeadTRT : public TensorRTWrapper -{ -public: - using TensorRTWrapper::TensorRTWrapper; - - HeadTRT(const std::vector & out_channel_sizes, const CenterPointConfig & config); - -protected: - bool setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) override; - - std::vector out_channel_sizes_; -}; - -} // namespace autoware::lidar_centerpoint - -#endif // AUTOWARE__LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_ diff --git a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp b/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp deleted file mode 100644 index d56ccc699add3..0000000000000 --- a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2021 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef AUTOWARE__LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_ -#define AUTOWARE__LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_ - -#include "NvInfer.h" -#include "autoware/lidar_centerpoint/centerpoint_config.hpp" -#include "autoware/tensorrt_common/tensorrt_common.hpp" - -#include -#include -#include - -namespace autoware::lidar_centerpoint -{ - -class TensorRTWrapper -{ -public: - explicit TensorRTWrapper(const CenterPointConfig & config); - - virtual ~TensorRTWrapper(); - - bool init( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision); - - autoware::tensorrt_common::TrtUniquePtr context_{nullptr}; - -protected: - virtual bool setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) = 0; - - CenterPointConfig config_; - autoware::tensorrt_common::Logger logger_; - -private: - bool parseONNX( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - std::size_t workspace_size = (1ULL << 30)); - - bool saveEngine(const std::string & engine_path); - - bool loadEngine(const std::string & engine_path); - - bool createContext(); - - autoware::tensorrt_common::TrtUniquePtr runtime_{nullptr}; - autoware::tensorrt_common::TrtUniquePtr plan_{nullptr}; - autoware::tensorrt_common::TrtUniquePtr engine_{nullptr}; -}; - -} // namespace autoware::lidar_centerpoint - -#endif // AUTOWARE__LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_ diff --git a/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp index 530f59179d25b..c586eeb961716 100644 --- a/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp +++ b/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp @@ -28,39 +28,22 @@ #include #include #include +#include +#include #include namespace autoware::lidar_centerpoint { CenterPointTRT::CenterPointTRT( - const NetworkParam & encoder_param, const NetworkParam & head_param, + const TrtCommonConfig & encoder_param, const TrtCommonConfig & head_param, const DensificationParam & densification_param, const CenterPointConfig & config) : config_(config) { vg_ptr_ = std::make_unique(densification_param, config_); post_proc_ptr_ = std::make_unique(config_); - // encoder - encoder_trt_ptr_ = std::make_unique(config_); - encoder_trt_ptr_->init( - encoder_param.onnx_path(), encoder_param.engine_path(), encoder_param.trt_precision()); - encoder_trt_ptr_->context_->setBindingDimensions( - 0, - nvinfer1::Dims3( - config_.max_voxel_size_, config_.max_point_in_voxel_size_, config_.encoder_in_feature_size_)); - - // head - std::vector out_channel_sizes = { - config_.class_size_, config_.head_out_offset_size_, config_.head_out_z_size_, - config_.head_out_dim_size_, config_.head_out_rot_size_, config_.head_out_vel_size_}; - head_trt_ptr_ = std::make_unique(out_channel_sizes, config_); - head_trt_ptr_->init(head_param.onnx_path(), head_param.engine_path(), head_param.trt_precision()); - head_trt_ptr_->context_->setBindingDimensions( - 0, nvinfer1::Dims4( - config_.batch_size_, config_.encoder_out_feature_size_, config_.grid_size_y_, - config_.grid_size_x_)); - initPtr(); + initTrt(encoder_param, head_param); cudaStreamCreate(&stream_); } @@ -126,6 +109,66 @@ void CenterPointTRT::initPtr() cudaMemcpyHostToDevice, stream_)); } +void CenterPointTRT::initTrt( + const TrtCommonConfig & encoder_param, const TrtCommonConfig & head_param) +{ + // encoder input profile + auto 
enc_in_dims = nvinfer1::Dims{ + 3, + {static_cast(config_.max_voxel_size_), + static_cast(config_.max_point_in_voxel_size_), + static_cast(config_.encoder_in_feature_size_)}}; + std::vector encoder_profile_dims{ + tensorrt_common::ProfileDims(0, enc_in_dims, enc_in_dims, enc_in_dims)}; + auto encoder_profile_dims_ptr = + std::make_unique>(encoder_profile_dims); + + // head input profile + auto head_in_dims = nvinfer1::Dims{ + 4, + {static_cast(config_.batch_size_), + static_cast(config_.encoder_out_feature_size_), + static_cast(config_.grid_size_y_), static_cast(config_.grid_size_x_)}}; + std::vector head_profile_dims{ + tensorrt_common::ProfileDims(0, head_in_dims, head_in_dims, head_in_dims)}; + std::unordered_map out_channel_map = { + {1, static_cast(config_.class_size_)}, + {2, static_cast(config_.head_out_offset_size_)}, + {3, static_cast(config_.head_out_z_size_)}, + {4, static_cast(config_.head_out_dim_size_)}, + {5, static_cast(config_.head_out_rot_size_)}, + {6, static_cast(config_.head_out_vel_size_)}}; + for (const auto & [tensor_name, channel_size] : out_channel_map) { + auto dims = nvinfer1::Dims{ + 4, + {static_cast(config_.batch_size_), channel_size, + static_cast(config_.down_grid_size_y_), + static_cast(config_.down_grid_size_x_)}}; + head_profile_dims.emplace_back(tensor_name, dims, dims, dims); + } + auto head_profile_dims_ptr = + std::make_unique>(head_profile_dims); + + // initialize trt wrappers + encoder_trt_ptr_ = std::make_unique(encoder_param); + + head_trt_ptr_ = std::make_unique(head_param); + + // setup trt engines + if ( + !encoder_trt_ptr_->setup(std::move(encoder_profile_dims_ptr)) || + !head_trt_ptr_->setup(std::move(head_profile_dims_ptr))) { + throw std::runtime_error("Failed to setup TRT engine."); + } + + // set input shapes + if ( + !encoder_trt_ptr_->setInputShape(0, enc_in_dims) || + !head_trt_ptr_->setInputShape(0, head_in_dims)) { + throw std::runtime_error("Failed to set input shape."); + } +} + bool CenterPointTRT::detect( const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer, std::vector & det_boxes3d, bool & is_num_pillars_within_range) @@ -210,13 +253,10 @@ bool CenterPointTRT::preprocess( void CenterPointTRT::inference() { - if (!encoder_trt_ptr_->context_ || !head_trt_ptr_->context_) { - throw std::runtime_error("Failed to create tensorrt context."); - } - // pillar encoder network - std::vector encoder_buffers{encoder_in_features_d_.get(), pillar_features_d_.get()}; - encoder_trt_ptr_->context_->enqueueV2(encoder_buffers.data(), stream_, nullptr); + std::vector encoder_tensors = {encoder_in_features_d_.get(), pillar_features_d_.get()}; + encoder_trt_ptr_->setTensorsAddresses(encoder_tensors); + encoder_trt_ptr_->enqueueV3(stream_); // scatter CHECK_CUDA_ERROR(scatterFeatures_launch( @@ -225,11 +265,13 @@ void CenterPointTRT::inference() spatial_features_d_.get(), stream_)); // head network - std::vector head_buffers = {spatial_features_d_.get(), head_out_heatmap_d_.get(), + std::vector head_tensors = {spatial_features_d_.get(), head_out_heatmap_d_.get(), head_out_offset_d_.get(), head_out_z_d_.get(), head_out_dim_d_.get(), head_out_rot_d_.get(), head_out_vel_d_.get()}; - head_trt_ptr_->context_->enqueueV2(head_buffers.data(), stream_, nullptr); + head_trt_ptr_->setTensorsAddresses(head_tensors); + + head_trt_ptr_->enqueueV3(stream_); } void CenterPointTRT::postProcess(std::vector & det_boxes3d) diff --git a/perception/autoware_lidar_centerpoint/lib/network/network_trt.cpp 
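The initTrt() added above expresses the CenterPoint encoder and head, which have fixed input shapes, as optimization profiles whose min/opt/max dimensions are identical, passes them to setup(), and then pins the input shape once with setInputShape(). A shortened sketch of that idiom; the placeholder values stand in for the config_ fields, the wrapper type is assumed to be tensorrt_common::TrtCommon, and ProfileDims is addressed by tensor index here (the transfusion changes later in this diff use the name-based overload instead):

// A static input is a profile whose min, opt and max dims are all equal.
const nvinfer1::Dims enc_in_dims{3, {max_voxels, points_per_voxel, encoder_in_features}};
std::vector<autoware::tensorrt_common::ProfileDims> encoder_profile{
  autoware::tensorrt_common::ProfileDims(0, enc_in_dims, enc_in_dims, enc_in_dims)};

encoder_trt_ptr_ = std::make_unique<autoware::tensorrt_common::TrtCommon>(encoder_param);
if (!encoder_trt_ptr_->setup(
      std::make_unique<std::vector<autoware::tensorrt_common::ProfileDims>>(encoder_profile))) {
  throw std::runtime_error("Failed to setup TRT engine.");
}
if (!encoder_trt_ptr_->setInputShape(0, enc_in_dims)) {
  throw std::runtime_error("Failed to set input shape.");
}
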
b/perception/autoware_lidar_centerpoint/lib/network/network_trt.cpp deleted file mode 100644 index 7767c0b5cbb02..0000000000000 --- a/perception/autoware_lidar_centerpoint/lib/network/network_trt.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2021 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "autoware/lidar_centerpoint/network/network_trt.hpp" - -#include -#include - -namespace autoware::lidar_centerpoint -{ -bool VoxelEncoderTRT::setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) -{ - auto profile = builder.createOptimizationProfile(); - auto in_name = network.getInput(0)->getName(); - auto in_dims = nvinfer1::Dims3( - config_.max_voxel_size_, config_.max_point_in_voxel_size_, config_.encoder_in_feature_size_); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMIN, in_dims); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kOPT, in_dims); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMAX, in_dims); - - auto out_name = network.getOutput(0)->getName(); - auto out_dims = nvinfer1::Dims2(config_.max_voxel_size_, config_.encoder_out_feature_size_); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMIN, out_dims); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kOPT, out_dims); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMAX, out_dims); - config.addOptimizationProfile(profile); - - return true; -} - -HeadTRT::HeadTRT( - const std::vector & out_channel_sizes, const CenterPointConfig & config) -: TensorRTWrapper(config), out_channel_sizes_(out_channel_sizes) -{ -} - -bool HeadTRT::setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) -{ - auto profile = builder.createOptimizationProfile(); - auto in_name = network.getInput(0)->getName(); - auto in_dims = nvinfer1::Dims4( - config_.batch_size_, config_.encoder_out_feature_size_, config_.grid_size_y_, - config_.grid_size_x_); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMIN, in_dims); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kOPT, in_dims); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMAX, in_dims); - - for (std::size_t ci = 0; ci < out_channel_sizes_.size(); ci++) { - auto out_name = network.getOutput(ci)->getName(); - - if ( - out_name == std::string("heatmap") && - network.getOutput(ci)->getDimensions().d[1] != static_cast(out_channel_sizes_[ci])) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Expected and actual number of classes do not match" << std::endl; - return false; - } - auto out_dims = nvinfer1::Dims4( - config_.batch_size_, out_channel_sizes_[ci], config_.down_grid_size_y_, - config_.down_grid_size_x_); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMIN, out_dims); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kOPT, out_dims); - 
profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMAX, out_dims); - } - config.addOptimizationProfile(profile); - - return true; -} - -} // namespace autoware::lidar_centerpoint diff --git a/perception/autoware_lidar_centerpoint/lib/network/tensorrt_wrapper.cpp b/perception/autoware_lidar_centerpoint/lib/network/tensorrt_wrapper.cpp deleted file mode 100644 index 6d4c2e053b34f..0000000000000 --- a/perception/autoware_lidar_centerpoint/lib/network/tensorrt_wrapper.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2021 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp" - -#include "NvOnnxParser.h" - -#include -#include -#include - -namespace autoware::lidar_centerpoint -{ -TensorRTWrapper::TensorRTWrapper(const CenterPointConfig & config) : config_(config) -{ -} - -TensorRTWrapper::~TensorRTWrapper() -{ - context_.reset(); - runtime_.reset(); - plan_.reset(); - engine_.reset(); -} - -bool TensorRTWrapper::init( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision) -{ - runtime_ = autoware::tensorrt_common::TrtUniquePtr( - nvinfer1::createInferRuntime(logger_)); - if (!runtime_) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create runtime" << std::endl; - return false; - } - - bool success; - std::ifstream engine_file(engine_path); - if (engine_file.is_open()) { - success = loadEngine(engine_path); - } else { - auto log_thread = logger_.log_throttle( - nvinfer1::ILogger::Severity::kINFO, - "Applying optimizations and building TRT CUDA engine. 
Please wait a minutes...", 5); - success = parseONNX(onnx_path, engine_path, precision); - logger_.stop_throttle(log_thread); - } - success &= createContext(); - - return success; -} - -bool TensorRTWrapper::createContext() -{ - if (!engine_) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Failed to create context: Engine was not created" << std::endl; - return false; - } - - context_ = autoware::tensorrt_common::TrtUniquePtr( - engine_->createExecutionContext()); - if (!context_) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create context" << std::endl; - return false; - } - - return true; -} - -bool TensorRTWrapper::parseONNX( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - const std::size_t workspace_size) -{ - auto builder = autoware::tensorrt_common::TrtUniquePtr( - nvinfer1::createInferBuilder(logger_)); - if (!builder) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create builder" << std::endl; - return false; - } - - auto config = autoware::tensorrt_common::TrtUniquePtr( - builder->createBuilderConfig()); - if (!config) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create config" << std::endl; - return false; - } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400 - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size); -#else - config->setMaxWorkspaceSize(workspace_size); -#endif - if (precision == "fp16") { - if (builder->platformHasFastFp16()) { - autoware::tensorrt_common::LOG_INFO(logger_) << "Using TensorRT FP16 Inference" << std::endl; - config->setFlag(nvinfer1::BuilderFlag::kFP16); - } else { - autoware::tensorrt_common::LOG_INFO(logger_) - << "TensorRT FP16 Inference isn't supported in this environment" << std::endl; - } - } - - const auto flag = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - auto network = autoware::tensorrt_common::TrtUniquePtr( - builder->createNetworkV2(flag)); - if (!network) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create network" << std::endl; - return false; - } - - auto parser = autoware::tensorrt_common::TrtUniquePtr( - nvonnxparser::createParser(*network, logger_)); - parser->parseFromFile(onnx_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR)); - - if (!setProfile(*builder, *network, *config)) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to set profile" << std::endl; - return false; - } - - plan_ = autoware::tensorrt_common::TrtUniquePtr( - builder->buildSerializedNetwork(*network, *config)); - if (!plan_) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Failed to create serialized network" << std::endl; - return false; - } - engine_ = autoware::tensorrt_common::TrtUniquePtr( - runtime_->deserializeCudaEngine(plan_->data(), plan_->size())); - if (!engine_) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create engine" << std::endl; - return false; - } - - return saveEngine(engine_path); -} - -bool TensorRTWrapper::saveEngine(const std::string & engine_path) -{ - autoware::tensorrt_common::LOG_INFO(logger_) << "Writing to " << engine_path << std::endl; - std::ofstream file(engine_path, std::ios::out | std::ios::binary); - file.write(reinterpret_cast(plan_->data()), plan_->size()); - return true; -} - -bool TensorRTWrapper::loadEngine(const std::string & engine_path) -{ - std::ifstream engine_file(engine_path); - std::stringstream engine_buffer; - engine_buffer << 
engine_file.rdbuf(); - std::string engine_str = engine_buffer.str(); - engine_ = - autoware::tensorrt_common::TrtUniquePtr(runtime_->deserializeCudaEngine( - reinterpret_cast(engine_str.data()), engine_str.size())); - autoware::tensorrt_common::LOG_INFO(logger_) << "Loaded engine from " << engine_path << std::endl; - return true; -} - -} // namespace autoware::lidar_centerpoint diff --git a/perception/autoware_lidar_centerpoint/src/node.cpp b/perception/autoware_lidar_centerpoint/src/node.cpp index 825fc40234120..19d1475db9051 100644 --- a/perception/autoware_lidar_centerpoint/src/node.cpp +++ b/perception/autoware_lidar_centerpoint/src/node.cpp @@ -86,8 +86,8 @@ LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_opti iou_bev_nms_.setParameters(p); } - NetworkParam encoder_param(encoder_onnx_path, encoder_engine_path, trt_precision); - NetworkParam head_param(head_onnx_path, head_engine_path, trt_precision); + TrtCommonConfig encoder_param(encoder_onnx_path, trt_precision, encoder_engine_path); + TrtCommonConfig head_param(head_onnx_path, trt_precision, head_engine_path); DensificationParam densification_param( densification_world_frame_id, densification_num_past_frames); diff --git a/perception/autoware_lidar_transfusion/CMakeLists.txt b/perception/autoware_lidar_transfusion/CMakeLists.txt index c3a56f883fbe5..fdb978c64946f 100644 --- a/perception/autoware_lidar_transfusion/CMakeLists.txt +++ b/perception/autoware_lidar_transfusion/CMakeLists.txt @@ -84,7 +84,6 @@ if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL) ament_auto_add_library(${PROJECT_NAME}_lib SHARED lib/detection_class_remapper.cpp - lib/network/network_trt.cpp lib/postprocess/non_maximum_suppression.cpp lib/preprocess/voxel_generator.cpp lib/preprocess/pointcloud_densification.cpp diff --git a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/network/network_trt.hpp b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/network/network_trt.hpp deleted file mode 100644 index 71cd08acdb8c8..0000000000000 --- a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/network/network_trt.hpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2024 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_ -#define AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_ - -#include "autoware/lidar_transfusion/transfusion_config.hpp" -#include "autoware/lidar_transfusion/utils.hpp" - -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace autoware::lidar_transfusion -{ - -struct ProfileDimension -{ - nvinfer1::Dims min; - nvinfer1::Dims opt; - nvinfer1::Dims max; - - bool operator!=(const ProfileDimension & rhs) const - { - return min.nbDims != rhs.min.nbDims || opt.nbDims != rhs.opt.nbDims || - max.nbDims != rhs.max.nbDims || !std::equal(min.d, min.d + min.nbDims, rhs.min.d) || - !std::equal(opt.d, opt.d + opt.nbDims, rhs.opt.d) || - !std::equal(max.d, max.d + max.nbDims, rhs.max.d); - } -}; - -class NetworkTRT -{ -public: - explicit NetworkTRT(const TransfusionConfig & config); - ~NetworkTRT(); - - bool init( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision); - const char * getTensorName(NetworkIO name); - - autoware::tensorrt_common::TrtUniquePtr engine{nullptr}; - autoware::tensorrt_common::TrtUniquePtr context{nullptr}; - -private: - bool parseONNX( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - std::size_t workspace_size = (1ULL << 30)); - bool saveEngine(const std::string & engine_path); - bool loadEngine(const std::string & engine_path); - bool createContext(); - bool setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config); - bool validateNetworkIO(); - nvinfer1::Dims validateTensorShape(NetworkIO name, const std::vector shape); - - autoware::tensorrt_common::TrtUniquePtr runtime_{nullptr}; - autoware::tensorrt_common::TrtUniquePtr plan_{nullptr}; - autoware::tensorrt_common::Logger logger_; - TransfusionConfig config_; - std::vector tensors_names_; - - std::array in_profile_dims_; -}; - -} // namespace autoware::lidar_transfusion - -#endif // AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_ diff --git a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_config.hpp b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_config.hpp index 363ee17a0d6e0..e60ee04150b5c 100644 --- a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_config.hpp +++ b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_config.hpp @@ -35,21 +35,21 @@ class TransfusionConfig if (voxels_num.size() == 3) { max_voxels_ = voxels_num[2]; - voxels_num_[0] = voxels_num[0]; - voxels_num_[1] = voxels_num[1]; - voxels_num_[2] = voxels_num[2]; + voxels_num_[0] = static_cast(voxels_num[0]); + voxels_num_[1] = static_cast(voxels_num[1]); + voxels_num_[2] = static_cast(voxels_num[2]); - min_voxel_size_ = voxels_num[0]; - opt_voxel_size_ = voxels_num[1]; - max_voxel_size_ = voxels_num[2]; + min_voxel_size_ = static_cast(voxels_num[0]); + opt_voxel_size_ = static_cast(voxels_num[1]); + max_voxel_size_ = static_cast(voxels_num[2]); - min_points_size_ = voxels_num[0]; - opt_points_size_ = voxels_num[1]; - max_points_size_ = voxels_num[2]; + min_points_size_ = static_cast(voxels_num[0]); + opt_points_size_ = static_cast(voxels_num[1]); + max_points_size_ = static_cast(voxels_num[2]); - min_coors_size_ = voxels_num[0]; - opt_coors_size_ = voxels_num[1]; - max_coors_size_ = voxels_num[2]; + min_coors_size_ = 
static_cast(voxels_num[0]); + opt_coors_size_ = static_cast(voxels_num[1]); + max_coors_size_ = static_cast(voxels_num[2]); } if (point_cloud_range.size() == 6) { min_x_range_ = static_cast(point_cloud_range[0]); @@ -91,7 +91,7 @@ class TransfusionConfig std::size_t cloud_capacity_{}; ///// KERNEL PARAMETERS ///// const std::size_t threads_for_voxel_{256}; // threads number for a block - const std::size_t points_per_voxel_{20}; + const int32_t points_per_voxel_{20}; const std::size_t warp_size_{32}; // one warp(32 threads) for one pillar const std::size_t pillars_per_block_{64}; // one thread deals with one pillar // and a block has pillars_per_block threads @@ -99,9 +99,9 @@ class TransfusionConfig std::size_t max_voxels_{60000}; ///// NETWORK PARAMETERS ///// - const std::size_t batch_size_{1}; - const std::size_t num_classes_{5}; - const std::size_t num_point_feature_size_{5}; // x, y, z, intensity, lag + const int32_t batch_size_{1}; + const int32_t num_classes_{5}; + const int32_t num_point_feature_size_{5}; // x, y, z, intensity, lag // the dimension of the input cloud float min_x_range_{-76.8}; float max_x_range_{76.8}; @@ -114,9 +114,9 @@ class TransfusionConfig float voxel_y_size_{0.3}; float voxel_z_size_{8.0}; const std::size_t out_size_factor_{4}; - const std::size_t max_num_points_per_pillar_{points_per_voxel_}; - const std::size_t num_point_values_{4}; - std::size_t num_proposals_{200}; + const int32_t max_num_points_per_pillar_{points_per_voxel_}; + const int32_t num_point_values_{4}; + int32_t num_proposals_{200}; // the number of feature maps for pillar scatter const std::size_t num_feature_scatter_{pillar_feature_size_}; // the score threshold for classification @@ -126,7 +126,7 @@ class TransfusionConfig std::size_t max_num_pillars_{max_voxels_}; const std::size_t pillar_points_bev_{max_num_points_per_pillar_ * max_num_pillars_}; // the detected boxes result decode by (x, y, z, w, l, h, yaw) - const std::size_t num_box_values_{8}; + const int32_t num_box_values_{8}; // the input size of the 2D backbone network std::size_t grid_x_size_{512}; std::size_t grid_y_size_{512}; @@ -136,33 +136,33 @@ class TransfusionConfig std::size_t feature_y_size_{grid_y_size_ / out_size_factor_}; ///// RUNTIME DIMENSIONS ///// - std::vector voxels_num_{5000, 30000, 60000}; + std::vector voxels_num_{5000, 30000, 60000}; // voxels - std::size_t min_voxel_size_{voxels_num_[0]}; - std::size_t opt_voxel_size_{voxels_num_[1]}; - std::size_t max_voxel_size_{voxels_num_[2]}; + int32_t min_voxel_size_{voxels_num_[0]}; + int32_t opt_voxel_size_{voxels_num_[1]}; + int32_t max_voxel_size_{voxels_num_[2]}; - std::size_t min_point_in_voxel_size_{points_per_voxel_}; - std::size_t opt_point_in_voxel_size_{points_per_voxel_}; - std::size_t max_point_in_voxel_size_{points_per_voxel_}; + int32_t min_point_in_voxel_size_{points_per_voxel_}; + int32_t opt_point_in_voxel_size_{points_per_voxel_}; + int32_t max_point_in_voxel_size_{points_per_voxel_}; - std::size_t min_network_feature_size_{num_point_feature_size_}; - std::size_t opt_network_feature_size_{num_point_feature_size_}; - std::size_t max_network_feature_size_{num_point_feature_size_}; + int32_t min_network_feature_size_{num_point_feature_size_}; + int32_t opt_network_feature_size_{num_point_feature_size_}; + int32_t max_network_feature_size_{num_point_feature_size_}; // num_points - std::size_t min_points_size_{voxels_num_[0]}; - std::size_t opt_points_size_{voxels_num_[1]}; - std::size_t max_points_size_{voxels_num_[2]}; + int32_t 
min_points_size_{voxels_num_[0]}; + int32_t opt_points_size_{voxels_num_[1]}; + int32_t max_points_size_{voxels_num_[2]}; // coors - std::size_t min_coors_size_{voxels_num_[0]}; - std::size_t opt_coors_size_{voxels_num_[1]}; - std::size_t max_coors_size_{voxels_num_[2]}; + int32_t min_coors_size_{voxels_num_[0]}; + int32_t opt_coors_size_{voxels_num_[1]}; + int32_t max_coors_size_{voxels_num_[2]}; - std::size_t min_coors_dim_size_{num_point_values_}; - std::size_t opt_coors_dim_size_{num_point_values_}; - std::size_t max_coors_dim_size_{num_point_values_}; + int32_t min_coors_dim_size_{num_point_values_}; + int32_t opt_coors_dim_size_{num_point_values_}; + int32_t max_coors_dim_size_{num_point_values_}; }; } // namespace autoware::lidar_transfusion diff --git a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_trt.hpp b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_trt.hpp index 06dd65b4b805f..00bb5e726706c 100644 --- a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_trt.hpp +++ b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_trt.hpp @@ -16,7 +16,6 @@ #define AUTOWARE__LIDAR_TRANSFUSION__TRANSFUSION_TRT_HPP_ #include "autoware/lidar_transfusion/cuda_utils.hpp" -#include "autoware/lidar_transfusion/network/network_trt.hpp" #include "autoware/lidar_transfusion/postprocess/postprocess_kernel.hpp" #include "autoware/lidar_transfusion/preprocess/pointcloud_densification.hpp" #include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp" @@ -24,6 +23,7 @@ #include "autoware/lidar_transfusion/utils.hpp" #include "autoware/lidar_transfusion/visibility_control.hpp" +#include #include #include @@ -34,38 +34,17 @@ #include #include #include -#include #include namespace autoware::lidar_transfusion { -class NetworkParam -{ -public: - NetworkParam(std::string onnx_path, std::string engine_path, std::string trt_precision) - : onnx_path_(std::move(onnx_path)), - engine_path_(std::move(engine_path)), - trt_precision_(std::move(trt_precision)) - { - } - - std::string onnx_path() const { return onnx_path_; } - std::string engine_path() const { return engine_path_; } - std::string trt_precision() const { return trt_precision_; } - -private: - std::string onnx_path_; - std::string engine_path_; - std::string trt_precision_; -}; - class LIDAR_TRANSFUSION_PUBLIC TransfusionTRT { public: explicit TransfusionTRT( - const NetworkParam & network_param, const DensificationParam & densification_param, - const TransfusionConfig & config); + const tensorrt_common::TrtCommonConfig & trt_config, + const DensificationParam & densification_param, const TransfusionConfig config); virtual ~TransfusionTRT(); bool detect( @@ -73,6 +52,8 @@ class LIDAR_TRANSFUSION_PUBLIC TransfusionTRT std::vector & det_boxes3d, std::unordered_map & proc_timing); protected: + void initTrt(const tensorrt_common::TrtCommonConfig & trt_config); + void initPtr(); bool preprocess(const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer); @@ -81,7 +62,7 @@ class LIDAR_TRANSFUSION_PUBLIC TransfusionTRT bool postprocess(std::vector & det_boxes3d); - std::unique_ptr network_trt_ptr_{nullptr}; + std::unique_ptr network_trt_ptr_{nullptr}; std::unique_ptr vg_ptr_{nullptr}; std::unique_ptr> stop_watch_ptr_{ nullptr}; diff --git a/perception/autoware_lidar_transfusion/lib/network/network_trt.cpp b/perception/autoware_lidar_transfusion/lib/network/network_trt.cpp deleted file mode 
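The TransfusionConfig changes above switch the profile-related sizes from std::size_t to int32_t. The practical effect, as these fields are used later in this diff, is that the values can be placed directly into nvinfer1::Dims aggregates (whose extents are stored in a signed integer array) when building the optimization profiles, instead of requiring a static_cast at each use; for example:

// With int32_t config fields, a profile dimension can be written directly:
nvinfer1::Dims voxels_min{
  3,
  {config_.min_voxel_size_, config_.min_point_in_voxel_size_, config_.min_network_feature_size_}};
// whereas the former std::size_t fields needed static_cast<int32_t>(...) per element.
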
100644 index 8e3cb8a55ec7e..0000000000000 --- a/perception/autoware_lidar_transfusion/lib/network/network_trt.cpp +++ /dev/null @@ -1,357 +0,0 @@ -// Copyright 2024 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "autoware/lidar_transfusion/network/network_trt.hpp" - -#include - -#include -#include -#include -#include -#include - -namespace autoware::lidar_transfusion -{ - -inline NetworkIO nameToNetworkIO(const char * name) -{ - static const std::unordered_map name_to_enum = { - {"voxels", NetworkIO::voxels}, {"num_points", NetworkIO::num_points}, - {"coors", NetworkIO::coors}, {"cls_score0", NetworkIO::cls_score}, - {"bbox_pred0", NetworkIO::bbox_pred}, {"dir_cls_pred0", NetworkIO::dir_pred}}; - - auto it = name_to_enum.find(name); - if (it != name_to_enum.end()) { - return it->second; - } - throw std::runtime_error("Invalid input name: " + std::string(name)); -} - -std::ostream & operator<<(std::ostream & os, const ProfileDimension & profile) -{ - std::string delim = ""; - os << "min->["; - for (int i = 0; i < profile.min.nbDims; ++i) { - os << delim << profile.min.d[i]; - delim = ", "; - } - os << "], opt->["; - delim = ""; - for (int i = 0; i < profile.opt.nbDims; ++i) { - os << delim << profile.opt.d[i]; - delim = ", "; - } - os << "], max->["; - delim = ""; - for (int i = 0; i < profile.max.nbDims; ++i) { - os << delim << profile.max.d[i]; - delim = ", "; - } - os << "]"; - return os; -} - -NetworkTRT::NetworkTRT(const TransfusionConfig & config) : config_(config) -{ - ProfileDimension voxels_dims = { - nvinfer1::Dims3( - config_.min_voxel_size_, config_.min_point_in_voxel_size_, config_.min_network_feature_size_), - nvinfer1::Dims3( - config_.opt_voxel_size_, config_.opt_point_in_voxel_size_, config_.opt_network_feature_size_), - nvinfer1::Dims3( - config_.max_voxel_size_, config_.max_point_in_voxel_size_, - config_.max_network_feature_size_)}; - ProfileDimension num_points_dims = { - nvinfer1::Dims{1, {static_cast(config_.min_points_size_)}}, - nvinfer1::Dims{1, {static_cast(config_.opt_points_size_)}}, - nvinfer1::Dims{1, {static_cast(config_.max_points_size_)}}}; - ProfileDimension coors_dims = { - nvinfer1::Dims2(config_.min_coors_size_, config_.min_coors_dim_size_), - nvinfer1::Dims2(config_.opt_coors_size_, config_.opt_coors_dim_size_), - nvinfer1::Dims2(config_.max_coors_size_, config_.max_coors_dim_size_)}; - in_profile_dims_ = {voxels_dims, num_points_dims, coors_dims}; -} - -NetworkTRT::~NetworkTRT() -{ - context.reset(); - runtime_.reset(); - plan_.reset(); - engine.reset(); -} - -bool NetworkTRT::init( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision) -{ - runtime_ = autoware::tensorrt_common::TrtUniquePtr( - nvinfer1::createInferRuntime(logger_)); - if (!runtime_) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create runtime" << std::endl; - return false; - } - - bool success; - std::ifstream engine_file(engine_path); - if (engine_file.is_open()) { - success = 
loadEngine(engine_path); - } else { - auto log_thread = logger_.log_throttle( - nvinfer1::ILogger::Severity::kINFO, - "Applying optimizations and building TRT CUDA engine. Please wait a minutes...", 5); - success = parseONNX(onnx_path, engine_path, precision); - logger_.stop_throttle(log_thread); - } - success &= createContext(); - - return success; -} - -bool NetworkTRT::setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) -{ - auto profile = builder.createOptimizationProfile(); - - auto voxels_name = network.getInput(NetworkIO::voxels)->getName(); - auto num_points_name = network.getInput(NetworkIO::num_points)->getName(); - auto coors_name = network.getInput(NetworkIO::coors)->getName(); - - profile->setDimensions( - voxels_name, nvinfer1::OptProfileSelector::kMIN, in_profile_dims_[NetworkIO::voxels].min); - profile->setDimensions( - voxels_name, nvinfer1::OptProfileSelector::kOPT, in_profile_dims_[NetworkIO::voxels].opt); - profile->setDimensions( - voxels_name, nvinfer1::OptProfileSelector::kMAX, in_profile_dims_[NetworkIO::voxels].max); - - profile->setDimensions( - num_points_name, nvinfer1::OptProfileSelector::kMIN, - in_profile_dims_[NetworkIO::num_points].min); - profile->setDimensions( - num_points_name, nvinfer1::OptProfileSelector::kOPT, - in_profile_dims_[NetworkIO::num_points].opt); - profile->setDimensions( - num_points_name, nvinfer1::OptProfileSelector::kMAX, - in_profile_dims_[NetworkIO::num_points].max); - - profile->setDimensions( - coors_name, nvinfer1::OptProfileSelector::kMIN, in_profile_dims_[NetworkIO::coors].min); - profile->setDimensions( - coors_name, nvinfer1::OptProfileSelector::kOPT, in_profile_dims_[NetworkIO::coors].opt); - profile->setDimensions( - coors_name, nvinfer1::OptProfileSelector::kMAX, in_profile_dims_[NetworkIO::coors].max); - - config.addOptimizationProfile(profile); - return true; -} - -bool NetworkTRT::createContext() -{ - if (!engine) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Failed to create context: Engine was not created" << std::endl; - return false; - } - - context = autoware::tensorrt_common::TrtUniquePtr( - engine->createExecutionContext()); - if (!context) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create context" << std::endl; - return false; - } - - return true; -} - -bool NetworkTRT::parseONNX( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - const size_t workspace_size) -{ - auto builder = autoware::tensorrt_common::TrtUniquePtr( - nvinfer1::createInferBuilder(logger_)); - if (!builder) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create builder" << std::endl; - return false; - } - - auto config = autoware::tensorrt_common::TrtUniquePtr( - builder->createBuilderConfig()); - if (!config) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create config" << std::endl; - return false; - } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400 - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size); -#else - config->setMaxWorkspaceSize(workspace_size); -#endif - if (precision == "fp16") { - if (builder->platformHasFastFp16()) { - autoware::tensorrt_common::LOG_INFO(logger_) << "Using TensorRT FP16 Inference" << std::endl; - config->setFlag(nvinfer1::BuilderFlag::kFP16); - } else { - autoware::tensorrt_common::LOG_INFO(logger_) - << "TensorRT FP16 Inference isn't supported in this environment" << 
std::endl; - } - } - - const auto flag = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - auto network = autoware::tensorrt_common::TrtUniquePtr( - builder->createNetworkV2(flag)); - if (!network) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create network" << std::endl; - return false; - } - - auto parser = autoware::tensorrt_common::TrtUniquePtr( - nvonnxparser::createParser(*network, logger_)); - parser->parseFromFile(onnx_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR)); - - if (!setProfile(*builder, *network, *config)) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to set profile" << std::endl; - return false; - } - - plan_ = autoware::tensorrt_common::TrtUniquePtr( - builder->buildSerializedNetwork(*network, *config)); - if (!plan_) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Failed to create serialized network" << std::endl; - return false; - } - engine = autoware::tensorrt_common::TrtUniquePtr( - runtime_->deserializeCudaEngine(plan_->data(), plan_->size())); - if (!engine) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create engine" << std::endl; - return false; - } - - return saveEngine(engine_path); -} - -bool NetworkTRT::saveEngine(const std::string & engine_path) -{ - autoware::tensorrt_common::LOG_INFO(logger_) << "Writing to " << engine_path << std::endl; - std::ofstream file(engine_path, std::ios::out | std::ios::binary); - file.write(reinterpret_cast(plan_->data()), plan_->size()); - return validateNetworkIO(); -} - -bool NetworkTRT::loadEngine(const std::string & engine_path) -{ - std::ifstream engine_file(engine_path); - std::stringstream engine_buffer; - engine_buffer << engine_file.rdbuf(); - std::string engine_str = engine_buffer.str(); - engine = - autoware::tensorrt_common::TrtUniquePtr(runtime_->deserializeCudaEngine( - reinterpret_cast(engine_str.data()), engine_str.size())); - autoware::tensorrt_common::LOG_INFO(logger_) << "Loaded engine from " << engine_path << std::endl; - return validateNetworkIO(); -} - -bool NetworkTRT::validateNetworkIO() -{ - // Whether the number of IO match the expected size - if (engine->getNbIOTensors() != NetworkIO::ENUM_SIZE) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Invalid network IO. Expected size: " << NetworkIO::ENUM_SIZE - << ". Actual size: " << engine->getNbIOTensors() << "." 
<< std::endl; - throw std::runtime_error("Failed to initialize TRT network."); - } - - // Initialize tensors_names_ with null values - tensors_names_.resize(NetworkIO::ENUM_SIZE, nullptr); - - // Loop over the tensor names and place them in the correct positions - for (int i = 0; i < NetworkIO::ENUM_SIZE; ++i) { - const char * name = engine->getIOTensorName(i); - tensors_names_[nameToNetworkIO(name)] = name; - } - - // Log the network IO - std::string tensors = std::accumulate( - tensors_names_.begin(), tensors_names_.end(), std::string(), - [](const std::string & a, const std::string & b) -> std::string { return a + b + " "; }); - autoware::tensorrt_common::LOG_INFO(logger_) << "Network IO: " << tensors << std::endl; - - // Whether the current engine input profile match the config input profile - for (int i = 0; i <= NetworkIO::coors; ++i) { - ProfileDimension engine_dims{ - engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kMIN), - engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kOPT), - engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kMAX)}; - - autoware::tensorrt_common::LOG_INFO(logger_) - << "Profile for " << tensors_names_[i] << ": " << engine_dims << std::endl; - - if (engine_dims != in_profile_dims_[i]) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Invalid network input dimension. Config: " << in_profile_dims_[i] - << ". Please change the input profile or delete the engine file and build engine again." - << std::endl; - throw std::runtime_error("Failed to initialize TRT network."); - } - } - - // Whether the IO tensor shapes match the network config, -1 for dynamic size - validateTensorShape( - NetworkIO::voxels, {-1, static_cast(config_.points_per_voxel_), - static_cast(config_.num_point_feature_size_)}); - validateTensorShape(NetworkIO::num_points, {-1}); - validateTensorShape(NetworkIO::coors, {-1, static_cast(config_.num_point_values_)}); - auto cls_score = validateTensorShape( - NetworkIO::cls_score, - {static_cast(config_.batch_size_), static_cast(config_.num_classes_), - static_cast(config_.num_proposals_)}); - autoware::tensorrt_common::LOG_INFO(logger_) - << "Network num classes: " << cls_score.d[1] << std::endl; - validateTensorShape( - NetworkIO::dir_pred, - {static_cast(config_.batch_size_), 2, static_cast(config_.num_proposals_)}); // x, y - validateTensorShape( - NetworkIO::bbox_pred, - {static_cast(config_.batch_size_), static_cast(config_.num_box_values_), - static_cast(config_.num_proposals_)}); - - return true; -} - -const char * NetworkTRT::getTensorName(NetworkIO name) -{ - return tensors_names_.at(name); -} - -nvinfer1::Dims NetworkTRT::validateTensorShape(NetworkIO name, const std::vector shape) -{ - auto tensor_shape = engine->getTensorShape(tensors_names_[name]); - if (tensor_shape.nbDims != static_cast(shape.size())) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Invalid tensor shape for " << tensors_names_[name] << ". Expected size: " << shape.size() - << ". Actual size: " << tensor_shape.nbDims << "." << std::endl; - throw std::runtime_error("Failed to initialize TRT network."); - } - for (int i = 0; i < tensor_shape.nbDims; ++i) { - if (tensor_shape.d[i] != static_cast(shape[i])) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Invalid tensor shape for " << tensors_names_[name] << ". Expected: " << shape[i] - << ". Actual: " << tensor_shape.d[i] << "." 
<< std::endl; - throw std::runtime_error("Failed to initialize TRT network."); - } - } - return tensor_shape; -} - -} // namespace autoware::lidar_transfusion diff --git a/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp b/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp index 4e100f2e794d5..dd075a45cb29c 100644 --- a/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp +++ b/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp @@ -17,6 +17,7 @@ #include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp" #include "autoware/lidar_transfusion/transfusion_config.hpp" +#include #include #include @@ -27,25 +28,23 @@ #include #include #include +#include #include namespace autoware::lidar_transfusion { TransfusionTRT::TransfusionTRT( - const NetworkParam & network_param, const DensificationParam & densification_param, - const TransfusionConfig & config) -: config_(config) + const tensorrt_common::TrtCommonConfig & trt_config, + const DensificationParam & densification_param, TransfusionConfig config) +: config_(std::move(config)) { - network_trt_ptr_ = std::make_unique(config_); - - network_trt_ptr_->init( - network_param.onnx_path(), network_param.engine_path(), network_param.trt_precision()); vg_ptr_ = std::make_unique(densification_param, config_, stream_); stop_watch_ptr_ = std::make_unique>(); stop_watch_ptr_->tic("processing/inner"); initPtr(); + initTrt(trt_config); CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); } @@ -99,6 +98,51 @@ void TransfusionTRT::initPtr() post_ptr_ = std::make_unique(config_, stream_); } +void TransfusionTRT::initTrt(const tensorrt_common::TrtCommonConfig & trt_config) +{ + std::vector network_io{ + autoware::tensorrt_common::NetworkIO( + "voxels", {3, {-1, config_.points_per_voxel_, config_.num_point_feature_size_}}), + autoware::tensorrt_common::NetworkIO("num_points", {1, {-1}}), + autoware::tensorrt_common::NetworkIO("coors", {2, {-1, config_.num_point_values_}}), + autoware::tensorrt_common::NetworkIO( + "cls_score0", {3, {config_.batch_size_, config_.num_classes_, config_.num_proposals_}}), + autoware::tensorrt_common::NetworkIO( + "bbox_pred0", {3, {config_.batch_size_, config_.num_box_values_, config_.num_proposals_}}), + autoware::tensorrt_common::NetworkIO( + "dir_cls_pred0", {3, {config_.batch_size_, 2, config_.num_proposals_}})}; + + std::vector profile_dims{ + autoware::tensorrt_common::ProfileDims( + "voxels", + {3, + {config_.min_voxel_size_, config_.min_point_in_voxel_size_, + config_.min_network_feature_size_}}, + {3, + {config_.opt_voxel_size_, config_.opt_point_in_voxel_size_, + config_.opt_network_feature_size_}}, + {3, + {config_.max_voxel_size_, config_.max_point_in_voxel_size_, + config_.max_network_feature_size_}}), + autoware::tensorrt_common::ProfileDims( + "num_points", {1, {static_cast(config_.min_points_size_)}}, + {1, {static_cast(config_.opt_points_size_)}}, + {1, {static_cast(config_.max_points_size_)}}), + autoware::tensorrt_common::ProfileDims( + "coors", {2, {config_.min_coors_size_, config_.min_coors_dim_size_}}, + {2, {config_.opt_coors_size_, config_.opt_coors_dim_size_}}, + {2, {config_.max_coors_size_, config_.max_coors_dim_size_}})}; + + auto network_io_ptr = + std::make_unique>(network_io); + auto profile_dims_ptr = + std::make_unique>(profile_dims); + + network_trt_ptr_ = std::make_unique(trt_config); + if (!network_trt_ptr_->setup(std::move(profile_dims_ptr), std::move(network_io_ptr))) + throw std::runtime_error("Failed to setup TRT engine."); +} + bool 
TransfusionTRT::detect( const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer, std::vector & det_boxes3d, std::unordered_map & proc_timing) @@ -166,8 +210,9 @@ bool TransfusionTRT::preprocess( CHECK_CUDA_ERROR(cudaMemcpyAsync( ¶ms_input, params_input_d_.get(), sizeof(unsigned int), cudaMemcpyDeviceToHost, stream_)); CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + auto params_input_i32 = static_cast(params_input); - if (params_input < config_.min_voxel_size_) { + if (params_input_i32 < config_.min_voxel_size_) { RCLCPP_WARN_STREAM( rclcpp::get_logger("lidar_transfusion"), "Too few voxels (" << params_input << ") for actual optimization profile (" @@ -185,39 +230,34 @@ bool TransfusionTRT::preprocess( params_input = config_.max_voxels_; } + RCLCPP_DEBUG_STREAM( rclcpp::get_logger("lidar_transfusion"), "Generated input voxels: " << params_input); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::voxels), voxel_features_d_.get()); - network_trt_ptr_->context->setInputShape( - network_trt_ptr_->getTensorName(NetworkIO::voxels), + bool success = true; + + // Inputs + success &= network_trt_ptr_->setTensor( + "voxels", voxel_features_d_.get(), nvinfer1::Dims3{ - static_cast(params_input), static_cast(config_.max_num_points_per_pillar_), + params_input_i32, config_.max_num_points_per_pillar_, static_cast(config_.num_point_feature_size_)}); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::num_points), voxel_num_d_.get()); - network_trt_ptr_->context->setInputShape( - network_trt_ptr_->getTensorName(NetworkIO::num_points), - nvinfer1::Dims{1, {static_cast(params_input)}}); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::coors), voxel_idxs_d_.get()); - network_trt_ptr_->context->setInputShape( - network_trt_ptr_->getTensorName(NetworkIO::coors), - nvinfer1::Dims2{ - static_cast(params_input), static_cast(config_.num_point_values_)}); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::cls_score), cls_output_d_.get()); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::bbox_pred), box_output_d_.get()); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::dir_pred), dir_cls_output_d_.get()); - return true; + success &= network_trt_ptr_->setTensor( + "num_points", voxel_num_d_.get(), nvinfer1::Dims{1, {params_input_i32}}); + success &= network_trt_ptr_->setTensor( + "coors", voxel_idxs_d_.get(), nvinfer1::Dims2{params_input_i32, config_.num_point_values_}); + + // Outputs + success &= network_trt_ptr_->setTensor("cls_score0", cls_output_d_.get()); + success &= network_trt_ptr_->setTensor("bbox_pred0", box_output_d_.get()); + success &= network_trt_ptr_->setTensor("dir_cls_pred0", dir_cls_output_d_.get()); + + return success; } bool TransfusionTRT::inference() { - auto status = network_trt_ptr_->context->enqueueV3(stream_); + auto status = network_trt_ptr_->enqueueV3(stream_); CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); if (!status) { diff --git a/perception/autoware_lidar_transfusion/src/lidar_transfusion_node.cpp b/perception/autoware_lidar_transfusion/src/lidar_transfusion_node.cpp index 8bb70a821621f..563abd97698e0 100644 --- a/perception/autoware_lidar_transfusion/src/lidar_transfusion_node.cpp +++ b/perception/autoware_lidar_transfusion/src/lidar_transfusion_node.cpp @@ -75,7 +75,6 @@ 
LidarTransfusionNode::LidarTransfusionNode(const rclcpp::NodeOptions & options) const float score_threshold = static_cast(this->declare_parameter("score_threshold", descriptor)); - NetworkParam network_param(onnx_path, engine_path, trt_precision); DensificationParam densification_param( densification_world_frame_id, densification_num_past_frames); TransfusionConfig config( @@ -91,7 +90,9 @@ LidarTransfusionNode::LidarTransfusionNode(const rclcpp::NodeOptions & options) detection_class_remapper_.setParameters( allow_remapping_by_area_matrix, min_area_matrix, max_area_matrix); - detector_ptr_ = std::make_unique(network_param, densification_param, config); + auto trt_config = tensorrt_common::TrtCommonConfig(onnx_path, trt_precision, engine_path); + + detector_ptr_ = std::make_unique(trt_config, densification_param, config); cloud_sub_ = this->create_subscription( "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1), @@ -155,7 +156,6 @@ void LidarTransfusionNode::cloudCallback(const sensor_msgs::msg::PointCloud2::Co objects_pub_->publish(output_msg); published_time_pub_->publish_if_subscribed(objects_pub_, output_msg.header.stamp); } - // add processing time for debug if (debug_publisher_ptr_ && stop_watch_ptr_) { const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic", true); diff --git a/perception/autoware_shape_estimation/CMakeLists.txt b/perception/autoware_shape_estimation/CMakeLists.txt index d0eb2aa5535a8..d7313111e6ade 100644 --- a/perception/autoware_shape_estimation/CMakeLists.txt +++ b/perception/autoware_shape_estimation/CMakeLists.txt @@ -4,6 +4,9 @@ project(autoware_shape_estimation) find_package(autoware_cmake REQUIRED) autoware_package() +# TODO(amadeuszsz): Remove -Wno-deprecated-declarations once removing implicit quantization +add_compile_options(-Wno-deprecated-declarations) + find_package(PCL REQUIRED COMPONENTS common) find_package(pcl_conversions REQUIRED) diff --git a/perception/autoware_shape_estimation/include/autoware/shape_estimation/tensorrt_shape_estimator.hpp b/perception/autoware_shape_estimation/include/autoware/shape_estimation/tensorrt_shape_estimator.hpp index d39dce65d224d..4328246ff395f 100644 --- a/perception/autoware_shape_estimation/include/autoware/shape_estimation/tensorrt_shape_estimator.hpp +++ b/perception/autoware_shape_estimation/include/autoware/shape_estimation/tensorrt_shape_estimator.hpp @@ -48,11 +48,7 @@ class TrtShapeEstimator { public: TrtShapeEstimator( - const std::string & model_path, const std::string & precision, - const autoware::tensorrt_common::BatchConfig & batch_config, - const size_t max_workspace_size = (1 << 30), - const autoware::tensorrt_common::BuildConfig build_config = - autoware::tensorrt_common::BuildConfig("MinMax", -1, false, false, false, 0.0)); + const std::string & model_path, const std::string & precision, const int batch_size); ~TrtShapeEstimator() = default; diff --git a/perception/autoware_shape_estimation/lib/tensorrt_shape_estimator.cpp b/perception/autoware_shape_estimation/lib/tensorrt_shape_estimator.cpp index b4b1f18832da7..fa3c3cb09bad3 100644 --- a/perception/autoware_shape_estimation/lib/tensorrt_shape_estimator.cpp +++ b/perception/autoware_shape_estimation/lib/tensorrt_shape_estimator.cpp @@ -29,47 +29,41 @@ namespace autoware::shape_estimation { TrtShapeEstimator::TrtShapeEstimator( - const std::string & model_path, const std::string & precision, - const autoware::tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size, - const autoware::tensorrt_common::BuildConfig 
build_config) + const std::string & model_path, const std::string & precision, const int batch_size) { trt_common_ = std::make_unique( - model_path, precision, nullptr, batch_config, max_workspace_size, build_config); + tensorrt_common::TrtCommonConfig(model_path, precision)); + batch_size_ = batch_size; - trt_common_->setup(); - - if (!trt_common_->isInitialized()) { - std::cerr << "Failed to initialize TensorRT" << std::endl; - return; + if (!trt_common_->setup()) { + throw std::runtime_error("Failed to setup TensorRT"); } - const auto pc_input_dims = trt_common_->getBindingDimensions(0); + const auto pc_input_dims = trt_common_->getTensorShape(0); const auto pc_input_size = std::accumulate( pc_input_dims.d + 1, pc_input_dims.d + pc_input_dims.nbDims, 1, std::multiplies()); - input_pc_d_ = autoware::cuda_utils::make_unique(pc_input_size * batch_config[2]); - batch_size_ = batch_config[2]; - const auto one_hot_input_dims = trt_common_->getBindingDimensions(1); + input_pc_d_ = autoware::cuda_utils::make_unique(pc_input_size * batch_size_); + const auto one_hot_input_dims = trt_common_->getTensorShape(1); const auto one_hot_input_size = std::accumulate( one_hot_input_dims.d + 1, one_hot_input_dims.d + one_hot_input_dims.nbDims, 1, std::multiplies()); - input_one_hot_d_ = - autoware::cuda_utils::make_unique(one_hot_input_size * batch_config[2]); + input_one_hot_d_ = autoware::cuda_utils::make_unique(one_hot_input_size * batch_size_); - const auto stage1_center_out_dims = trt_common_->getBindingDimensions(2); + const auto stage1_center_out_dims = trt_common_->getTensorShape(2); out_s1center_elem_num_ = std::accumulate( stage1_center_out_dims.d + 1, stage1_center_out_dims.d + stage1_center_out_dims.nbDims, 1, std::multiplies()); - out_s1center_elem_num_ = out_s1center_elem_num_ * batch_config[2]; - out_s1center_elem_num_per_batch_ = static_cast(out_s1center_elem_num_ / batch_config[2]); + out_s1center_elem_num_ = out_s1center_elem_num_ * batch_size_; + out_s1center_elem_num_per_batch_ = static_cast(out_s1center_elem_num_ / batch_size_); out_s1center_prob_d_ = autoware::cuda_utils::make_unique(out_s1center_elem_num_); out_s1center_prob_h_ = autoware::cuda_utils::make_unique_host(out_s1center_elem_num_, cudaHostAllocPortable); - const auto pred_out_dims = trt_common_->getBindingDimensions(3); + const auto pred_out_dims = trt_common_->getTensorShape(3); out_pred_elem_num_ = std::accumulate( pred_out_dims.d + 1, pred_out_dims.d + pred_out_dims.nbDims, 1, std::multiplies()); - out_pred_elem_num_ = out_pred_elem_num_ * batch_config[2]; - out_pred_elem_num_per_batch_ = static_cast(out_pred_elem_num_ / batch_config[2]); + out_pred_elem_num_ = out_pred_elem_num_ * batch_size_; + out_pred_elem_num_per_batch_ = static_cast(out_pred_elem_num_ / batch_size_); out_pred_prob_d_ = autoware::cuda_utils::make_unique(out_pred_elem_num_); out_pred_prob_h_ = autoware::cuda_utils::make_unique_host(out_pred_elem_num_, cudaHostAllocPortable); @@ -83,10 +77,6 @@ TrtShapeEstimator::TrtShapeEstimator( bool TrtShapeEstimator::inference( const DetectedObjectsWithFeature & input, DetectedObjectsWithFeature & output) { - if (!trt_common_->isInitialized()) { - return false; - } - bool result = false; for (size_t i = 0; i < input.feature_objects.size(); i += batch_size_) { @@ -111,13 +101,13 @@ bool TrtShapeEstimator::inference( void TrtShapeEstimator::preprocess(const DetectedObjectsWithFeature & input) { - auto input_dims_pc = trt_common_->getBindingDimensions(0); + auto input_dims_pc = trt_common_->getTensorShape(0); int 
batch_size = static_cast(input.feature_objects.size()); const auto input_chan = static_cast(input_dims_pc.d[1]); const auto input_pc_size = static_cast(input_dims_pc.d[2]); - auto input_dims_one_hot = trt_common_->getBindingDimensions(1); + auto input_dims_one_hot = trt_common_->getTensorShape(1); const auto input_one_hot_size = static_cast(input_dims_one_hot.d[1]); int volume_pc = batch_size * input_chan * input_pc_size; @@ -229,7 +219,10 @@ bool TrtShapeEstimator::feed_forward_and_decode( int batch_size = static_cast(input.feature_objects.size()); std::vector buffers = { input_pc_d_.get(), input_one_hot_d_.get(), out_s1center_prob_d_.get(), out_pred_prob_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); CHECK_CUDA_ERROR(cudaMemcpyAsync( out_s1center_prob_h_.get(), out_s1center_prob_d_.get(), out_s1center_elem_num_ * sizeof(float), diff --git a/perception/autoware_shape_estimation/src/shape_estimation_node.cpp b/perception/autoware_shape_estimation/src/shape_estimation_node.cpp index 67402f68d63ef..c5450dc14adb0 100644 --- a/perception/autoware_shape_estimation/src/shape_estimation_node.cpp +++ b/perception/autoware_shape_estimation/src/shape_estimation_node.cpp @@ -63,9 +63,8 @@ ShapeEstimationNode::ShapeEstimationNode(const rclcpp::NodeOptions & node_option min_points_ = declare_parameter("model_params.minimum_points"); std::string precision = declare_parameter("model_params.precision"); int batch_size = declare_parameter("model_params.batch_size"); - autoware::tensorrt_common::BatchConfig batch_config{batch_size, batch_size, batch_size}; tensorrt_shape_estimator_ = - std::make_unique(model_path, precision, batch_config); + std::make_unique(model_path, precision, batch_size); if (this->declare_parameter("model_params.build_only", false)) { RCLCPP_INFO(this->get_logger(), "TensorRT engine is built and shutdown node."); rclcpp::shutdown(); diff --git a/perception/autoware_tensorrt_classifier/CMakeLists.txt b/perception/autoware_tensorrt_classifier/CMakeLists.txt index 88747a1e9240c..59ce7b18afcc3 100644 --- a/perception/autoware_tensorrt_classifier/CMakeLists.txt +++ b/perception/autoware_tensorrt_classifier/CMakeLists.txt @@ -6,6 +6,9 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O3 -Wno-write-strings -fopen find_package(autoware_cmake REQUIRED) autoware_package() +# TODO(amadeuszsz): Remove -Wno-deprecated-declarations once removing implicit quantization +add_compile_options(-Wno-deprecated-declarations) + find_package(CUDA) find_package(CUDNN) find_package(TENSORRT) diff --git a/perception/autoware_tensorrt_classifier/include/autoware/tensorrt_classifier/tensorrt_classifier.hpp b/perception/autoware_tensorrt_classifier/include/autoware/tensorrt_classifier/tensorrt_classifier.hpp index ee16343b956d1..4f7a5865c72ae 100644 --- a/perception/autoware_tensorrt_classifier/include/autoware/tensorrt_classifier/tensorrt_classifier.hpp +++ b/perception/autoware_tensorrt_classifier/include/autoware/tensorrt_classifier/tensorrt_classifier.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -44,22 +45,18 @@ class TrtClassifier * @param[in] mode_path ONNX model_path * @param[in] precision precision for inference * @param[in] calibration_images path for calibration files (only require for quantization) - * @param[in] batch_config configuration for batched execution - * @param[in] max_workspace_size maximum workspace for building TensorRT 
engine * @param[in] mean mean for preprocess * @param[in] std std for preprocess - * @param[in] buildConfig configuration including precision, calibration method, dla, remaining - * fp16 for first layer, remaining fp16 for last layer and profiler for builder + * @param[in] calib_config calibration configuration * @param[in] cuda whether use cuda gpu for preprocessing */ TrtClassifier( const std::string & model_path, const std::string & precision, - const autoware::tensorrt_common::BatchConfig & batch_config = {1, 1, 1}, const std::vector & mean = {123.675, 116.28, 103.53}, const std::vector & std = {58.395, 57.12, 57.375}, - const size_t max_workspace_size = (1 << 30), const std::string & calibration_images = "", - const autoware::tensorrt_common::BuildConfig build_config = - autoware::tensorrt_common::BuildConfig("MinMax", -1, false, false, false, 0.0), + const std::string & calibration_images = "", + const tensorrt_common::CalibrationConfig & calibration_config = + tensorrt_common::CalibrationConfig("MinMax", false, false, 0.0), const bool cuda = false); /** * @brief Deconstruct TrtClassifier @@ -84,6 +81,12 @@ class TrtClassifier */ void initPreprocessBuffer(int width, int height); + /** + * @brief get batch size + * @return batch size + */ + [[nodiscard]] int getBatchSize() const; + private: /** * @brief run preprocess including resizing, letterbox, BGR2RGB, NHWC2NCHW and toFloat on CPU @@ -102,7 +105,7 @@ class TrtClassifier const std::vector & images, std::vector & results, std::vector & probabilities); - std::unique_ptr trt_common_; + std::unique_ptr trt_common_; std::vector input_h_; CudaUniquePtr input_d_; diff --git a/perception/autoware_tensorrt_classifier/src/tensorrt_classifier.cpp b/perception/autoware_tensorrt_classifier/src/tensorrt_classifier.cpp index 4aa008e42c966..7666ae4f9d068 100644 --- a/perception/autoware_tensorrt_classifier/src/tensorrt_classifier.cpp +++ b/perception/autoware_tensorrt_classifier/src/tensorrt_classifier.cpp @@ -100,11 +100,9 @@ std::vector loadImageList(const std::string & filename, const std:: namespace autoware::tensorrt_classifier { TrtClassifier::TrtClassifier( - const std::string & model_path, const std::string & precision, - const autoware::tensorrt_common::BatchConfig & batch_config, const std::vector & mean, - const std::vector & std, const size_t max_workspace_size, - const std::string & calibration_image_list_path, - autoware::tensorrt_common::BuildConfig build_config, const bool cuda) + const std::string & model_path, const std::string & precision, const std::vector & mean, + const std::vector & std, const std::string & calibration_image_list_path, + const tensorrt_common::CalibrationConfig & calib_config, const bool cuda) { src_width_ = -1; src_height_ = -1; @@ -114,22 +112,36 @@ TrtClassifier::TrtClassifier( for (size_t i = 0; i < inv_std_.size(); i++) { inv_std_[i] = 1.0 / inv_std_[i]; } - batch_size_ = batch_config[2]; + trt_common_ = std::make_unique( + tensorrt_common::TrtCommonConfig(model_path, precision)); + + const auto network_input_dims = trt_common_->getTensorShape(0); + batch_size_ = network_input_dims.d[0]; + const auto input_channel = network_input_dims.d[1]; + const auto input_height = network_input_dims.d[2]; + const auto input_width = network_input_dims.d[3]; + + std::vector profile_dims{ + autoware::tensorrt_common::ProfileDims( + 0, {4, {batch_size_, input_channel, input_height, input_width}}, + {4, {batch_size_, input_channel, input_height, input_width}}, + {4, {batch_size_, input_channel, input_height, 
input_width}})}; + auto profile_dims_ptr = + std::make_unique>(profile_dims); + if (precision == "int8") { - int max_batch_size = batch_config[2]; - nvinfer1::Dims input_dims = autoware::tensorrt_common::get_input_dims(model_path); std::vector calibration_images; if (calibration_image_list_path != "") { calibration_images = loadImageList(calibration_image_list_path, ""); } autoware::tensorrt_classifier::ImageStream stream( - max_batch_size, input_dims, calibration_images); + batch_size_, network_input_dims, calibration_images); fs::path calibration_table{model_path}; std::string ext = ""; - if (build_config.calib_type_str == "Entropy") { + if (calib_config.calib_type_str == "Entropy") { ext = "EntropyV2-"; } else if ( - build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + calib_config.calib_type_str == "Legacy" || calib_config.calib_type_str == "Percentile") { ext = "Legacy-"; } else { ext = "MinMax-"; @@ -141,11 +153,11 @@ TrtClassifier::TrtClassifier( histogram_table.replace_extension(ext); std::unique_ptr calibrator; - if (build_config.calib_type_str == "Entropy") { + if (calib_config.calib_type_str == "Entropy") { calibrator.reset(new autoware::tensorrt_classifier::Int8EntropyCalibrator( stream, calibration_table, mean_, std_)); } else if ( - build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + calib_config.calib_type_str == "Legacy" || calib_config.calib_type_str == "Percentile") { double quantile = 0.999999; double cutoff = 0.999999; calibrator.reset(new autoware::tensorrt_classifier::Int8LegacyCalibrator( @@ -154,29 +166,27 @@ TrtClassifier::TrtClassifier( calibrator.reset(new autoware::tensorrt_classifier::Int8MinMaxCalibrator( stream, calibration_table, mean_, std_)); } - trt_common_ = std::make_unique( - model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config); + if (!trt_common_->setupWithCalibrator( + std::move(calibrator), calib_config, std::move((profile_dims_ptr)))) { + throw std::runtime_error("Failed to setup TensorRT engine with calibrator"); + } } else { - trt_common_ = std::make_unique( - model_path, precision, nullptr, batch_config, max_workspace_size, build_config); - } - trt_common_->setup(); - - if (!trt_common_->isInitialized()) { - return; + if (!trt_common_->setup(std::move(profile_dims_ptr))) { + throw std::runtime_error("Failed to setup TensorRT engine"); + } } // GPU memory allocation - const auto input_dims = trt_common_->getBindingDimensions(0); + const auto input_dims = trt_common_->getTensorShape(0); const auto input_size = std::accumulate(input_dims.d + 1, input_dims.d + input_dims.nbDims, 1, std::multiplies()); - const auto output_dims = trt_common_->getBindingDimensions(1); - input_d_ = autoware::cuda_utils::make_unique(batch_config[2] * input_size); + const auto output_dims = trt_common_->getTensorShape(1); + input_d_ = autoware::cuda_utils::make_unique(batch_size_ * input_size); out_elem_num_ = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); - out_elem_num_ = out_elem_num_ * batch_config[2]; - out_elem_num_per_batch_ = static_cast(out_elem_num_ / batch_config[2]); + out_elem_num_ = out_elem_num_ * batch_size_; + out_elem_num_per_batch_ = static_cast(out_elem_num_ / batch_size_); out_prob_d_ = autoware::cuda_utils::make_unique(out_elem_num_); out_prob_h_ = autoware::cuda_utils::make_unique_host(out_elem_num_, cudaHostAllocPortable); @@ -224,7 +234,7 @@ void 
TrtClassifier::initPreprocessBuffer(int width, int height) src_width_ = width; src_height_ = height; if (m_cuda) { - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); bool const hasRuntimeDim = std::any_of( input_dims.d, input_dims.d + input_dims.nbDims, [](int32_t input_dim) { return input_dim == -1; }); @@ -232,7 +242,9 @@ void TrtClassifier::initPreprocessBuffer(int width, int height) input_dims.d[0] = batch_size_; } if (!h_img_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } } if (!h_img_) { CHECK_CUDA_ERROR(cudaMallocHost( @@ -245,10 +257,15 @@ void TrtClassifier::initPreprocessBuffer(int width, int height) } } +int TrtClassifier::getBatchSize() const +{ + return batch_size_; +} + void TrtClassifier::preprocessGpu(const std::vector & images) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; for (const auto & image : images) { @@ -272,7 +289,9 @@ void TrtClassifier::preprocessGpu(const std::vector & images) src_height_ = height; } if (!h_img_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); @@ -305,9 +324,11 @@ void TrtClassifier::preprocessGpu(const std::vector & images) void TrtClassifier::preprocess_opt(const std::vector & images) { int batch_size = static_cast(images.size()); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } const float input_chan = static_cast(input_dims.d[1]); const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); @@ -354,9 +375,6 @@ bool TrtClassifier::doInference( const std::vector & images, std::vector & results, std::vector & probabilities) { - if (!trt_common_->isInitialized()) { - return false; - } preprocess_opt(images); return feedforwardAndDecode(images, results, probabilities); @@ -369,7 +387,10 @@ bool TrtClassifier::feedforwardAndDecode( results.clear(); probabilities.clear(); std::vector buffers = {input_d_.get(), out_prob_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); int batch_size = static_cast(images.size()); diff --git a/perception/autoware_tensorrt_common/CMakeLists.txt b/perception/autoware_tensorrt_common/CMakeLists.txt index 3105523a509e3..d7d8026866124 100644 --- a/perception/autoware_tensorrt_common/CMakeLists.txt +++ b/perception/autoware_tensorrt_common/CMakeLists.txt @@ -16,9 +16,14 @@ if(NOT (CUDAToolkit_FOUND AND CUDNN_FOUND AND TENSORRT_FOUND)) return() endif() +if(TENSORRT_VERSION VERSION_LESS 8.5) + message(WARNING "Unsupported version TensorRT ${TENSORRT_VERSION} detected. 
This package requires TensorRT 8.5 or later.") + return() +endif() + add_library(${PROJECT_NAME} SHARED src/tensorrt_common.cpp - src/simple_profiler.cpp + src/profiler.cpp ) target_link_libraries(${PROJECT_NAME} @@ -42,7 +47,7 @@ set_target_properties(${PROJECT_NAME} CXX_EXTENSIONS NO ) -# TODO(autoware_tensorrt_common): Remove -Wno-deprecated-declarations once upgrading to TensorRT 8.5 is complete +# TODO(amadeuszsz): Remove -Wno-deprecated-declarations once removing implicit quantization target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra -Wpedantic -Werror -Wno-deprecated-declarations ) diff --git a/perception/autoware_tensorrt_common/README.md b/perception/autoware_tensorrt_common/README.md index c828be58c1c1c..54bfccdfcb561 100644 --- a/perception/autoware_tensorrt_common/README.md +++ b/perception/autoware_tensorrt_common/README.md @@ -1,6 +1,70 @@ # autoware_tensorrt_common -## Purpose +This package provides a high-level API to work with TensorRT. This library simplifies the process of loading, building, and executing TensorRT inference engines using ONNX models. It also includes utilities for profiling and managing TensorRT execution contexts, making it easier to integrate TensorRT-based packages in Autoware. -This package contains a library of common functions related to TensorRT. -This package may include functions for handling TensorRT engine and calibration algorithm used for quantization +## Usage + +Here is an example usage of the library. For the full API documentation, please refer to the doxygen documentation (see header [file](include/autoware/tensorrt_common/tensorrt_common.hpp)). + +```c++ +#include + +#include +#include +#include + +using autoware::tensorrt_common::TrtCommon; +using autoware::tensorrt_common::TrtCommonConfig; +using autoware::tensorrt_common::TensorInfo; +using autoware::tensorrt_common::NetworkIO; +using autoware::tensorrt_common::ProfileDims; + +std::unique_ptr trt_common_; +``` + +### Create a tensorrt common instance and setup engine + +- With minimal configuration. + +```c++ +trt_common_ = std::make_unique(TrtCommonConfig("/path/to/onnx/model.onnx")); +trt_common_->setup(); +``` + +- With full configuration. + +```c++ +trt_common_ = std::make_unique(TrtCommonConfig("/path/to/onnx/model.onnx", "fp16", "/path/to/engine/model.engine", (1ULL << 30U), -1, false)); + +std::vector network_io{ + NetworkIO("sample_input", {3, {-1, 64, 512}}), NetworkIO("sample_output", {1, {50}})}; +std::vector profile_dims{ + ProfileDims("sample_input", {3, {1, 64, 512}}, {3, {3, 64, 512}}, {3, {9, 64, 512}})}; + +auto network_io_ptr = std::make_unique>(network_io); +auto profile_dims_ptr = std::make_unique>(profile_dims); + +trt_common_->setup(std::move(profile_dims_ptr), std::move(network_io_ptr)); +``` + +By defining network IO names and dimensions, an extra shapes validation will be performed after building / loading engine. This is useful to ensure the model is compatible with current code for preprocessing as it might consists of operations dependent on tensor shapes. + +Profile dimension is used to specify the min, opt, and max dimensions for dynamic shapes. + +Network IO or / and profile dimensions can be omitted if not needed. 
+ +### Setting input and output tensors + +```c++ +bool success = true; +success &= trt_common_->setTensor("sample_input", sample_input_d_.get(), nvinfer1::Dims{3, {var_size, 64, 512}}); +success &= trt_common_->setTensor("sample_output", sample_output_d_.get()); +return success; +``` + +### Execute inference + +```c++ +auto success = trt_common_->enqueueV3(stream_); +return success; +``` diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp new file mode 100644 index 0000000000000..0b58a797d8b9b --- /dev/null +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp @@ -0,0 +1,108 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_COMMON__CONV_PROFILER_HPP_ +#define AUTOWARE__TENSORRT_COMMON__CONV_PROFILER_HPP_ + +#include + +#include + +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_common +{ +/** + * @struct ConvLayerInfo + * @brief Information of a layer. + */ +struct ConvLayerInfo +{ + //! @brief Input channel. + int in_c; + //! @brief Output channel. + int out_c; + //! @brief Width. + int w; + //! @brief Height. + int h; + //! @brief Kernel size. + int k; + //! @brief Stride. + int stride; + //! @brief Number of groups. + int groups; + //! @brief Layer type. + nvinfer1::LayerType type; +}; + +/** + * @class ConvProfiler + * @brief Collect per-layer profile information, assuming times are reported in the same order. + */ +class ConvProfiler : public tensorrt_common::Profiler +{ +public: + /** + * @brief Construct Profiler for convolutional layers. + * + * @param[in] src_profilers Source profilers to merge. + */ + explicit ConvProfiler( + const std::vector & src_profilers = std::vector()) + : Profiler(src_profilers) + { + } + + /** + * @brief Set per-layer profile information for model. + * + * @param[in] layer Layer to set profile information. + */ + void setProfDict(nvinfer1::ILayer * const layer) noexcept final + { + if (const auto type = layer->getType(); type == nvinfer1::LayerType::kCONVOLUTION) { + const auto name = layer->getName(); + auto conv = dynamic_cast(layer); + + nvinfer1::ITensor * in = layer->getInput(0); + nvinfer1::Dims in_dim = in->getDimensions(); + + nvinfer1::ITensor * out = layer->getOutput(0); + nvinfer1::Dims out_dim = out->getDimensions(); + + nvinfer1::Dims k_dims = conv->getKernelSizeNd(); + nvinfer1::Dims s_dims = conv->getStrideNd(); + + int32_t kernel = k_dims.d[0]; + int32_t stride = s_dims.d[0]; + int32_t groups = conv->getNbGroups(); + + layer_dict_.insert_or_assign( + name, ConvLayerInfo{ + in_dim.d[1], out_dim.d[1], in_dim.d[3], in_dim.d[2], kernel, stride, groups, type}); + } + } + +private: + //! @brief Per-layer information. 
+ std::map layer_dict_; +}; +} // namespace tensorrt_common +} // namespace autoware +#endif // AUTOWARE__TENSORRT_COMMON__CONV_PROFILER_HPP_ diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/logger.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/logger.hpp index 5aa897983af72..b9b9048cd8204 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/logger.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/logger.hpp @@ -17,18 +17,19 @@ #ifndef AUTOWARE__TENSORRT_COMMON__LOGGER_HPP_ #define AUTOWARE__TENSORRT_COMMON__LOGGER_HPP_ -#include "NvInferRuntimeCommon.h" +#include #include #include +#include #include #include -#include #include #include #include #include #include +#include namespace autoware { @@ -46,7 +47,7 @@ class LogStreamConsumerBuffer : public std::stringbuf LogStreamConsumerBuffer(LogStreamConsumerBuffer && other) : mOutput(other.mOutput) {} - ~LogStreamConsumerBuffer() + ~LogStreamConsumerBuffer() override { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output // sequence std::streambuf::pptr() gives a pointer to the current position of the output @@ -60,7 +61,7 @@ class LogStreamConsumerBuffer : public std::stringbuf // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream - virtual int sync() + int sync() override { putOutput(); return 0; @@ -252,6 +253,37 @@ class Logger : public nvinfer1::ILogger // NOLINT } } + void log(Severity severity, const char * msg, ...) const noexcept + { + if (mVerbose) { + va_list args; + va_start(args, msg); + + // Buffer size + va_list args_copy; + va_copy(args_copy, args); + int required_size = std::vsnprintf(nullptr, 0, msg, args_copy); + va_end(args_copy); + + // Formatting error + if (required_size < 0) { + log(Severity::kINTERNAL_ERROR, "Error formatting log message"); + va_end(args); + return; + } + + // Format msg + std::vector buffer(required_size + 1); + std::vsnprintf(buffer.data(), buffer.size(), msg, args); + + va_end(args); // End variadic argument processing + + // Send the formatted message to LogStreamConsumer + LogStreamConsumer(mReportableSeverity, severity) + << "[TRT] " << std::string(buffer.data()) << std::endl; + } + } + /** * @brief Logging with throttle. * diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp new file mode 100644 index 0000000000000..d45351f97fd01 --- /dev/null +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp @@ -0,0 +1,96 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef AUTOWARE__TENSORRT_COMMON__PROFILER_HPP_ +#define AUTOWARE__TENSORRT_COMMON__PROFILER_HPP_ + +#include + +#include +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_common +{ + +/** + * @class Profiler + * @brief Collect per-layer profile information, assuming times are reported in the same order. + */ +class Profiler : public nvinfer1::IProfiler +{ +public: + /** + * @struct Record + * @brief Record of layer profile information. + */ + struct Record + { + float time{0}; + int count{0}; + float min_time{-1.0}; + int index; + }; + + /** + * @brief Construct Profiler. + * + * @param[in] src_profilers Source profilers to merge. + */ + explicit Profiler(const std::vector & src_profilers = std::vector()); + + /** + * @brief Report layer time. + * + * @param[in] layerName Layer name. + * @param[in] ms Time in milliseconds. + */ + void reportLayerTime(const char * layerName, float ms) noexcept final; + + /** + * @brief Get printable representation of Profiler. + * + * @return String representation for current state of Profiler. + */ + [[nodiscard]] std::string toString() const; + + /** + * @brief Set per-layer profile information for model. + * + * @param[in] layer Layer to set profile information. + */ + virtual void setProfDict([[maybe_unused]] nvinfer1::ILayer * layer) noexcept {}; + + /** + * @brief Output Profiler to ostream. + * + * @param[out] out Output stream. + * @param[in] value Profiler to output. + * @return Output stream. + */ + friend std::ostream & operator<<(std::ostream & out, const Profiler & value); + +private: + //!< @brief Profile information for layers. + std::map profile_; + + //!< @brief Index for layer. + int index_; +}; +} // namespace tensorrt_common +} // namespace autoware +#endif // AUTOWARE__TENSORRT_COMMON__PROFILER_HPP_ diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/simple_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/simple_profiler.hpp deleted file mode 100644 index a8d504fb2861a..0000000000000 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/simple_profiler.hpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef AUTOWARE__TENSORRT_COMMON__SIMPLE_PROFILER_HPP_ -#define AUTOWARE__TENSORRT_COMMON__SIMPLE_PROFILER_HPP_ - -#include - -#include -#include -#include -#include - -namespace autoware -{ -namespace tensorrt_common -{ -struct LayerInfo -{ - int in_c; - int out_c; - int w; - int h; - int k; - int stride; - int groups; - nvinfer1::LayerType type; -}; - -/** - * @class Profiler - * @brief Collect per-layer profile information, assuming times are reported in the same order - */ -class SimpleProfiler : public nvinfer1::IProfiler -{ -public: - struct Record - { - float time{0}; - int count{0}; - float min_time{-1.0}; - int index; - }; - SimpleProfiler( - std::string name, - const std::vector & src_profilers = std::vector()); - - void reportLayerTime(const char * layerName, float ms) noexcept override; - - void setProfDict(nvinfer1::ILayer * layer) noexcept; - - friend std::ostream & operator<<(std::ostream & out, const SimpleProfiler & value); - -private: - std::string m_name; - std::map m_profile; - int m_index; - std::map m_layer_dict; -}; -} // namespace tensorrt_common -} // namespace autoware -#endif // AUTOWARE__TENSORRT_COMMON__SIMPLE_PROFILER_HPP_ diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp index 2b3b3f02f315f..9355732962e23 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp @@ -15,226 +15,367 @@ #ifndef AUTOWARE__TENSORRT_COMMON__TENSORRT_COMMON_HPP_ #define AUTOWARE__TENSORRT_COMMON__TENSORRT_COMMON_HPP_ -#ifndef YOLOX_STANDALONE -#include - -#include -#endif +#include "autoware/tensorrt_common/logger.hpp" +#include "autoware/tensorrt_common/profiler.hpp" +#include "autoware/tensorrt_common/utils.hpp" #include #include -#if (defined(_MSC_VER) or (defined(__GNUC__) and (7 <= __GNUC_MAJOR__))) -#include -namespace fs = ::std::filesystem; -#else -#include -namespace fs = ::std::experimental::filesystem; -#endif - -#include -#include - #include -#include #include +#include +#include #include namespace autoware { namespace tensorrt_common { -/** - * @struct BuildConfig - * @brief Configuration to provide fine control regarding TensorRT builder - */ -struct BuildConfig -{ - // type for calibration - std::string calib_type_str; - - // DLA core ID that the process uses - int dla_core_id; - - // flag for partial quantization in first layer - bool quantize_first_layer; // For partial quantization - - // flag for partial quantization in last layer - bool quantize_last_layer; // For partial quantization - - // flag for per-layer profiler using IProfiler - bool profile_per_layer; - - // clip value for implicit quantization - double clip_value; // For implicit quantization - - // Supported calibration type - const std::array valid_calib_type = {"Entropy", "Legacy", "Percentile", "MinMax"}; - - BuildConfig() - : calib_type_str("MinMax"), - dla_core_id(-1), - quantize_first_layer(false), - quantize_last_layer(false), - profile_per_layer(false), - clip_value(0.0) - { - } - - explicit BuildConfig( - const std::string & calib_type_str, const int dla_core_id = -1, - const bool quantize_first_layer = false, const bool quantize_last_layer = false, - const bool profile_per_layer = false, const double clip_value = 0.0) - : calib_type_str(calib_type_str), - dla_core_id(dla_core_id), - 
quantize_first_layer(quantize_first_layer), - quantize_last_layer(quantize_last_layer), - profile_per_layer(profile_per_layer), - clip_value(clip_value) - { -#ifndef YOLOX_STANDALONE - if ( - std::find(valid_calib_type.begin(), valid_calib_type.end(), calib_type_str) == - valid_calib_type.end()) { - std::stringstream message; - message << "Invalid calibration type was specified: " << calib_type_str << std::endl - << "Valid value is one of: [Entropy, (Legacy | Percentile), MinMax]" << std::endl - << "Default calibration type will be used: MinMax" << std::endl; - std::cerr << message.str(); - } -#endif - } -}; - -nvinfer1::Dims get_input_dims(const std::string & onnx_file_path); - -const std::array valid_precisions = {"fp32", "fp16", "int8"}; -bool is_valid_precision_string(const std::string & precision); - template struct InferDeleter // NOLINT { - void operator()(T * obj) const - { - if (obj) { -#if TENSORRT_VERSION_MAJOR >= 8 - delete obj; -#else - obj->destroy(); -#endif - } - } + void operator()(T * obj) const { delete obj; } }; template using TrtUniquePtr = std::unique_ptr>; -using BatchConfig = std::array; +using NetworkIOPtr = std::unique_ptr>; +using ProfileDimsPtr = std::unique_ptr>; +using TensorsVec = std::vector>; +using TensorsMap = std::unordered_map>; /** * @class TrtCommon - * @brief TensorRT common library + * @brief TensorRT common library. */ class TrtCommon // NOLINT { public: /** * @brief Construct TrtCommon. - * @param[in] mode_path ONNX model_path - * @param[in] precision precision for inference - * @param[in] calibrator pointer for any type of INT8 calibrator - * @param[in] batch_config configuration for batched execution - * @param[in] max_workspace_size maximum workspace for building TensorRT engine - * @param[in] buildConfig configuration including precision, calibration method, dla, remaining - * fp16 for first layer, remaining fp16 for last layer and profiler for builder - * @param[in] plugin_paths path for custom plugin + * + * @param[in] trt_config Base configuration with ONNX model path as minimum required. + * parameter. + * @param[in] profiler Per-layer profiler. + * @param[in] plugin_paths Paths for TensorRT plugins. */ TrtCommon( - const std::string & model_path, const std::string & precision, - std::unique_ptr calibrator = nullptr, - const BatchConfig & batch_config = {1, 1, 1}, const size_t max_workspace_size = (16 << 20), - const BuildConfig & buildConfig = BuildConfig(), + const TrtCommonConfig & trt_config, + const std::shared_ptr & profiler = std::make_shared(), const std::vector & plugin_paths = {}); - /** * @brief Deconstruct TrtCommon */ ~TrtCommon(); /** - * @brief Load TensorRT engine - * @param[in] engine_file_path path for a engine file - * @return flag for whether loading are succeeded or failed + * @brief Setup for TensorRT execution including building and loading engine. + * + * @param[in] profile_dims Optimization profile of tensors for dynamic shapes. + * @param[in] network_io Network input/output tensors information. + * @return Whether setup is successful. + */ + [[nodiscard]] virtual bool setup( + ProfileDimsPtr profile_dims = nullptr, NetworkIOPtr network_io = nullptr); + + /** + * @brief Get TensorRT engine precision. + * + * @return string representation of TensorRT engine precision. + */ + [[nodiscard]] std::string getPrecision() const; + + /** + * @brief Get tensor name by index from TensorRT engine with fallback from TensorRT network. + * + * @param[in] index Tensor index. + * @return Tensor name. 
+ */ + [[nodiscard]] const char * getIOTensorName(const int32_t index) const; + + /** + * @brief Get number of IO tensors from TensorRT engine with fallback from TensorRT network. + * + * @return Number of IO tensors. + */ + [[nodiscard]] int32_t getNbIOTensors() const; + + /** + * @brief Get tensor shape by index from TensorRT engine with fallback from TensorRT network. + * + * @param[in] index Tensor index. + * @return Tensor shape. + */ + [[nodiscard]] nvinfer1::Dims getTensorShape(const int32_t index) const; + + /** + * @brief Get tensor shape by name from TensorRT engine. + * + * @param[in] tensor_name Tensor name. + * @return Tensor shape. + */ + [[nodiscard]] nvinfer1::Dims getTensorShape(const char * tensor_name) const; + + /** + * @brief Get input tensor shape by index from TensorRT network. + * + * @param[in] index Tensor index. + * @return Tensor shape. + */ + [[nodiscard]] nvinfer1::Dims getInputDims(const int32_t index) const; + + /** + * @brief Get output tensor shape by index from TensorRT network. + * + * @param[in] index Tensor index. + * @return Tensor shape. + */ + [[nodiscard]] nvinfer1::Dims getOutputDims(const int32_t index) const; + + /** + * @brief Set tensor address by index via TensorRT context. + * + * @param[in] index Tensor index. + * @param[in] data Tensor pointer. + * @return Whether setting tensor address is successful. + */ + bool setTensorAddress(const int32_t index, void * data); + + /** + * @brief Set tensor address by name via TensorRT context. + * + * @param[in] tensor_name Tensor name. + * @param[in] data Tensor pointer. + * @return Whether setting tensor address is successful. + */ + bool setTensorAddress(const char * tensor_name, void * data); + + /** + * @brief Set tensors addresses by indices via TensorRT context. + * + * @param[in] tensors Tensors pointers. + * @return Whether setting tensors addresses is successful. + */ + bool setTensorsAddresses(std::vector & tensors); + + /** + * @brief Set tensors addresses by names via TensorRT context. + * + * @param[in] tensors Tensors pointers. + * @return Whether setting tensors addresses is successful. + */ + bool setTensorsAddresses(std::unordered_map & tensors); + + /** + * @brief Set input shape by index via TensorRT context. + * + * @param[in] index Tensor index. + * @param[in] dimensions Tensor dimensions. + * @return Whether setting input shape is successful. + */ + bool setInputShape(const int32_t index, const nvinfer1::Dims & dimensions); + + /** + * @brief Set input shape by name via TensorRT context. + * + * @param[in] tensor_name Tensor name. + * @param[in] dimensions Tensor dimensions. + * @return Whether setting input shape is successful. + */ + bool setInputShape(const char * tensor_name, const nvinfer1::Dims & dimensions); + + /** + * @brief Set inputs shapes by indices via TensorRT context. + * + * @param[in] dimensions Vector of tensor dimensions with corresponding indices. + * @return Whether setting input shapes is successful. + */ + bool setInputsShapes(const std::vector & dimensions); + + /** + * @brief Set inputs shapes by names via TensorRT context. + * + * @param[in] dimensions Map of tensor dimensions with corresponding names as keys. + * @return Whether setting input shapes is successful. + */ + bool setInputsShapes(const std::unordered_map & dimensions); + + /** + * @brief Set tensor (address and shape) by index via TensorRT context. + * + * @param[in] index Tensor index. + * @param[in] data Tensor pointer. + * @param[in] dimensions Tensor dimensions. 
+ * @return Whether setting tensor is successful. + */ + bool setTensor(const int32_t index, void * data, nvinfer1::Dims dimensions = {}); + + /** + * @brief Set tensor (address and shape) by name via TensorRT context. + * + * @param[in] tensor_name Tensor name. + * @param[in] data Tensor pointer. + * @param[in] dimensions Tensor dimensions. + * @return Whether setting tensor is successful. + */ + bool setTensor(const char * tensor_name, void * data, nvinfer1::Dims dimensions = {}); + + /** + * @brief Set tensors (addresses and shapes) by indices via TensorRT context. + * + * @param[in] tensors Vector of tensor pointers and dimensions with corresponding indices. + * @return Whether setting tensors is successful. */ - bool loadEngine(const std::string & engine_file_path); + bool setTensors(TensorsVec & tensors); /** - * @brief Output layer information including GFLOPs and parameters - * @param[in] onnx_file_path path for a onnx file - * @warning This function is based on darknet log. + * @brief Set tensors (addresses and shapes) by names via TensorRT context. + * + * @param[in] tensors Map of tensor pointers and dimensions with corresponding names as keys. + * @return Whether setting tensors is successful. */ - void printNetworkInfo(const std::string & onnx_file_path); + bool setTensors(TensorsMap & tensors); /** - * @brief build TensorRT engine from ONNX - * @param[in] onnx_file_path path for a onnx file - * @param[in] output_engine_file_path path for a engine file + * @brief Get per-layer profiler for model. + * + * @return Per-layer profiler. */ - bool buildEngineFromOnnx( - const std::string & onnx_file_path, const std::string & output_engine_file_path); + [[nodiscard]] std::shared_ptr getModelProfiler(); /** - * @brief setup for TensorRT execution including building and loading engine + * @brief Get per-layer profiler for host. + * + * @return Per-layer profiler. */ - void setup(); + [[nodiscard]] std::shared_ptr getHostProfiler(); -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8500 - void setupBindings(std::vector & bindings); -#endif + /** + * @brief Get TensorRT common configuration. + * + * @return TensorRT common configuration. + */ + [[nodiscard]] std::shared_ptr getTrtCommonConfig(); - bool isInitialized(); + /** + * @brief Get TensorRT builder configuration. + * + * @return TensorRT builder configuration. + */ + [[nodiscard]] std::shared_ptr getBuilderConfig(); + + /** + * @brief Get TensorRT network definition. + * + * @return TensorRT network definition. + */ + [[nodiscard]] std::shared_ptr getNetwork(); - nvinfer1::Dims getBindingDimensions(const int32_t index) const; - int32_t getNbBindings(); - bool setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const; -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8500 + /** + * @brief Get TensorRT logger. + * + * @return TensorRT logger. + */ + [[nodiscard]] std::shared_ptr getLogger(); + + /** + * @brief Execute inference via TensorRT context. + * + * @param[in] stream CUDA stream. + * @return Whether inference is successful. + */ bool enqueueV3(cudaStream_t stream); -#endif -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH < 10000 - bool enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed); -#endif /** - * @brief output per-layer information + * @brief Print per-layer information. + */ + void printProfiling() const; + +private: + /** + * @brief Initialize TensorRT common. 
+ * + * @return Whether initialization is successful. */ - void printProfiling(void); + [[nodiscard]] bool initialize(); -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 /** - * @brief get per-layer information for trt-engine-profiler + * @brief Get per-layer information for trt-engine-profiler. + * + * @param[in] format Format for layer information. + * @return Layer information. */ std::string getLayerInformation(nvinfer1::LayerInformationFormat format); -#endif -private: - Logger logger_; - fs::path model_file_path_; + /** + + * @brief Build TensorRT engine from ONNX. + * + * @return Whether building engine is successful. + */ + bool buildEngineFromOnnx(); + + /** + * @brief Load TensorRT engine. + * + * @return Whether loading engine is successful. + */ + bool loadEngine(); + + /** + * @brief Validate network input/output names and dimensions. + * + * @return Whether network input/output is valid. + */ + bool validateNetworkIO(); + + /** + * @brief Validate optimization profile. + * + * @return Whether optimization profile is valid. + */ + bool validateProfileDims(); + + //! @brief TensorRT runtime. TrtUniquePtr runtime_; + + //! @brief TensorRT engine. TrtUniquePtr engine_; + + //! @brief TensorRT execution context. TrtUniquePtr context_; - std::unique_ptr calibrator_; - std::string precision_; - BatchConfig batch_config_; - size_t max_workspace_size_; - bool is_initialized_{false}; + //! @brief TensorRT builder. + TrtUniquePtr builder_; + + //! @brief TensorRT engine parser. + TrtUniquePtr parser_; + + //! @brief TensorRT builder configuration. + std::shared_ptr builder_config_; + + //! @brief TensorRT network definition. + std::shared_ptr network_; + + //! @brief TrtCommon library base configuration. + std::shared_ptr trt_config_; + + //! @brief TensorRT logger. + std::shared_ptr logger_; + + //! @brief Per-layer profiler for host. + std::shared_ptr host_profiler_; + + //! @brief Per-layer profiler for model. + std::shared_ptr model_profiler_; - // profiler for per-layer - SimpleProfiler model_profiler_; - // profiler for whole model - SimpleProfiler host_profiler_; + //! @brief Model optimization profile. + ProfileDimsPtr profile_dims_; - std::unique_ptr build_config_; + //! @brief Model network input/output tensors information. + NetworkIOPtr network_io_; }; } // namespace tensorrt_common diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp new file mode 100644 index 0000000000000..44a944031086e --- /dev/null +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp @@ -0,0 +1,241 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef AUTOWARE__TENSORRT_COMMON__TENSORRT_CONV_CALIB_HPP_ +#define AUTOWARE__TENSORRT_COMMON__TENSORRT_CONV_CALIB_HPP_ + +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_common +{ +template +bool contain(const std::string & s, const T & v) +{ + return s.find(v) != std::string::npos; +} + +using autoware::tensorrt_common::CalibrationConfig; +using autoware::tensorrt_common::NetworkIOPtr; +using autoware::tensorrt_common::ProfileDimsPtr; +using autoware::tensorrt_common::Profiler; +using autoware::tensorrt_common::TrtCommonConfig; + +/** + * @class TrtConvCalib + * @brief TensorRT common library with calibration. + */ +class TrtConvCalib : public tensorrt_common::TrtCommon +{ +public: + /** + * @brief Construct TrtCommonCalib. + * + * @param[in] trt_config base trt common configuration, ONNX model path is mandatory + * @param[in] profiler per-layer profiler + * @param[in] plugin_paths paths for TensorRT plugins + */ + explicit TrtConvCalib( + const TrtCommonConfig & trt_config, + const std::shared_ptr & profiler = std::make_shared(), + const std::vector & plugin_paths = {}) + : TrtCommon(trt_config, profiler, plugin_paths) + { + } + + /** + * @brief Setup for TensorRT execution including calibration, building and loading engine. + * + * @param[in] calibrator Pointer for any type of INT8 calibrator. + * @param[in] calib_config Calibration configuration. + * @param[in] profile_dims Optimization profile of input tensors for dynamic shapes. + * @param[in] network_io Network input/output tensors information. + * @return Whether setup is successful. + */ + [[nodiscard]] bool setupWithCalibrator( + std::unique_ptr calibrator, const CalibrationConfig & calib_config, + ProfileDimsPtr profile_dims = nullptr, NetworkIOPtr network_io = nullptr) + { + calibrator_ = std::move(calibrator); + calib_config_ = std::make_unique(calib_config); + + auto builder_config = getBuilderConfig(); + builder_config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + builder_config->setInt8Calibrator(calibrator_.get()); + + // Model specific quantization + auto logger = getLogger(); + auto quantization_log = quantization(); + logger->log(nvinfer1::ILogger::Severity::kINFO, quantization_log.c_str()); + + return setup(std::move(profile_dims), std::move(network_io)); + } + +private: + /** + * @brief Implicit quantization for TensorRT. + * + * @return Output log for TensorRT logger. 
+   */
+  std::string quantization()
+  {
+    auto network = getNetwork();
+    auto trt_config = getTrtCommonConfig();
+    auto model_profiler = getModelProfiler();
+
+    const int num = network->getNbLayers();
+    bool first = calib_config_->quantize_first_layer;
+    bool last = calib_config_->quantize_last_layer;
+    std::stringstream ss;
+
+    // Partial Quantization
+    if (getPrecision() == "int8") {
+      auto builder_config = getBuilderConfig();
+      builder_config->setFlag(nvinfer1::BuilderFlag::kFP16);
+      network->getInput(0)->setDynamicRange(0, 255.0);
+      for (int i = 0; i < num; i++) {
+        nvinfer1::ILayer * layer = network->getLayer(i);
+        auto layer_type = layer->getType();
+        std::string name = layer->getName();
+        nvinfer1::ITensor * out = layer->getOutput(0);
+        if (calib_config_->clip_value > 0.0) {
+          ss << "Set max value for outputs: " << calib_config_->clip_value << " " << name
+             << std::endl;
+          out->setDynamicRange(0.0, calib_config_->clip_value);
+        }
+
+        if (layer_type == nvinfer1::LayerType::kCONVOLUTION) {
+          if (first) {
+            layer->setPrecision(nvinfer1::DataType::kHALF);
+            ss << "Set kHALF in " << name << std::endl;
+            first = false;
+          }
+          if (last) {
+            // cspell: ignore preds
+            if (
+              contain(name, "reg_preds") || contain(name, "cls_preds") ||
+              contain(name, "obj_preds")) {
+              layer->setPrecision(nvinfer1::DataType::kHALF);
+              ss << "Set kHALF in " << name << std::endl;
+            }
+            for (int j = num - 1; j >= 0; j--) {
+              nvinfer1::ILayer * inner_layer = network->getLayer(j);
+              auto inner_layer_type = inner_layer->getType();
+              std::string inner_name = inner_layer->getName();
+              if (inner_layer_type == nvinfer1::LayerType::kCONVOLUTION) {
+                inner_layer->setPrecision(nvinfer1::DataType::kHALF);
+                ss << "Set kHALF in " << inner_name << std::endl;
+                break;
+              }
+              if (inner_layer_type == nvinfer1::LayerType::kMATRIX_MULTIPLY) {
+                inner_layer->setPrecision(nvinfer1::DataType::kHALF);
+                ss << "Set kHALF in " << inner_name << std::endl;
+                break;
+              }
+            }
+          }
+        }
+      }
+      builder_config->setFlag(nvinfer1::BuilderFlag::kINT8);
+    }
+
+    // Print layer information
+    float total_gflops = 0.0;
+    int total_params = 0;
+
+    for (int i = 0; i < num; i++) {
+      nvinfer1::ILayer * layer = network->getLayer(i);
+      auto layer_type = layer->getType();
+      if (trt_config->profile_per_layer) {
+        model_profiler->setProfDict(layer);
+      }
+      if (layer_type == nvinfer1::LayerType::kCONSTANT) {
+        continue;
+      }
+      nvinfer1::ITensor * in = layer->getInput(0);
+      nvinfer1::Dims dim_in = in->getDimensions();
+      nvinfer1::ITensor * out = layer->getOutput(0);
+      nvinfer1::Dims dim_out = out->getDimensions();
+
+      if (layer_type == nvinfer1::LayerType::kCONVOLUTION) {
+        auto conv = dynamic_cast<nvinfer1::IConvolutionLayer *>(layer);
+        nvinfer1::Dims k_dims = conv->getKernelSizeNd();
+        nvinfer1::Dims s_dims = conv->getStrideNd();
+        int groups = conv->getNbGroups();
+        int stride = s_dims.d[0];
+        int num_weights = (dim_in.d[1] / groups) * dim_out.d[1] * k_dims.d[0] * k_dims.d[1];
+        float gflops = (2.0 * num_weights) * (static_cast<float>(dim_in.d[3]) / stride *
+                                              static_cast<float>(dim_in.d[2]) / stride / 1e9);
+        total_gflops += gflops;
+        total_params += num_weights;
+        ss << "L" << i << " [conv " << k_dims.d[0] << "x" << k_dims.d[1] << " (" << groups << ") "
+           << "/" << s_dims.d[0] << "] " << dim_in.d[3] << "x" << dim_in.d[2] << "x" << dim_in.d[1]
+           << " -> " << dim_out.d[3] << "x" << dim_out.d[2] << "x" << dim_out.d[1];
+        ss << " weights:" << num_weights;
+        ss << " GFLOPs:" << gflops;
+        ss << std::endl;
+      } else if (layer_type == nvinfer1::LayerType::kPOOLING) {
+        auto pool = dynamic_cast<nvinfer1::IPoolingLayer *>(layer);
+        auto p_type =
pool->getPoolingType(); + nvinfer1::Dims dim_stride = pool->getStrideNd(); + nvinfer1::Dims dim_window = pool->getWindowSizeNd(); + + ss << "L" << i << " ["; + if (p_type == nvinfer1::PoolingType::kMAX) { + ss << "max "; + } else if (p_type == nvinfer1::PoolingType::kAVERAGE) { + ss << "avg "; + } else if (p_type == nvinfer1::PoolingType::kMAX_AVERAGE_BLEND) { + ss << "max avg blend "; + } + float gflops = static_cast(dim_in.d[1]) * + (static_cast(dim_window.d[0]) / static_cast(dim_stride.d[0])) * + (static_cast(dim_window.d[1]) / static_cast(dim_stride.d[1])) * + static_cast(dim_in.d[2]) * static_cast(dim_in.d[3]) / 1e9; + total_gflops += gflops; + ss << "pool " << dim_window.d[0] << "x" << dim_window.d[1] << "]"; + ss << " GFLOPs:" << gflops; + ss << std::endl; + } else if (layer_type == nvinfer1::LayerType::kRESIZE) { + ss << "L" << i << " [resize]" << std::endl; + } + } + ss << "Total " << total_gflops << " GFLOPs" << std::endl; + ss << "Total " << total_params / 1000.0 / 1000.0 << " M params" << std::endl; + + return ss.str(); + }; + +private: + //!< Calibration configuration + std::unique_ptr calib_config_; + + //!< Calibrator used for implicit quantization + std::unique_ptr calibrator_; +}; + +} // namespace tensorrt_common +} // namespace autoware + +#endif // AUTOWARE__TENSORRT_COMMON__TENSORRT_CONV_CALIB_HPP_ diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp new file mode 100644 index 0000000000000..be640f52147d9 --- /dev/null +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp @@ -0,0 +1,408 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_COMMON__UTILS_HPP_ +#define AUTOWARE__TENSORRT_COMMON__UTILS_HPP_ + +#include +#include + +#if (defined(_MSC_VER) or (defined(__GNUC__) and (7 <= __GNUC_MAJOR__))) +#include +namespace fs = ::std::filesystem; +#else +#include +namespace fs = ::std::experimental::filesystem; +#endif + +#include +#include +#include +#include +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_common +{ +constexpr std::array valid_precisions = {"fp32", "fp16", "int8"}; + +/** + * @struct TrtCommonConfig + * @brief Configuration to provide fine control regarding TensorRT builder + */ +struct TrtCommonConfig +{ + /** @brief Construct TrtCommonConfig, ONNX model path is mandatory. 
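+   *
+   * Example usage (illustrative sketch; the ONNX path below is a placeholder, and when no
+   * engine path is given it defaults to the ONNX path with an ".engine" extension):
+   * @code
+   * TrtCommonConfig trt_config("model.onnx", "fp16");
+   * @endcode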
+ * + * @param[in] onnx_path ONNX model path + * @param[in] precision precision for inference + * @param[in] engine_path TensorRT engine path + * @param[in] max_workspace_size maximum workspace size for TensorRT + * @param[in] dla_core_id DLA core ID + * @param[in] profile_per_layer flag for per-layer profiler using IProfiler + */ + explicit TrtCommonConfig( + const std::string onnx_path, const std::string precision = "fp16", + const std::string engine_path = "", const size_t max_workspace_size = (1ULL << 30U), + const int32_t dla_core_id = -1, const bool profile_per_layer = false) + : onnx_path(onnx_path), + precision(precision), + engine_path(engine_path), + max_workspace_size(max_workspace_size), + dla_core_id(dla_core_id), + profile_per_layer(profile_per_layer) + { + validatePrecision(); + + if (engine_path.empty()) { + this->engine_path = onnx_path; + this->engine_path.replace_extension(".engine"); + } + } + + /** + * @brief Validate configured TensorRT engine precision. + */ + void validatePrecision() const + { + if ( + std::find(valid_precisions.begin(), valid_precisions.end(), precision) == + valid_precisions.end()) { + std::stringstream message; + message << "Invalid precision was specified: " << precision << std::endl + << "Valid string is one of: [" << valid_precisions[0]; + for (size_t i = 1; i < valid_precisions.size(); ++i) { + message << ", " << valid_precisions[i]; + } + message << "] (case sensitive)" << std::endl; + + throw std::runtime_error(message.str()); + } + } + + //!< @brief ONNX model path. + const fs::path onnx_path; + + //!< @brief Precision for inference. + const std::string precision; + + //!< @brief TensorRT engine path. + fs::path engine_path; + + //!< @brief TensorRT max workspace size. + const size_t max_workspace_size; + + //!< @brief DLA core ID that the process uses. + const int32_t dla_core_id; + + //!< @brief Flag for per-layer profiler using IProfiler. + const bool profile_per_layer; +}; + +/** + * @brief Represents a tensor with its name or index. + */ +struct TensorInfo +{ + /** + * @brief Construct TensorInfo with tensor name. + * + * @param[in] name Tensor name. + */ + explicit TensorInfo(std::string name) : tensor_name(std::move(name)), tensor_index(-1) {} + + /** + * @brief Construct TensorInfo with tensor index. + * + * @param[in] index Tensor index. + */ + explicit TensorInfo(int32_t index) : tensor_index(index) {} + + /** + * @brief Check if dimensions are equal. + * + * @param lhs Left-hand side tensor dimensions. + * @param rhs Right-hand side tensor dimensions. + * @return Whether dimensions are equal. + */ + static bool dimsEqual(const nvinfer1::Dims & lhs, const nvinfer1::Dims & rhs) + { + if (lhs.nbDims != rhs.nbDims) return false; + return std::equal(lhs.d, lhs.d + lhs.nbDims, rhs.d); // NOLINT + } + + /** + * @brief Get printable representation of tensor dimensions. + * + * @param[in] dims Tensor dimensions. + * @return String representation of tensor dimensions. + */ + static std::string dimsRepr(const nvinfer1::Dims & dims) + { + if (dims.nbDims == 0) return "[]"; + std::string repr = "[" + std::to_string(dims.d[0]); + for (int i = 1; i < dims.nbDims; ++i) { + repr += ", " + std::to_string(dims.d[i]); + } + repr += "]"; + return repr; + } + + //!< @brief Tensor name. + std::string tensor_name; + + //!< @brief Tensor index. + int32_t tensor_index; +}; + +/** + * @brief Represents a network input/output tensor with its dimensions. 
+ * + * Example usage: + * @code + * nvinfer1::Dims tensor_dims = {3, {1, 2, 3}}; + * NetworkIO input("input_tensor", tensor_dims); + * NetworkIO output("output_tensor", tensor_dims); + * bool is_equal = input == output; // false + * @endcode + */ +struct NetworkIO : public TensorInfo +{ + /** + * @brief Construct NetworkIO with tensor name and dimensions. + * + * @param[in] name Tensor name. + * @param[in] tensor_dims Tensor dimensions. + */ + NetworkIO(std::string name, const nvinfer1::Dims & tensor_dims) + : TensorInfo(std::move(name)), dims(tensor_dims) + { + } + + /** + * @brief Construct NetworkIO with tensor index and dimensions. + * + * @param[in] index Tensor index. + * @param[in] tensor_dims Tensor dimensions. + */ + NetworkIO(int32_t index, const nvinfer1::Dims & tensor_dims) + : TensorInfo(index), dims(tensor_dims) + { + } + + /** + * @brief Get printable representation of NetworkIO. + * + * @return String representation of NetworkIO. + */ + [[nodiscard]] std::string toString() const + { + std::stringstream ss; + ss << tensor_name << " {" << TensorInfo::dimsRepr(dims) << "}"; + return ss.str(); + } + + /** + * @brief Check if NetworkIO is equal to another NetworkIO. + * + * @param rhs Right-hand side NetworkIO. + * @return Whether NetworkIO is equal to another NetworkIO. + */ + bool operator==(const NetworkIO & rhs) const + { + if (tensor_name != rhs.tensor_name) return false; + return dimsEqual(dims, rhs.dims); + } + + /** + * @brief Check if NetworkIO is not equal to another NetworkIO. + * + * @param rhs Right-hand side NetworkIO. + * @return Whether NetworkIO is not equal to another NetworkIO. + */ + bool operator!=(const NetworkIO & rhs) const { return !(*this == rhs); } + + /** + * @brief Output NetworkIO to ostream. + * + * @param os Output stream. + * @param io NetworkIO. + * @return Output stream. + */ + friend std::ostream & operator<<(std::ostream & os, const NetworkIO & io) + { + os << io.toString(); + return os; + } + + //!< @brief Tensor dimensions. + nvinfer1::Dims dims; +}; + +/** + * @brief Represents a tensor optimization profile with minimum, optimal, and maximum dimensions. + * + * Example usage: + * @code + * nvinfer1::Dims min = {3, {1, 2, 3}}; + * nvinfer1::Dims opt = {3, {1, 3, 4}}; + * nvinfer1::Dims max = {3, {1, 4, 5}}; + * ProfileDims entry_1("tensor_name", min, opt, max); + * ProfileDims entry_2("tensor_name", {3, {1, 2, 3}}, {3, {1, 3, 4}}, {3, {1, 4, 5}}); + * bool is_equal = entry_1 == entry_2; // true + * @endcode + */ +struct ProfileDims : public TensorInfo +{ + /** + * @brief Construct ProfileDims with tensor name and dimensions. + * + * @param[in] name Tensor name. + * @param[in] min Minimum dimensions for optimization profile. + * @param[in] opt Optimal dimensions for optimization profile. + * @param[in] max Maximum dimensions for optimization profile. + */ + ProfileDims( + std::string name, const nvinfer1::Dims & min, const nvinfer1::Dims & opt, + const nvinfer1::Dims & max) + : TensorInfo(std::move(name)), min_dims(min), opt_dims(opt), max_dims(max) + { + } + + /** + * @brief Construct ProfileDims with tensor index and dimensions. + * + * @param[in] index Tensor index. + * @param[in] min Minimum dimensions for optimization profile. + * @param[in] opt Optimal dimensions for optimization profile. + * @param[in] max Maximum dimensions for optimization profile. 
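+   *
+   * Example (illustrative sketch; index 0 refers to the first I/O tensor of the network):
+   * @code
+   * ProfileDims profile(0, {3, {1, 2, 3}}, {3, {1, 3, 4}}, {3, {1, 4, 5}});
+   * @endcode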
+ */ + ProfileDims( + int32_t index, const nvinfer1::Dims & min, const nvinfer1::Dims & opt, + const nvinfer1::Dims & max) + : TensorInfo(index), min_dims(min), opt_dims(opt), max_dims(max) + { + } + + /** + * @brief Get printable representation of ProfileDims. + * + * @return String representation of ProfileDims. + */ + [[nodiscard]] std::string toString() const + { + std::ostringstream oss; + oss << tensor_name << " {min " << TensorInfo::dimsRepr(min_dims) << ", opt " + << TensorInfo::dimsRepr(opt_dims) << ", max " << TensorInfo::dimsRepr(max_dims) << "}"; + return oss.str(); + } + + /** + * @brief Check if ProfileDims is equal to another ProfileDims. + * + * @param rhs Right-hand side ProfileDims. + * @return Whether ProfileDims is equal to another ProfileDims. + */ + bool operator==(const ProfileDims & rhs) const + { + if (tensor_name != rhs.tensor_name) return false; + return dimsEqual(min_dims, rhs.min_dims) && dimsEqual(opt_dims, rhs.opt_dims) && + dimsEqual(max_dims, rhs.max_dims); + } + + /** + * @brief Check if ProfileDims is not equal to another ProfileDims. + * + * @param rhs Right-hand side ProfileDims. + * @return Whether ProfileDims is not equal to another ProfileDims. + */ + bool operator!=(const ProfileDims & rhs) const { return !(*this == rhs); } + + /** + * @brief Output ProfileDims to ostream. + * + * @param os Output stream. + * @param profile ProfileDims. + * @return Output stream. + */ + friend std::ostream & operator<<(std::ostream & os, const ProfileDims & profile) + { + os << profile.toString(); + return os; + } + + //!< @brief Minimum dimensions for optimization profile. + nvinfer1::Dims min_dims; + + //!< @brief Optimal dimensions for optimization profile. + nvinfer1::Dims opt_dims; + + //!< @brief Maximum dimensions for optimization profile. + nvinfer1::Dims max_dims; +}; + +//!< @brief Valid calibration types for TensorRT. +constexpr std::array valid_calib_type = { + "Entropy", "Legacy", "Percentile", "MinMax"}; + +/** + * @brief Configuration for implicit calibration. + */ +struct CalibrationConfig +{ + /** + * @brief Construct CalibrationConfig with its parameters. + * + * @param[in] calib_type_str Calibration type. + * @param[in] quantize_first_layer Flag for partial quantization in first layer. + * @param[in] quantize_last_layer Flag for partial quantization in last layer. + * @param[in] clip_value Clip value for implicit quantization. 
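+   *
+   * Example (illustrative sketch; the calibration type and clip value below are placeholders):
+   * @code
+   * CalibrationConfig calib_config("MinMax", false, false, 6.0);
+   * @endcode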
+ */ + explicit CalibrationConfig( + const std::string & calib_type_str = "MinMax", const bool quantize_first_layer = false, + const bool quantize_last_layer = false, const double clip_value = 0.0) + : calib_type_str(calib_type_str), + quantize_first_layer(quantize_first_layer), + quantize_last_layer(quantize_last_layer), + clip_value(clip_value) + { + if ( + std::find(valid_calib_type.begin(), valid_calib_type.end(), calib_type_str) == + valid_calib_type.end()) { + throw std::runtime_error( + "Invalid calibration type was specified: " + std::string(calib_type_str) + + "\nValid value is one of: [Entropy, (Legacy | Percentile), MinMax]"); + } + } + + //!< @brief type of calibration + const std::string calib_type_str; + + //!< @brief flag for partial quantization in first layer + const bool quantize_first_layer; + + //!< @brief flag for partial quantization in last layer + const bool quantize_last_layer; + + //!< @brief clip value for implicit quantization + const double clip_value; +}; + +} // namespace tensorrt_common +} // namespace autoware + +#endif // AUTOWARE__TENSORRT_COMMON__UTILS_HPP_ diff --git a/perception/autoware_tensorrt_common/src/profiler.cpp b/perception/autoware_tensorrt_common/src/profiler.cpp new file mode 100644 index 0000000000000..433cca51f2b4f --- /dev/null +++ b/perception/autoware_tensorrt_common/src/profiler.cpp @@ -0,0 +1,113 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_common +{ + +Profiler::Profiler(const std::vector & src_profilers) +{ + index_ = 0; + for (const auto & src_profiler : src_profilers) { + for (const auto & [name, record] : src_profiler.profile_) { + auto it = profile_.find(name); + if (it == profile_.end()) { + profile_.emplace(name, record); + } else { + it->second.time += record.time; + it->second.count += record.count; + } + } + } +} + +void Profiler::reportLayerTime(const char * layerName, float ms) noexcept +{ + if (profile_.find(layerName) == profile_.end()) { + return; + } + profile_[layerName].count++; + profile_[layerName].time += ms; + if (profile_[layerName].min_time == -1.0) { + profile_[layerName].min_time = ms; + profile_[layerName].index = index_; + index_++; + } else if (profile_[layerName].min_time > ms) { + profile_[layerName].min_time = ms; + } +} + +std::string Profiler::toString() const +{ + std::ostringstream out; + float total_time = 0.0; + std::string layer_name = "Operation"; + + int max_layer_name_length = static_cast(layer_name.size()); + for (const auto & elem : profile_) { + total_time += elem.second.time; + max_layer_name_length = std::max(max_layer_name_length, static_cast(elem.first.size())); + } + + auto old_settings = out.flags(); + auto old_precision = out.precision(); + // Output header + { + out << "index, " << std::setw(12); + out << std::setw(max_layer_name_length) << layer_name << " "; + out << std::setw(12) << "Runtime" << "%," << " "; + out << std::setw(12) << "Invocations" << " , "; + out << std::setw(12) << "Runtime[ms]" << " , "; + out << std::setw(12) << "Avg Runtime[ms]" << " ,"; + out << std::setw(12) << "Min Runtime[ms]" << std::endl; + } + int index = index_; + for (int i = 0; i < index; i++) { + for (const auto & elem : profile_) { + if (elem.second.index == i) { + out << i << ", "; + out << std::setw(max_layer_name_length) << elem.first << ","; + out << std::setw(12) << std::fixed << std::setprecision(1) + << (elem.second.time * 100.0F / total_time) << "%" << ","; + out << std::setw(12) << elem.second.count << ","; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << ", "; + out << std::setw(12) << std::fixed << std::setprecision(2) + << elem.second.time / elem.second.count << ", "; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.min_time + << std::endl; + } + } + } + out.flags(old_settings); + out.precision(old_precision); + out << "========== total runtime = " << total_time << " ms ==========" << std::endl; + + return out.str(); +} + +std::ostream & operator<<(std::ostream & out, const Profiler & value) +{ + out << value.toString(); + return out; +} +} // namespace tensorrt_common +} // namespace autoware diff --git a/perception/autoware_tensorrt_common/src/simple_profiler.cpp b/perception/autoware_tensorrt_common/src/simple_profiler.cpp deleted file mode 100644 index 2bcc1c4f9da06..0000000000000 --- a/perception/autoware_tensorrt_common/src/simple_profiler.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2023 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include -#include -#include -#include - -namespace autoware -{ -namespace tensorrt_common -{ - -SimpleProfiler::SimpleProfiler(std::string name, const std::vector & src_profilers) -: m_name(name) -{ - m_index = 0; - for (const auto & src_profiler : src_profilers) { - for (const auto & rec : src_profiler.m_profile) { - auto it = m_profile.find(rec.first); - if (it == m_profile.end()) { - m_profile.insert(rec); - } else { - it->second.time += rec.second.time; - it->second.count += rec.second.count; - } - } - } -} - -void SimpleProfiler::reportLayerTime(const char * layerName, float ms) noexcept -{ - m_profile[layerName].count++; - m_profile[layerName].time += ms; - if (m_profile[layerName].min_time == -1.0) { - m_profile[layerName].min_time = ms; - m_profile[layerName].index = m_index; - m_index++; - } else if (m_profile[layerName].min_time > ms) { - m_profile[layerName].min_time = ms; - } -} - -void SimpleProfiler::setProfDict(nvinfer1::ILayer * layer) noexcept -{ - std::string name = layer->getName(); - m_layer_dict[name]; - m_layer_dict[name].type = layer->getType(); - if (layer->getType() == nvinfer1::LayerType::kCONVOLUTION) { - nvinfer1::IConvolutionLayer * conv = (nvinfer1::IConvolutionLayer *)layer; - nvinfer1::ITensor * in = layer->getInput(0); - nvinfer1::Dims dim_in = in->getDimensions(); - nvinfer1::ITensor * out = layer->getOutput(0); - nvinfer1::Dims dim_out = out->getDimensions(); - nvinfer1::Dims k_dims = conv->getKernelSizeNd(); - nvinfer1::Dims s_dims = conv->getStrideNd(); - int groups = conv->getNbGroups(); - int stride = s_dims.d[0]; - int kernel = k_dims.d[0]; - m_layer_dict[name].in_c = dim_in.d[1]; - m_layer_dict[name].out_c = dim_out.d[1]; - m_layer_dict[name].w = dim_in.d[3]; - m_layer_dict[name].h = dim_in.d[2]; - m_layer_dict[name].k = kernel; - ; - m_layer_dict[name].stride = stride; - m_layer_dict[name].groups = groups; - } -} - -std::ostream & operator<<(std::ostream & out, const SimpleProfiler & value) -{ - out << "========== " << value.m_name << " profile ==========" << std::endl; - float totalTime = 0; - std::string layerNameStr = "Operation"; - - int maxLayerNameLength = static_cast(layerNameStr.size()); - for (const auto & elem : value.m_profile) { - totalTime += elem.second.time; - maxLayerNameLength = std::max(maxLayerNameLength, static_cast(elem.first.size())); - } - - auto old_settings = out.flags(); - auto old_precision = out.precision(); - // Output header - { - out << "index, " << std::setw(12); - out << std::setw(maxLayerNameLength) << layerNameStr << " "; - out << std::setw(12) << "Runtime" - << "%," - << " "; - out << std::setw(12) << "Invocations" - << " , "; - out << std::setw(12) << "Runtime[ms]" - << " , "; - out << std::setw(12) << "Avg Runtime[ms]" - << " ,"; - out << std::setw(12) << "Min Runtime[ms]" << std::endl; - } - int index = value.m_index; - for (int i = 0; i < index; i++) { - for (const auto & elem : value.m_profile) { - if (elem.second.index == i) { - out << i << ", "; - out << std::setw(maxLayerNameLength) << elem.first << ","; - out << std::setw(12) << std::fixed << std::setprecision(1) - << 
(elem.second.time * 100.0F / totalTime) << "%" - << ","; - out << std::setw(12) << elem.second.count << ","; - out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << ", "; - out << std::setw(12) << std::fixed << std::setprecision(2) - << elem.second.time / elem.second.count << ", "; - out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.min_time - << std::endl; - } - } - } - out.flags(old_settings); - out.precision(old_precision); - out << "========== " << value.m_name << " total runtime = " << totalTime - << " ms ==========" << std::endl; - return out; -} -} // namespace tensorrt_common -} // namespace autoware diff --git a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp index 990433ee277a0..ba422277416ab 100644 --- a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp +++ b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp @@ -12,102 +12,38 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include "autoware/tensorrt_common/tensorrt_common.hpp" +#include "autoware/tensorrt_common/logger.hpp" +#include "autoware/tensorrt_common/utils.hpp" + +#include #include +#include #include +#include #include -#include #include #include #include +#include #include #include -namespace -{ -template -bool contain(const std::string & s, const T & v) -{ - return s.find(v) != std::string::npos; -} -} // anonymous namespace - namespace autoware { namespace tensorrt_common { -nvinfer1::Dims get_input_dims(const std::string & onnx_file_path) -{ - Logger logger_; - auto builder = TrtUniquePtr(nvinfer1::createInferBuilder(logger_)); - if (!builder) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder"); - } - - const auto explicitBatch = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - - auto network = - TrtUniquePtr(builder->createNetworkV2(explicitBatch)); - if (!network) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network"); - } - - auto config = TrtUniquePtr(builder->createBuilderConfig()); - if (!config) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config"); - } - - auto parser = TrtUniquePtr(nvonnxparser::createParser(*network, logger_)); - if (!parser->parseFromFile( - onnx_file_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR))) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to parse onnx file"); - } - - const auto input = network->getInput(0); - return input->getDimensions(); -} - -bool is_valid_precision_string(const std::string & precision) -{ - if ( - std::find(valid_precisions.begin(), valid_precisions.end(), precision) == - valid_precisions.end()) { - std::stringstream message; - message << "Invalid precision was specified: " << precision << std::endl - << "Valid string is one of: ["; - for (const auto & s : valid_precisions) { - message << s << ", "; - } - message << "] (case sensitive)" << std::endl; - std::cerr << message.str(); - return false; - } else { - return true; - } -} TrtCommon::TrtCommon( - const std::string & model_path, const std::string & precision, - std::unique_ptr calibrator, const BatchConfig & batch_config, - const size_t max_workspace_size, const BuildConfig & build_config, + const TrtCommonConfig & trt_config, const std::shared_ptr & profiler, const std::vector & plugin_paths) -: model_file_path_(model_path), - 
calibrator_(std::move(calibrator)), - precision_(precision), - batch_config_(batch_config), - max_workspace_size_(max_workspace_size), - model_profiler_("Model"), - host_profiler_("Host") +: trt_config_(std::make_shared(trt_config)), + host_profiler_(profiler), + model_profiler_(profiler) { - // Check given precision is valid one - if (!is_valid_precision_string(precision)) { - return; - } - build_config_ = std::make_unique(build_config); - + logger_ = std::make_shared(); for (const auto & plugin_path : plugin_paths) { int32_t flags{RTLD_LAZY}; // cspell: ignore asan @@ -120,523 +56,595 @@ TrtCommon::TrtCommon( #endif // ENABLE_ASAN void * handle = dlopen(plugin_path.c_str(), flags); if (!handle) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Could not load plugin library"); + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Could not load plugin library"); + } else { + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Loaded plugin library: %s", plugin_path.c_str()); } } - runtime_ = TrtUniquePtr(nvinfer1::createInferRuntime(logger_)); - if (build_config_->dla_core_id != -1) { - runtime_->setDLACore(build_config_->dla_core_id); + runtime_ = TrtUniquePtr(nvinfer1::createInferRuntime(*logger_)); + if (trt_config_->dla_core_id != -1) { + runtime_->setDLACore(trt_config_->dla_core_id); } - initLibNvInferPlugins(&logger_, ""); -} + initLibNvInferPlugins(&*logger_, ""); -TrtCommon::~TrtCommon() -{ + if (!initialize()) { + throw std::runtime_error("Failed to initialize TensorRT"); + } } -void TrtCommon::setup() +TrtCommon::~TrtCommon() = default; + +bool TrtCommon::setup(ProfileDimsPtr profile_dims, NetworkIOPtr network_io) { - if (!fs::exists(model_file_path_)) { - is_initialized_ = false; - return; - } - // cppcheck-suppress unreadVariable - std::string engine_path = model_file_path_; - if (model_file_path_.extension() == ".engine") { - std::cout << "Load ... 
" << model_file_path_ << std::endl; - loadEngine(model_file_path_); - } else if (model_file_path_.extension() == ".onnx") { - fs::path cache_engine_path{model_file_path_}; - std::string ext; - std::string calib_name = ""; - if (precision_ == "int8") { - if (build_config_->calib_type_str == "Entropy") { - calib_name = "EntropyV2-"; - } else if ( - build_config_->calib_type_str == "Legacy" || - build_config_->calib_type_str == "Percentile") { - calib_name = "Legacy-"; - } else { - calib_name = "MinMax-"; + profile_dims_ = std::move(profile_dims); + network_io_ = std::move(network_io); + + // Set input profile + if (profile_dims_ && !profile_dims_->empty()) { + auto profile = builder_->createOptimizationProfile(); + for (auto & profile_dim : *profile_dims_) { + if (profile_dim.tensor_name.empty()) { + profile_dim.tensor_name = getIOTensorName(profile_dim.tensor_index); } + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Setting optimization profile for tensor: %s", + profile_dim.toString().c_str()); + profile->setDimensions( + profile_dim.tensor_name.c_str(), nvinfer1::OptProfileSelector::kMIN, profile_dim.min_dims); + profile->setDimensions( + profile_dim.tensor_name.c_str(), nvinfer1::OptProfileSelector::kOPT, profile_dim.opt_dims); + profile->setDimensions( + profile_dim.tensor_name.c_str(), nvinfer1::OptProfileSelector::kMAX, profile_dim.max_dims); } - if (build_config_->dla_core_id != -1) { - ext = "DLA" + std::to_string(build_config_->dla_core_id) + "-" + calib_name + precision_; - if (build_config_->quantize_first_layer) { - ext += "-firstFP16"; - } - if (build_config_->quantize_last_layer) { - ext += "-lastFP16"; - } - ext += "-batch" + std::to_string(batch_config_[0]) + ".engine"; - } else { - ext = calib_name + precision_; - if (build_config_->quantize_first_layer) { - ext += "-firstFP16"; - } - if (build_config_->quantize_last_layer) { - ext += "-lastFP16"; - } - ext += "-batch" + std::to_string(batch_config_[0]) + ".engine"; + builder_config_->addOptimizationProfile(profile); + } + + auto build_engine_with_log = [this]() -> bool { + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Starting to build engine"); + auto log_thread = logger_->log_throttle( + nvinfer1::ILogger::Severity::kINFO, + "Applying optimizations and building TensorRT CUDA engine. Please wait for a few minutes...", + 5); + auto success = buildEngineFromOnnx(); + logger_->stop_throttle(log_thread); + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Engine build completed"); + return success; + }; + + // Load engine file if it exists + if (fs::exists(trt_config_->engine_path)) { + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Loading engine"); + if (!loadEngine()) { + return false; } - cache_engine_path.replace_extension(ext); - - // Output Network Information - printNetworkInfo(model_file_path_); - - if (fs::exists(cache_engine_path)) { - std::cout << "Loading... " << cache_engine_path << std::endl; - loadEngine(cache_engine_path); - } else { - std::cout << "Building... " << cache_engine_path << std::endl; - logger_.log(nvinfer1::ILogger::Severity::kINFO, "Start build engine"); - auto log_thread = logger_.log_throttle( - nvinfer1::ILogger::Severity::kINFO, - "Applying optimizations and building TRT CUDA engine. 
Please wait for a few minutes...", 5); - buildEngineFromOnnx(model_file_path_, cache_engine_path); - logger_.stop_throttle(log_thread); - logger_.log(nvinfer1::ILogger::Severity::kINFO, "End build engine"); + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Network validation"); + // Validate engine tensor shapes and optimization profile + if (!validateNetworkIO() || !validateProfileDims()) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Network validation failed for loaded engine from file. Rebuilding engine"); + // Rebuild engine if the tensor shapes or optimization profile mismatch + if (!build_engine_with_log()) { + return false; + } } - // cppcheck-suppress unreadVariable - engine_path = cache_engine_path; } else { - is_initialized_ = false; - return; + // Build engine if engine has not been cached + if (!build_engine_with_log()) { + return false; + } } - context_ = TrtUniquePtr(engine_->createExecutionContext()); - if (!context_) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create context"); - is_initialized_ = false; - return; + // Validate engine nevertheless is loaded or rebuilt + if (!validateNetworkIO() || !validateProfileDims()) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Final network validation failed. Possibly the input / output of the currently " + "deployed model has changed. Check your configuration file with the current model."); + return false; } - if (build_config_->profile_per_layer) { - context_->setProfiler(&model_profiler_); + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Engine setup completed"); + return true; +} + +std::string TrtCommon::getPrecision() const +{ + return trt_config_->precision; +} + +const char * TrtCommon::getIOTensorName(const int32_t index) const +{ + if (!engine_) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Engine is not initialized. Retrieving data from network"); + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return nullptr; + } + auto num_inputs = network_->getNbInputs(); + auto num_outputs = network_->getNbOutputs(); + if (index < 0 || index >= num_inputs + num_outputs) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Invalid index for I/O tensor: %d. Total I/O tensors: %d", index, num_inputs + num_outputs); + return nullptr; + } + if (index < num_inputs) { + return network_->getInput(index)->getName(); + } + return network_->getOutput(index - num_inputs)->getName(); } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 - // Write profiles for trt-engine-explorer - // See: https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer - std::string j_ext = ".json"; - fs::path json_path{engine_path}; - json_path.replace_extension(j_ext); - std::string ret = getLayerInformation(nvinfer1::LayerInformationFormat::kJSON); - std::ofstream os(json_path, std::ofstream::trunc); - os << ret << std::flush; -#endif - is_initialized_ = true; + return engine_->getIOTensorName(index); } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8500 -void TrtCommon::setupBindings(std::vector & bindings) +int32_t TrtCommon::getNbIOTensors() const { - for (int32_t i = 0, e = engine_->getNbIOTensors(); i < e; i++) { - auto const name = engine_->getIOTensorName(i); - context_->setTensorAddress(name, bindings.at(i)); + if (!engine_) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Engine is not initialized. 
Retrieving data from network"); + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return 0; + } + return network_->getNbInputs() + network_->getNbOutputs(); } + return engine_->getNbIOTensors(); } -#endif -bool TrtCommon::loadEngine(const std::string & engine_file_path) +nvinfer1::Dims TrtCommon::getTensorShape(const int32_t index) const { - std::ifstream engine_file(engine_file_path); - std::stringstream engine_buffer; - engine_buffer << engine_file.rdbuf(); - std::string engine_str = engine_buffer.str(); - engine_ = TrtUniquePtr(runtime_->deserializeCudaEngine( - reinterpret_cast(engine_str.data()), engine_str.size())); - return true; + if (!engine_) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Engine is not initialized. Retrieving data from network"); + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return nvinfer1::Dims{}; + } + auto num_inputs = network_->getNbInputs(); + auto num_outputs = network_->getNbOutputs(); + if (index < 0 || index >= num_inputs + num_outputs) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Invalid index for I/O tensor: %d. Total I/O tensors: %d", index, num_inputs + num_outputs); + return nvinfer1::Dims{}; + } + if (index < num_inputs) { + return network_->getInput(index)->getDimensions(); + } + return network_->getOutput(index - num_inputs)->getDimensions(); + } + auto const & name = getIOTensorName(index); + return getTensorShape(name); } -void TrtCommon::printNetworkInfo(const std::string & onnx_file_path) +nvinfer1::Dims TrtCommon::getTensorShape(const char * tensor_name) const { - auto builder = TrtUniquePtr(nvinfer1::createInferBuilder(logger_)); - if (!builder) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder"); - return; + if (!engine_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Engine is not initialized"); + return nvinfer1::Dims{}; } + return engine_->getTensorShape(tensor_name); +} - const auto explicitBatch = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); +nvinfer1::Dims TrtCommon::getInputDims(const int32_t index) const +{ + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return {}; + } + const auto input = network_->getInput(index); + return input->getDimensions(); +} - auto network = - TrtUniquePtr(builder->createNetworkV2(explicitBatch)); - if (!network) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network"); - return; +nvinfer1::Dims TrtCommon::getOutputDims(const int32_t index) const +{ + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return {}; } + const auto output = network_->getOutput(index); + return output->getDimensions(); +} + +bool TrtCommon::setTensorAddress(const int32_t index, void * data) +{ + auto const & name = getIOTensorName(index); + return setTensorAddress(name, data); +} - auto config = TrtUniquePtr(builder->createBuilderConfig()); - if (!config) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config"); - return; +bool TrtCommon::setTensorAddress(const char * tensor_name, void * data) +{ + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context is not initialized"); + return false; + } + auto success = context_->setTensorAddress(tensor_name, data); + if (!success) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Failed to 
set tensor address for tensor: ", tensor_name); } + return success; +} - if (precision_ == "fp16" || precision_ == "int8") { - config->setFlag(nvinfer1::BuilderFlag::kFP16); +bool TrtCommon::setTensorsAddresses(std::vector & tensors) +{ + bool success = true; + for (std::size_t i = 0, e = tensors.size(); i < e; i++) { + auto const name = getIOTensorName(i); + success &= setTensorAddress(name, tensors.at(i)); } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400 - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); -#else - config->setMaxWorkspaceSize(max_workspace_size_); -#endif + return success; +} - auto parser = TrtUniquePtr(nvonnxparser::createParser(*network, logger_)); - if (!parser->parseFromFile( - onnx_file_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR))) { - return; - } - int num = network->getNbLayers(); - float total_gflops = 0.0; - int total_params = 0; - for (int i = 0; i < num; i++) { - nvinfer1::ILayer * layer = network->getLayer(i); - auto layer_type = layer->getType(); - if (build_config_->profile_per_layer) { - model_profiler_.setProfDict(layer); - } - if (layer_type == nvinfer1::LayerType::kCONSTANT) { - continue; - } - nvinfer1::ITensor * in = layer->getInput(0); - nvinfer1::Dims dim_in = in->getDimensions(); - nvinfer1::ITensor * out = layer->getOutput(0); - nvinfer1::Dims dim_out = out->getDimensions(); - - if (layer_type == nvinfer1::LayerType::kCONVOLUTION) { - nvinfer1::IConvolutionLayer * conv = (nvinfer1::IConvolutionLayer *)layer; - nvinfer1::Dims k_dims = conv->getKernelSizeNd(); - nvinfer1::Dims s_dims = conv->getStrideNd(); - int groups = conv->getNbGroups(); - int stride = s_dims.d[0]; - int num_weights = (dim_in.d[1] / groups) * dim_out.d[1] * k_dims.d[0] * k_dims.d[1]; - float gflops = (2.0 * num_weights) * (static_cast(dim_in.d[3]) / stride * - static_cast(dim_in.d[2]) / stride / 1e9); - total_gflops += gflops; - total_params += num_weights; - std::cout << "L" << i << " [conv " << k_dims.d[0] << "x" << k_dims.d[1] << " (" << groups - << ") " - << "/" << s_dims.d[0] << "] " << dim_in.d[3] << "x" << dim_in.d[2] << "x" - << dim_in.d[1] << " -> " << dim_out.d[3] << "x" << dim_out.d[2] << "x" - << dim_out.d[1]; - std::cout << " weights:" << num_weights; - std::cout << " GFLOPs:" << gflops; - std::cout << std::endl; - } else if (layer_type == nvinfer1::LayerType::kPOOLING) { - nvinfer1::IPoolingLayer * pool = (nvinfer1::IPoolingLayer *)layer; - auto p_type = pool->getPoolingType(); - nvinfer1::Dims dim_stride = pool->getStrideNd(); - nvinfer1::Dims dim_window = pool->getWindowSizeNd(); - - std::cout << "L" << i << " ["; - if (p_type == nvinfer1::PoolingType::kMAX) { - std::cout << "max "; - } else if (p_type == nvinfer1::PoolingType::kAVERAGE) { - std::cout << "avg "; - } else if (p_type == nvinfer1::PoolingType::kMAX_AVERAGE_BLEND) { - std::cout << "max avg blend "; - } - float gflops = static_cast(dim_in.d[1]) * - (static_cast(dim_window.d[0]) / static_cast(dim_stride.d[0])) * - (static_cast(dim_window.d[1]) / static_cast(dim_stride.d[1])) * - static_cast(dim_in.d[2]) * static_cast(dim_in.d[3]) / 1e9; - total_gflops += gflops; - std::cout << "pool " << dim_window.d[0] << "x" << dim_window.d[1] << "]"; - std::cout << " GFLOPs:" << gflops; - std::cout << std::endl; - } else if (layer_type == nvinfer1::LayerType::kRESIZE) { - std::cout << "L" << i << " [resize]" << std::endl; - } +bool TrtCommon::setTensorsAddresses(std::unordered_map & tensors) +{ + bool success = true; + for 
(auto const & tensor : tensors) { + success &= setTensorAddress(tensor.first, tensor.second); } - std::cout << "Total " << total_gflops << " GFLOPs" << std::endl; - std::cout << "Total " << total_params / 1000.0 / 1000.0 << " M params" << std::endl; - return; + return success; +} + +bool TrtCommon::setInputShape(const int32_t index, const nvinfer1::Dims & dimensions) +{ + auto const & name = getIOTensorName(index); + return setInputShape(name, dimensions); } -bool TrtCommon::buildEngineFromOnnx( - const std::string & onnx_file_path, const std::string & output_engine_file_path) +bool TrtCommon::setInputShape(const char * tensor_name, const nvinfer1::Dims & dimensions) { - auto builder = TrtUniquePtr(nvinfer1::createInferBuilder(logger_)); - if (!builder) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder"); + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context is not initialized"); return false; } + auto success = context_->setInputShape(tensor_name, dimensions); + if (!success) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, "Failed to set input shape for tensor: ", tensor_name); + } + return success; +} - const auto explicitBatch = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); +bool TrtCommon::setInputsShapes(const std::vector & dimensions) +{ + bool success = true; + for (std::size_t i = 0, e = dimensions.size(); i < e; i++) { + success &= setInputShape(i, dimensions.at(i)); + } + return success; +} - auto network = - TrtUniquePtr(builder->createNetworkV2(explicitBatch)); - if (!network) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network"); - return false; +bool TrtCommon::setInputsShapes(const std::unordered_map & dimensions) +{ + bool success = true; + for (auto const & dim : dimensions) { + success &= setInputShape(dim.first, dim.second); } + return success; +} - auto config = TrtUniquePtr(builder->createBuilderConfig()); - if (!config) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config"); - return false; +bool TrtCommon::setTensor(const int32_t index, void * data, nvinfer1::Dims dimensions) +{ + auto success = setTensorAddress(index, data); + if (dimensions.nbDims > 0) { + success &= setInputShape(index, dimensions); } + return success; +} - int num_available_dla = builder->getNbDLACores(); - if (build_config_->dla_core_id != -1) { - if (num_available_dla > 0) { - std::cout << "###" << num_available_dla << " DLAs are supported! ###" << std::endl; - } else { - std::cout << "###Warning : " - << "No DLA is supported! 
###" << std::endl; - } - config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - config->setDLACore(build_config_->dla_core_id); -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 - config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); -#else - config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); -#endif - config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); +bool TrtCommon::setTensor(const char * tensor_name, void * data, nvinfer1::Dims dimensions) +{ + auto success = setTensorAddress(tensor_name, data); + if (dimensions.nbDims > 0) { + success &= setInputShape(tensor_name, dimensions); } - if (precision_ == "fp16" || precision_ == "int8") { - config->setFlag(nvinfer1::BuilderFlag::kFP16); + return success; +} + +bool TrtCommon::setTensors(TensorsVec & tensors) +{ + bool success = true; + for (std::size_t i = 0, e = tensors.size(); i < e; i++) { + success &= setTensor(i, tensors.at(i).first, tensors.at(i).second); } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400 - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); -#else - config->setMaxWorkspaceSize(max_workspace_size_); -#endif + return success; +} - auto parser = TrtUniquePtr(nvonnxparser::createParser(*network, logger_)); - if (!parser->parseFromFile( - onnx_file_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR))) { - std::cout << "Failed to parse onnx file" << std::endl; - return false; +bool TrtCommon::setTensors(TensorsMap & tensors) +{ + bool success = true; + for (auto const & tensor : tensors) { + success &= setTensor(tensor.first, tensor.second.first, tensor.second.second); } + return success; +} - const int num = network->getNbLayers(); - bool first = build_config_->quantize_first_layer; - bool last = build_config_->quantize_last_layer; - // Partial Quantization - if (precision_ == "int8") { - network->getInput(0)->setDynamicRange(0, 255.0); - for (int i = 0; i < num; i++) { - nvinfer1::ILayer * layer = network->getLayer(i); - auto layer_type = layer->getType(); - std::string name = layer->getName(); - nvinfer1::ITensor * out = layer->getOutput(0); - if (build_config_->clip_value > 0.0) { - std::cout << "Set max value for outputs : " << build_config_->clip_value << " " << name - << std::endl; - out->setDynamicRange(0.0, build_config_->clip_value); - } +std::shared_ptr TrtCommon::getModelProfiler() +{ + return model_profiler_; +} - if (layer_type == nvinfer1::LayerType::kCONVOLUTION) { - if (first) { - layer->setPrecision(nvinfer1::DataType::kHALF); - std::cout << "Set kHALF in " << name << std::endl; - first = false; - } - if (last) { - // cspell: ignore preds - if ( - contain(name, "reg_preds") || contain(name, "cls_preds") || - contain(name, "obj_preds")) { - layer->setPrecision(nvinfer1::DataType::kHALF); - std::cout << "Set kHALF in " << name << std::endl; - } - for (int j = num - 1; j >= 0; j--) { - nvinfer1::ILayer * inner_layer = network->getLayer(j); - auto inner_layer_type = inner_layer->getType(); - std::string inner_name = inner_layer->getName(); - if (inner_layer_type == nvinfer1::LayerType::kCONVOLUTION) { - inner_layer->setPrecision(nvinfer1::DataType::kHALF); - std::cout << "Set kHALF in " << inner_name << std::endl; - break; - } - if (inner_layer_type == nvinfer1::LayerType::kMATRIX_MULTIPLY) { - inner_layer->setPrecision(nvinfer1::DataType::kHALF); - std::cout << "Set kHALF in " << inner_name << std::endl; - break; - } - } - } - } - } +std::shared_ptr 
TrtCommon::getHostProfiler() +{ + return host_profiler_; +} + +std::shared_ptr TrtCommon::getTrtCommonConfig() +{ + return trt_config_; +} + +std::shared_ptr TrtCommon::getBuilderConfig() +{ + return builder_config_; +} + +std::shared_ptr TrtCommon::getNetwork() +{ + return network_; +} + +std::shared_ptr TrtCommon::getLogger() +{ + return logger_; +} + +bool TrtCommon::enqueueV3(cudaStream_t stream) +{ + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context is not initialized"); + return false; + } + if (trt_config_->profile_per_layer) { + auto inference_start = std::chrono::high_resolution_clock::now(); + auto success = context_->enqueueV3(stream); + auto inference_end = std::chrono::high_resolution_clock::now(); + host_profiler_->reportLayerTime( + "inference_host", + std::chrono::duration(inference_end - inference_start).count()); + return success; } + return context_->enqueueV3(stream); +} - const auto input = network->getInput(0); - const auto input_dims = input->getDimensions(); - const auto input_channel = input_dims.d[1]; - const auto input_height = input_dims.d[2]; - const auto input_width = input_dims.d[3]; - const auto input_batch = input_dims.d[0]; +void TrtCommon::printProfiling() const +{ + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Host Profiling\n", host_profiler_->toString().c_str()); + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Model Profiling\n", model_profiler_->toString().c_str()); +} + +std::string TrtCommon::getLayerInformation(nvinfer1::LayerInformationFormat format) +{ + if (!context_ || !engine_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context or engine are not initialized"); + return {}; + } + auto inspector = std::unique_ptr(engine_->createEngineInspector()); + inspector->setExecutionContext(&(*context_)); + std::string result = inspector->getEngineInformation(format); + return result; +} - if (input_batch > 1) { - batch_config_[0] = input_batch; +bool TrtCommon::initialize() +{ + if (!fs::exists(trt_config_->onnx_path) || trt_config_->onnx_path.extension() != ".onnx") { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Invalid ONNX file path or extension"); + return false; } - if (batch_config_.at(0) > 1 && (batch_config_.at(0) == batch_config_.at(2))) { - // Attention : below API is deprecated in TRT8.4 - builder->setMaxBatchSize(batch_config_.at(2)); - } else { - if (build_config_->profile_per_layer) { - auto profile = builder->createOptimizationProfile(); - profile->setDimensions( - network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMIN, - nvinfer1::Dims4{batch_config_.at(0), input_channel, input_height, input_width}); - profile->setDimensions( - network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kOPT, - nvinfer1::Dims4{batch_config_.at(1), input_channel, input_height, input_width}); - profile->setDimensions( - network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMAX, - nvinfer1::Dims4{batch_config_.at(2), input_channel, input_height, input_width}); - config->addOptimizationProfile(profile); - } + builder_ = TrtUniquePtr(nvinfer1::createInferBuilder(*logger_)); + if (!builder_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder"); + return false; } - if (precision_ == "int8" && calibrator_) { - config->setFlag(nvinfer1::BuilderFlag::kINT8); -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 - config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + +#if (NV_TENSORRT_MAJOR * 1000) + 
(NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH < 10000 + const auto explicit_batch = + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network_ = TrtUniquePtr(builder_->createNetworkV2(explicit_batch)); #else - config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); + network_ = TrtUniquePtr(builder_->createNetworkV2(0)); #endif - // QAT requires no calibrator. - // assert((calibrator != nullptr) && "Invalid calibrator for INT8 precision"); - config->setInt8Calibrator(calibrator_.get()); + + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network"); + return false; } - if (build_config_->profile_per_layer) { -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 - config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kDETAILED); -#else - config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kVERBOSE); -#endif + + builder_config_ = TrtUniquePtr(builder_->createBuilderConfig()); + if (!builder_config_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config"); + return false; + } + + auto num_available_dla = builder_->getNbDLACores(); + if (trt_config_->dla_core_id != -1) { + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Number of DLAs supported: %d", num_available_dla); + builder_config_->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + builder_config_->setDLACore(trt_config_->dla_core_id); + builder_config_->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + builder_config_->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); } + if (trt_config_->precision == "fp16") { + builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); + } else if (trt_config_->precision == "int8") { + builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); + } + builder_config_->setMemoryPoolLimit( + nvinfer1::MemoryPoolType::kWORKSPACE, trt_config_->max_workspace_size); + + parser_ = TrtUniquePtr(nvonnxparser::createParser(*network_, *logger_)); + if (!parser_->parseFromFile( + trt_config_->onnx_path.c_str(), + static_cast(nvinfer1::ILogger::Severity::kERROR))) { + return false; + } + + if (trt_config_->profile_per_layer) { + builder_config_->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kDETAILED); + } + + return true; +} -#if TENSORRT_VERSION_MAJOR >= 8 - auto plan = - TrtUniquePtr(builder->buildSerializedNetwork(*network, *config)); +bool TrtCommon::buildEngineFromOnnx() +{ + // Build engine + auto plan = TrtUniquePtr( + builder_->buildSerializedNetwork(*network_, *builder_config_)); if (!plan) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create host memory"); + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create host memory"); return false; } engine_ = TrtUniquePtr( runtime_->deserializeCudaEngine(plan->data(), plan->size())); -#else - engine_ = TrtUniquePtr(builder->buildEngineWithConfig(*network, *config)); -#endif if (!engine_) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create engine"); + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create engine"); return false; } - // save engine -#if TENSORRT_VERSION_MAJOR < 8 - auto data = TrtUniquePtr(engine_->serialize()); -#endif + // Save engine std::ofstream file; - file.open(output_engine_file_path, std::ios::binary | std::ios::out); + file.open(trt_config_->engine_path, std::ios::binary | std::ios::out); if (!file.is_open()) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to open engine file"); return false; } -#if 
TENSORRT_VERSION_MAJOR < 8 - file.write(reinterpret_cast(data->data()), data->size()); -#else - file.write(reinterpret_cast(plan->data()), plan->size()); -#endif - + file.write(reinterpret_cast(plan->data()), plan->size()); // NOLINT file.close(); - return true; -} - -bool TrtCommon::isInitialized() -{ - return is_initialized_; -} - -nvinfer1::Dims TrtCommon::getBindingDimensions(const int32_t index) const -{ -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSOR_PATCH * 10) >= 8500 - auto const & name = engine_->getIOTensorName(index); - auto dims = context_->getTensorShape(name); - bool const has_runtime_dim = - std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + context_ = TrtUniquePtr(engine_->createExecutionContext()); + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create context"); + return false; + } - if (has_runtime_dim) { - return dims; - } else { - return context_->getBindingDimensions(index); + if (trt_config_->profile_per_layer) { + context_->setProfiler(&*model_profiler_); } -#else - return context_->getBindingDimensions(index); -#endif -} -int32_t TrtCommon::getNbBindings() -{ - return engine_->getNbBindings(); -} + fs::path json_path = trt_config_->engine_path.replace_extension(".json"); + auto ret = getLayerInformation(nvinfer1::LayerInformationFormat::kJSON); + std::ofstream os(json_path, std::ofstream::trunc); + os << ret << std::flush; + os.close(); -bool TrtCommon::setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const -{ - return context_->setBindingDimensions(index, dimensions); + return true; } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8500 -bool TrtCommon::enqueueV3(cudaStream_t stream) +bool TrtCommon::loadEngine() { - if (build_config_->profile_per_layer) { - auto inference_start = std::chrono::high_resolution_clock::now(); - - bool ret = context_->enqueueV3(stream); + std::ifstream engine_file(trt_config_->engine_path); + std::stringstream engine_buffer; + engine_buffer << engine_file.rdbuf(); + std::string engine_str = engine_buffer.str(); - auto inference_end = std::chrono::high_resolution_clock::now(); - host_profiler_.reportLayerTime( - "inference", - std::chrono::duration(inference_end - inference_start).count()); - return ret; + engine_ = TrtUniquePtr(runtime_->deserializeCudaEngine( + reinterpret_cast( // NOLINT + engine_str.data()), + engine_str.size())); + if (!engine_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create engine"); + return false; } - return context_->enqueueV3(stream); -} -#endif -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH < 10000 -bool TrtCommon::enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed) -{ - if (build_config_->profile_per_layer) { - auto inference_start = std::chrono::high_resolution_clock::now(); - bool ret = context_->enqueueV2(bindings, stream, input_consumed); + context_ = TrtUniquePtr(engine_->createExecutionContext()); + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create context"); + return false; + } - auto inference_end = std::chrono::high_resolution_clock::now(); - host_profiler_.reportLayerTime( - "inference", - std::chrono::duration(inference_end - inference_start).count()); - return ret; - } else { - return context_->enqueueV2(bindings, stream, input_consumed); + if (trt_config_->profile_per_layer) { + context_->setProfiler(&*model_profiler_); } + + 
return true; } -#endif -void TrtCommon::printProfiling() +bool TrtCommon::validateProfileDims() { - std::cout << host_profiler_; - std::cout << std::endl; - std::cout << model_profiler_; + auto success = true; + if (!profile_dims_ || profile_dims_->empty()) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Input profile is empty, skipping validation. If network has dynamic shapes, it might lead " + "to undefined behavior."); + return success; + } + if (engine_->getNbOptimizationProfiles() != 1) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Number of optimization profiles in the engine (%d) is not equal to 1. Selecting the first " + "cached profile.", + engine_->getNbOptimizationProfiles()); + } + + for (const auto & profile_dim : *profile_dims_) { + nvinfer1::Dims min_dims = engine_->getProfileShape( + profile_dim.tensor_name.c_str(), 0, nvinfer1::OptProfileSelector::kMIN); + nvinfer1::Dims opt_dims = engine_->getProfileShape( + profile_dim.tensor_name.c_str(), 0, nvinfer1::OptProfileSelector::kOPT); + nvinfer1::Dims max_dims = engine_->getProfileShape( + profile_dim.tensor_name.c_str(), 0, nvinfer1::OptProfileSelector::kMAX); + ProfileDims profile_from_engine{profile_dim.tensor_name, min_dims, opt_dims, max_dims}; + if (profile_dim != profile_from_engine) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Invalid profile. Current configuration: %s. Cached engine: %s", + profile_dim.toString().c_str(), profile_from_engine.toString().c_str()); + success = false; + } + } + return success; } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 -std::string TrtCommon::getLayerInformation(nvinfer1::LayerInformationFormat format) +bool TrtCommon::validateNetworkIO() { - auto runtime = std::unique_ptr(nvinfer1::createInferRuntime(logger_)); - auto inspector = std::unique_ptr(engine_->createEngineInspector()); - if (context_ != nullptr) { - inspector->setExecutionContext(&(*context_)); + auto success = true; + if (!network_io_ || network_io_->empty()) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Network IO is empty, skipping validation. It might lead to undefined behavior"); + return success; + } + + if (network_io_->size() != static_cast(getNbIOTensors())) { + std::string tensor_names = "[" + std::string(getIOTensorName(0)); + for (int32_t i = 1; i < getNbIOTensors(); ++i) { + tensor_names += ", " + std::string(getIOTensorName(i)); + } + tensor_names += "]"; + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Number of tensors in the engine (%d) does not match number of tensors in the config (%d). " + "Tensors in the built engine: %s", + getNbIOTensors(), network_io_->size(), tensor_names.c_str()); + success = false; + } + for (const auto & io : *network_io_) { + NetworkIO tensor_from_engine{io.tensor_name, getTensorShape(io.tensor_name.c_str())}; + if (io != tensor_from_engine) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Invalid tensor. Current configuration: %s. 
Cached engine: %s", io.toString().c_str(), + tensor_from_engine.toString().c_str()); + success = false; + } } - std::string result = inspector->getEngineInformation(format); - return result; + + return success; } -#endif } // namespace tensorrt_common } // namespace autoware diff --git a/perception/autoware_tensorrt_yolox/CMakeLists.txt b/perception/autoware_tensorrt_yolox/CMakeLists.txt index 17b31fc7e8df1..cc0d793151781 100644 --- a/perception/autoware_tensorrt_yolox/CMakeLists.txt +++ b/perception/autoware_tensorrt_yolox/CMakeLists.txt @@ -10,6 +10,9 @@ endif() find_package(autoware_cmake REQUIRED) autoware_package() +# TODO(amadeuszsz): Remove -Wno-deprecated-declarations once removing implicit quantization +add_compile_options(-Wno-deprecated-declarations) + find_package(OpenCV REQUIRED) option(CUDA_VERBOSE "Verbose output of CUDA modules" OFF) diff --git a/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp b/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp index 483adfbdf2757..7d49df145b72e 100644 --- a/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp +++ b/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include @@ -44,6 +46,13 @@ struct Object using ObjectArray = std::vector; using ObjectArrays = std::vector; +using autoware::tensorrt_common::CalibrationConfig; +using autoware::tensorrt_common::NetworkIOPtr; +using autoware::tensorrt_common::ProfileDimsPtr; +using autoware::tensorrt_common::Profiler; +using autoware::tensorrt_common::TrtCommon; +using autoware::tensorrt_common::TrtCommonConfig; +using autoware::tensorrt_common::TrtConvCalib; struct GridAndStride { @@ -70,31 +79,26 @@ class TrtYoloX public: /** * @brief Construct TrtYoloX. 
- * @param[in] mode_path ONNX model_path - * @param[in] precision precision for inference + * @param[in] trt_config base trt common configuration * @param[in] num_class classifier-ed num * @param[in] score_threshold threshold for detection * @param[in] nms_threshold threshold for NMS - * @param[in] build_config configuration including precision, calibration method, DLA, remaining - * fp16 for first layer, remaining fp16 for last layer and profiler for builder * @param[in] use_gpu_preprocess whether use cuda gpu for preprocessing - * @param[in] calibration_image_list_file path for calibration files (only require for + * @param[in] gpu_id GPU id for inference + * @param[in] calibration_image_list_path path for calibration files (only require for * quantization) * @param[in] norm_factor scaling factor for preprocess * @param[in] cache_dir unused variable - * @param[in] batch_config configuration for batched execution - * @param[in] max_workspace_size maximum workspace for building TensorRT engine + * @param[in] color_map_path path for colormap for masks + * @param[in] calib_config calibration configuration */ TrtYoloX( - const std::string & model_path, const std::string & precision, const int num_class = 8, - const float score_threshold = 0.3, const float nms_threshold = 0.7, - const autoware::tensorrt_common::BuildConfig build_config = - autoware::tensorrt_common::BuildConfig(), - const bool use_gpu_preprocess = false, const uint8_t gpu_id = 0, - std::string calibration_image_list_file = std::string(), const double norm_factor = 1.0, - [[maybe_unused]] const std::string & cache_dir = "", - const autoware::tensorrt_common::BatchConfig & batch_config = {1, 1, 1}, - const size_t max_workspace_size = (1 << 30), const std::string & color_map_path = ""); + TrtCommonConfig & trt_config, const int num_class = 8, const float score_threshold = 0.3, + const float nms_threshold = 0.7, const bool use_gpu_preprocess = false, + const uint8_t gpu_id = 0, std::string calibration_image_list_path = std::string(), + const double norm_factor = 1.0, [[maybe_unused]] const std::string & cache_dir = "", + const std::string & color_map_path = "", + const CalibrationConfig & calib_config = CalibrationConfig()); /** * @brief Deconstruct TrtYoloX */ @@ -168,6 +172,12 @@ class TrtYoloX cv::Mat & colorized_mask); inline std::vector getColorMap() { return sematic_color_map_; } + /** + * @brief get batch size + * @return batch size + */ + [[nodiscard]] int getBatchSize() const; + private: /** * @brief run preprocess including resizing, letterbox, NHWC2NCHW and toFloat on CPU @@ -266,7 +276,7 @@ class TrtYoloX */ cv::Mat getMaskImageGpu(float * d_prob, nvinfer1::Dims dims, int out_w, int out_h, int b); - std::unique_ptr trt_common_; + std::unique_ptr trt_common_; std::vector input_h_; CudaUniquePtr input_d_; @@ -288,7 +298,7 @@ class TrtYoloX int num_class_; float score_threshold_; float nms_threshold_; - int batch_size_; + int32_t batch_size_; CudaUniquePtrHost out_prob_h_; // flag whether preprocess are performed on GPU diff --git a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp index 7e2e327bf6f5e..74d8d73ecaed4 100644 --- a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp +++ b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp @@ -155,13 +155,11 @@ std::vector get_seg_colormap(const std::stri namespace autoware::tensorrt_yolox { TrtYoloX::TrtYoloX( - const std::string & model_path, const std::string & precision, const int num_class, 
- const float score_threshold, const float nms_threshold, - autoware::tensorrt_common::BuildConfig build_config, const bool use_gpu_preprocess, - const uint8_t gpu_id, std::string calibration_image_list_path, const double norm_factor, - [[maybe_unused]] const std::string & cache_dir, - const autoware::tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size, - const std::string & color_map_path) + TrtCommonConfig & trt_config, const int num_class, const float score_threshold, + const float nms_threshold, const bool use_gpu_preprocess, const uint8_t gpu_id, + std::string calibration_image_list_path, const double norm_factor, + [[maybe_unused]] const std::string & cache_dir, const std::string & color_map_path, + const CalibrationConfig & calib_config) : gpu_id_(gpu_id), is_gpu_initialized_(false) { if (!setCudaDeviceId(gpu_id_)) { @@ -172,13 +170,29 @@ TrtYoloX::TrtYoloX( src_width_ = -1; src_height_ = -1; norm_factor_ = norm_factor; - batch_size_ = batch_config[2]; multitask_ = 0; sematic_color_map_ = get_seg_colormap(color_map_path); stream_ = makeCudaStream(); - if (precision == "int8") { - if (build_config.clip_value <= 0.0) { + trt_common_ = std::make_unique(trt_config); + + const auto network_input_dims = trt_common_->getTensorShape(0); + batch_size_ = network_input_dims.d[0]; + const auto input_channel = network_input_dims.d[1]; + const auto input_height = network_input_dims.d[2]; + const auto input_width = network_input_dims.d[3]; + + auto profile_dims_ptr = std::make_unique>(); + + std::vector profile_dims{ + autoware::tensorrt_common::ProfileDims( + 0, {4, {batch_size_, input_channel, input_height, input_width}}, + {4, {batch_size_, input_channel, input_height, input_width}}, + {4, {batch_size_, input_channel, input_height, input_width}})}; + *profile_dims_ptr = profile_dims; + + if (trt_config.precision == "int8") { + if (calib_config.clip_value <= 0.0) { if (calibration_image_list_path.empty()) { throw std::runtime_error( "calibration_image_list_path should be passed to generate int8 engine " @@ -189,19 +203,17 @@ TrtYoloX::TrtYoloX( calibration_image_list_path = ""; } - int max_batch_size = batch_size_; - nvinfer1::Dims input_dims = autoware::tensorrt_common::get_input_dims(model_path); std::vector calibration_images; if (calibration_image_list_path != "") { calibration_images = loadImageList(calibration_image_list_path, ""); } - tensorrt_yolox::ImageStream stream(max_batch_size, input_dims, calibration_images); - fs::path calibration_table{model_path}; + tensorrt_yolox::ImageStream stream(batch_size_, network_input_dims, calibration_images); + fs::path calibration_table{trt_config.onnx_path}; std::string ext = ""; - if (build_config.calib_type_str == "Entropy") { + if (calib_config.calib_type_str == "Entropy") { ext = "EntropyV2-"; } else if ( - build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + calib_config.calib_type_str == "Legacy" || calib_config.calib_type_str == "Percentile") { ext = "Legacy-"; } else { ext = "MinMax-"; @@ -209,17 +221,17 @@ TrtYoloX::TrtYoloX( ext += "calibration.table"; calibration_table.replace_extension(ext); - fs::path histogram_table{model_path}; + fs::path histogram_table{trt_config.onnx_path}; ext = "histogram.table"; histogram_table.replace_extension(ext); std::unique_ptr calibrator; - if (build_config.calib_type_str == "Entropy") { + if (calib_config.calib_type_str == "Entropy") { calibrator.reset( new tensorrt_yolox::Int8EntropyCalibrator(stream, calibration_table, norm_factor_)); } 
else if ( - build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + calib_config.calib_type_str == "Legacy" || calib_config.calib_type_str == "Percentile") { const double quantile = 0.999999; const double cutoff = 0.999999; calibrator.reset(new tensorrt_yolox::Int8LegacyCalibrator( @@ -228,22 +240,20 @@ TrtYoloX::TrtYoloX( calibrator.reset( new tensorrt_yolox::Int8MinMaxCalibrator(stream, calibration_table, norm_factor_)); } - trt_common_ = std::make_unique( - model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config); + if (!trt_common_->setupWithCalibrator( + std::move(calibrator), calib_config, std::move((profile_dims_ptr)))) { + throw std::runtime_error("Failed to setup TensorRT engine with calibrator"); + } } else { - trt_common_ = std::make_unique( - model_path, precision, nullptr, batch_config, max_workspace_size, build_config); - } - trt_common_->setup(); - - if (!trt_common_->isInitialized()) { - return; + if (!trt_common_->setup(std::move(profile_dims_ptr))) { + throw std::runtime_error("Failed to setup TensorRT engine"); + } } // Judge whether decoding output is required // Plain models require decoding, while models with EfficientNMS_TRT module don't. // If getNbBindings == 5, the model contains EfficientNMS_TRT - switch (trt_common_->getNbBindings()) { + switch (trt_common_->getNbIOTensors()) { case 2: // Specified model is plain one. // Bindings are: [input, output] @@ -273,16 +283,16 @@ TrtYoloX::TrtYoloX( */ } // GPU memory allocation - const auto input_dims = trt_common_->getBindingDimensions(0); + const auto input_dims = trt_common_->getTensorShape(0); const auto input_size = std::accumulate(input_dims.d + 1, input_dims.d + input_dims.nbDims, 1, std::multiplies()); if (needs_output_decode_) { - const auto output_dims = trt_common_->getBindingDimensions(1); - input_d_ = autoware::cuda_utils::make_unique(batch_config[2] * input_size); + const auto output_dims = trt_common_->getTensorShape(1); + input_d_ = autoware::cuda_utils::make_unique(batch_size_ * input_size); out_elem_num_ = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); - out_elem_num_ = out_elem_num_ * batch_config[2]; - out_elem_num_per_batch_ = static_cast(out_elem_num_ / batch_config[2]); + out_elem_num_ = out_elem_num_ * batch_size_; + out_elem_num_per_batch_ = static_cast(out_elem_num_ / batch_size_); out_prob_d_ = autoware::cuda_utils::make_unique(out_elem_num_); out_prob_h_ = autoware::cuda_utils::make_unique_host(out_elem_num_, cudaHostAllocPortable); @@ -298,29 +308,27 @@ TrtYoloX::TrtYoloX( output_strides_ = {8, 16, 32, 4}; } } else { - const auto out_scores_dims = trt_common_->getBindingDimensions(3); + const auto out_scores_dims = trt_common_->getTensorShape(3); max_detections_ = out_scores_dims.d[1]; - input_d_ = autoware::cuda_utils::make_unique(batch_config[2] * input_size); - out_num_detections_d_ = autoware::cuda_utils::make_unique(batch_config[2]); - out_boxes_d_ = - autoware::cuda_utils::make_unique(batch_config[2] * max_detections_ * 4); - out_scores_d_ = autoware::cuda_utils::make_unique(batch_config[2] * max_detections_); - out_classes_d_ = - autoware::cuda_utils::make_unique(batch_config[2] * max_detections_); + input_d_ = autoware::cuda_utils::make_unique(batch_size_ * input_size); + out_num_detections_d_ = autoware::cuda_utils::make_unique(batch_size_); + out_boxes_d_ = autoware::cuda_utils::make_unique(batch_size_ * max_detections_ * 4); + out_scores_d_ = 
autoware::cuda_utils::make_unique(batch_size_ * max_detections_); + out_classes_d_ = autoware::cuda_utils::make_unique(batch_size_ * max_detections_); } if (multitask_) { // Allocate buffer for segmentation segmentation_out_elem_num_ = 0; for (int m = 0; m < multitask_; m++) { const auto output_dims = - trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + trt_common_->getTensorShape(m + 2); // 0 : input, 1 : output for detections size_t out_elem_num = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); - out_elem_num = out_elem_num * batch_config[2]; + out_elem_num = out_elem_num * batch_size_; segmentation_out_elem_num_ += out_elem_num; } segmentation_out_elem_num_per_batch_ = - static_cast(segmentation_out_elem_num_ / batch_config[2]); + static_cast(segmentation_out_elem_num_ / batch_size_); segmentation_out_prob_d_ = autoware::cuda_utils::make_unique(segmentation_out_elem_num_); segmentation_out_prob_h_ = autoware::cuda_utils::make_unique_host( @@ -387,7 +395,7 @@ void TrtYoloX::initPreprocessBuffer(int width, int height) src_width_ = width; src_height_ = height; if (use_gpu_preprocess_) { - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); bool const hasRuntimeDim = std::any_of( input_dims.d, input_dims.d + input_dims.nbDims, [](int32_t input_dim) { return input_dim == -1; }); @@ -395,7 +403,9 @@ void TrtYoloX::initPreprocessBuffer(int width, int height) input_dims.d[0] = batch_size_; } if (!image_buf_h_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } scales_.clear(); } const float input_height = static_cast(input_dims.d[2]); @@ -414,7 +424,7 @@ void TrtYoloX::initPreprocessBuffer(int width, int height) size_t argmax_out_elem_num = 0; for (int m = 0; m < multitask_; m++) { const auto output_dims = - trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + trt_common_->getTensorShape(m + 2); // 0 : input, 1 : output for detections const float scale = std::min( output_dims.d[3] / static_cast(width), output_dims.d[2] / static_cast(height)); @@ -441,7 +451,7 @@ void TrtYoloX::printProfiling(void) void TrtYoloX::preprocessGpu(const std::vector & images) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; for (const auto & image : images) { @@ -469,7 +479,9 @@ void TrtYoloX::preprocessGpu(const std::vector & images) src_height_ = height; } if (!image_buf_h_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } scales_.clear(); } const float input_height = static_cast(input_dims.d[2]); @@ -495,7 +507,7 @@ void TrtYoloX::preprocessGpu(const std::vector & images) if (multitask_) { for (int m = 0; m < multitask_; m++) { const auto output_dims = - trt_common_->getBindingDimensions(m + 2); // 0: input, 1: output for detections + trt_common_->getTensorShape(m + 2); // 0: input, 1: output for detections const float scale = std::min( output_dims.d[3] / static_cast(image.cols), output_dims.d[2] / static_cast(image.rows)); @@ -530,9 +542,11 @@ void TrtYoloX::preprocessGpu(const std::vector & images) void TrtYoloX::preprocess(const std::vector & images) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = 
trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); std::vector dst_images; @@ -567,9 +581,6 @@ bool TrtYoloX::doInference( if (!setCudaDeviceId(gpu_id_)) { return false; } - if (!trt_common_->isInitialized()) { - return false; - } if (use_gpu_preprocess_) { preprocessGpu(images); @@ -588,7 +599,7 @@ void TrtYoloX::preprocessWithRoiGpu( const std::vector & images, const std::vector & rois) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; for (const auto & image : images) { @@ -610,7 +621,9 @@ void TrtYoloX::preprocessWithRoiGpu( src_height_ = height; } if (!image_buf_h_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); @@ -662,9 +675,11 @@ void TrtYoloX::preprocessWithRoi( const std::vector & images, const std::vector & rois) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); std::vector dst_images; @@ -700,7 +715,7 @@ void TrtYoloX::preprocessWithRoi( void TrtYoloX::multiScalePreprocessGpu(const cv::Mat & image, const std::vector & rois) { const auto batch_size = rois.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; @@ -722,7 +737,9 @@ void TrtYoloX::multiScalePreprocessGpu(const cv::Mat & image, const std::vector< src_height_ = height; if (!image_buf_h_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); @@ -770,9 +787,11 @@ void TrtYoloX::multiScalePreprocessGpu(const cv::Mat & image, const std::vector< void TrtYoloX::multiScalePreprocess(const cv::Mat & image, const std::vector & rois) { const auto batch_size = rois.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); std::vector dst_images; @@ -806,9 +825,6 @@ void TrtYoloX::multiScalePreprocess(const cv::Mat & image, const std::vector & images, ObjectArrays & objects, const std::vector & rois) { - if (!trt_common_->isInitialized()) { - return false; - } if (use_gpu_preprocess_) { preprocessWithRoiGpu(images, rois); } else { @@ -827,9 +843,6 @@ bool TrtYoloX::doInferenceWithRoi( bool TrtYoloX::doMultiScaleInference( const cv::Mat & image, ObjectArrays & objects, const std::vector & rois) { - if (!trt_common_->isInitialized()) { - 
return false; - } if (use_gpu_preprocess_) { multiScalePreprocessGpu(image, rois); } else { @@ -849,8 +862,10 @@ bool TrtYoloX::feedforward(const std::vector & images, ObjectArrays & o std::vector buffers = { input_d_.get(), out_num_detections_d_.get(), out_boxes_d_.get(), out_scores_d_.get(), out_classes_d_.get()}; - - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); const auto batch_size = images.size(); auto out_num_detections = std::make_unique(batch_size); @@ -903,7 +918,11 @@ bool TrtYoloX::feedforwardAndDecode( if (multitask_) { buffers = {input_d_.get(), out_prob_d_.get(), segmentation_out_prob_d_.get()}; } - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); const auto batch_size = images.size(); @@ -933,7 +952,7 @@ bool TrtYoloX::feedforwardAndDecode( static_cast(segmentation_out_elem_num_ / segmentation_out_elem_num_per_batch_); for (int m = 0; m < multitask_; m++) { const auto output_dims = - trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + trt_common_->getTensorShape(m + 2); // 0 : input, 1 : output for detections size_t out_elem_num = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); out_elem_num = out_elem_num * batch; @@ -972,7 +991,10 @@ bool TrtYoloX::multiScaleFeedforward(const cv::Mat & image, int batch_size, Obje input_d_.get(), out_num_detections_d_.get(), out_boxes_d_.get(), out_scores_d_.get(), out_classes_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); auto out_num_detections = std::make_unique(batch_size); auto out_boxes = std::make_unique(4 * batch_size * max_detections_); @@ -1020,7 +1042,10 @@ bool TrtYoloX::multiScaleFeedforwardAndDecode( const cv::Mat & image, int batch_size, ObjectArrays & objects) { std::vector buffers = {input_d_.get(), out_prob_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); CHECK_CUDA_ERROR(cudaMemcpyAsync( out_prob_h_.get(), out_prob_d_.get(), sizeof(float) * out_elem_num_, cudaMemcpyDeviceToHost, @@ -1042,7 +1067,7 @@ void TrtYoloX::decodeOutputs( float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const { ObjectArray proposals; - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); std::vector grid_strides; @@ -1298,4 +1323,9 @@ void TrtYoloX::getColorizedMask( } } +int TrtYoloX::getBatchSize() const +{ + return batch_size_; +} + } // namespace autoware::tensorrt_yolox diff --git a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp index 94a3a37a4d08f..2048279dcf3f1 100644 --- a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp +++ b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp @@ -83,19 +83,18 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options) roi_overlay_segment_labels_.ANIMAL = declare_parameter("roi_overlay_segment_label.ANIMAL"); replaceLabelMap(); - 
autoware::tensorrt_common::BuildConfig build_config( - calibration_algorithm, dla_core_id, quantize_first_layer, quantize_last_layer, - profile_per_layer, clip_value); + TrtCommonConfig trt_config( + model_path, precision, "", (1ULL << 30U), dla_core_id, profile_per_layer); + + CalibrationConfig calib_config( + calibration_algorithm, quantize_first_layer, quantize_last_layer, clip_value); const double norm_factor = 1.0; const std::string cache_dir = ""; - const autoware::tensorrt_common::BatchConfig batch_config{1, 1, 1}; - const size_t max_workspace_size = (1 << 30); trt_yolox_ = std::make_unique( - model_path, precision, label_map_.size(), score_threshold, nms_threshold, build_config, - preprocess_on_gpu, gpu_id, calibration_image_list_path, norm_factor, cache_dir, batch_config, - max_workspace_size, color_map_path); + trt_config, label_map_.size(), score_threshold, nms_threshold, preprocess_on_gpu, gpu_id, + calibration_image_list_path, norm_factor, cache_dir, color_map_path, calib_config); if (!trt_yolox_->isGPUInitialized()) { RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id); diff --git a/perception/autoware_tensorrt_yolox/src/yolox_single_image_inference_node.cpp b/perception/autoware_tensorrt_yolox/src/yolox_single_image_inference_node.cpp index 243e82c65dab9..63c0ee0cd3e8a 100644 --- a/perception/autoware_tensorrt_yolox/src/yolox_single_image_inference_node.cpp +++ b/perception/autoware_tensorrt_yolox/src/yolox_single_image_inference_node.cpp @@ -46,7 +46,9 @@ class YoloXSingleImageInferenceNode : public rclcpp::Node const auto output_image_path = declare_parameter("output_image_path", p.string() + "_detect" + ext); - auto trt_yolox = std::make_unique(model_path, precision); + auto trt_config = tensorrt_common::TrtCommonConfig(model_path, precision); + + auto trt_yolox = std::make_unique(trt_config); auto image = cv::imread(image_path); tensorrt_yolox::ObjectArrays objects; std::vector masks; diff --git a/perception/autoware_traffic_light_classifier/CMakeLists.txt b/perception/autoware_traffic_light_classifier/CMakeLists.txt index d132577743ab0..181037caebfa5 100644 --- a/perception/autoware_traffic_light_classifier/CMakeLists.txt +++ b/perception/autoware_traffic_light_classifier/CMakeLists.txt @@ -34,7 +34,6 @@ if(NVINFER AND NVONNXPARSER AND NVINFER_PLUGIN) if(CUDA_VERBOSE) message(STATUS "TensorRT is available!") message(STATUS "NVINFER: ${NVINFER}") - message(STATUS "NVPARSERS: ${NVPARSERS}") message(STATUS "NVINFER_PLUGIN: ${NVINFER_PLUGIN}") message(STATUS "NVONNXPARSER: ${NVONNXPARSER}") endif() diff --git a/perception/autoware_traffic_light_classifier/src/classifier/cnn_classifier.cpp b/perception/autoware_traffic_light_classifier/src/classifier/cnn_classifier.cpp index f9bf2509aa2ae..d47cb1500fffd 100644 --- a/perception/autoware_traffic_light_classifier/src/classifier/cnn_classifier.cpp +++ b/perception/autoware_traffic_light_classifier/src/classifier/cnn_classifier.cpp @@ -51,13 +51,10 @@ CNNClassifier::CNNClassifier(rclcpp::Node * node_ptr) : node_ptr_(node_ptr) } readLabelfile(label_file_path, labels_); - nvinfer1::Dims input_dim = autoware::tensorrt_common::get_input_dims(model_file_path); - assert(input_dim.d[0] > 0); - batch_size_ = input_dim.d[0]; - autoware::tensorrt_common::BatchConfig batch_config{batch_size_, batch_size_, batch_size_}; classifier_ = std::make_unique( - model_file_path, precision, batch_config, mean_, std_); + model_file_path, precision, mean_, std_); + batch_size_ = classifier_->getBatchSize(); if 
(node_ptr_->declare_parameter("build_only", false)) { RCLCPP_INFO(node_ptr_->get_logger(), "TensorRT engine is built and shutdown node."); rclcpp::shutdown(); diff --git a/perception/autoware_traffic_light_fine_detector/CMakeLists.txt b/perception/autoware_traffic_light_fine_detector/CMakeLists.txt index 80d6e43c098ed..dffbab0e77681 100644 --- a/perception/autoware_traffic_light_fine_detector/CMakeLists.txt +++ b/perception/autoware_traffic_light_fine_detector/CMakeLists.txt @@ -39,7 +39,6 @@ if(NVINFER AND NVONNXPARSER AND NVINFER_PLUGIN) if(CUDA_VERBOSE) message(STATUS "TensorRT is available!") message(STATUS "NVINFER: ${NVINFER}") - message(STATUS "NVPARSERS: ${NVPARSERS}") message(STATUS "NVINFER_PLUGIN: ${NVINFER_PLUGIN}") message(STATUS "NVONNXPARSER: ${NVONNXPARSER}") endif() diff --git a/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp b/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp index 37ffca4a9526c..d9e0471a43d62 100644 --- a/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp +++ b/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp @@ -75,21 +75,18 @@ TrafficLightFineDetectorNode::TrafficLightFineDetectorNode(const rclcpp::NodeOpt RCLCPP_ERROR(this->get_logger(), "Could not find tlr id"); } - const autoware::tensorrt_common::BuildConfig build_config = - autoware::tensorrt_common::BuildConfig("MinMax", -1, false, false, false, 0.0); - const bool cuda_preprocess = true; const std::string calib_image_list = ""; const double scale = 1.0; const std::string cache_dir = ""; - nvinfer1::Dims input_dim = autoware::tensorrt_common::get_input_dims(model_path); - assert(input_dim.d[0] > 0); - batch_size_ = input_dim.d[0]; - const autoware::tensorrt_common::BatchConfig batch_config{batch_size_, batch_size_, batch_size_}; + + auto trt_config = autoware::tensorrt_common::TrtCommonConfig(model_path, precision); trt_yolox_ = std::make_unique( - model_path, precision, num_class, score_thresh_, nms_threshold, build_config, cuda_preprocess, - gpu_id, calib_image_list, scale, cache_dir, batch_config); + trt_config, num_class, score_thresh_, nms_threshold, cuda_preprocess, gpu_id, calib_image_list, + scale, cache_dir); + + batch_size_ = trt_yolox_->getBatchSize(); if (!trt_yolox_->isGPUInitialized()) { RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id);
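
Note for reviewers porting other packages: the changes above converge on a single call pattern for the refactored tensorrt_common API: construct a TrtCommonConfig from the ONNX path and precision, let setup() build or deserialize the engine (it now reports failure directly instead of exposing isInitialized()), query I/O shapes with getTensorShape(), register device pointers with setTensorsAddresses(), and launch with enqueueV3(). The sketch below is a minimal standalone illustration of that flow, not code taken from this patch: the include path, the "model.onnx" filename, the float-only static-shape I/O tensors, and the error handling are assumptions.

// Minimal consumer of the refactored TrtCommon API (illustrative sketch only).
#include <cuda_runtime_api.h>

#include <stdexcept>
#include <vector>

#include <autoware/tensorrt_common/tensorrt_common.hpp>  // assumed header location

int main()
{
  using autoware::tensorrt_common::TrtCommon;
  using autoware::tensorrt_common::TrtCommonConfig;

  // Engine path, workspace size, DLA core, etc. take the config defaults;
  // "model.onnx" is a placeholder path.
  TrtCommonConfig trt_config("model.onnx", "fp16");
  TrtCommon trt_common(trt_config);

  // setup() replaces the old setup()/isInitialized() pair.
  if (!trt_common.setup()) {
    throw std::runtime_error("Failed to setup TensorRT");
  }

  // getTensorShape() replaces getBindingDimensions(); static shapes assumed here.
  const auto input_dims = trt_common.getTensorShape(0);
  const auto output_dims = trt_common.getTensorShape(1);
  size_t input_size = 1;
  size_t output_size = 1;
  for (int32_t i = 0; i < input_dims.nbDims; ++i) input_size *= input_dims.d[i];
  for (int32_t i = 0; i < output_dims.nbDims; ++i) output_size *= output_dims.d[i];

  // Plain CUDA allocations stand in for the autoware::cuda_utils helpers used in the patch.
  void * input_d = nullptr;
  void * output_d = nullptr;
  cudaMalloc(&input_d, input_size * sizeof(float));
  cudaMalloc(&output_d, output_size * sizeof(float));
  cudaStream_t stream{};
  cudaStreamCreate(&stream);

  // enqueueV2(bindings, stream, nullptr) becomes setTensorsAddresses(buffers) + enqueueV3(stream).
  std::vector<void *> buffers = {input_d, output_d};
  if (!trt_common.setTensorsAddresses(buffers)) {
    return 1;
  }
  trt_common.enqueueV3(stream);
  cudaStreamSynchronize(stream);

  cudaFree(input_d);
  cudaFree(output_d);
  cudaStreamDestroy(stream);
  return 0;
}

Packages that need INT8 quantization or explicit optimization profiles follow the same flow but hand a CalibrationConfig and ProfileDims vector to setupWithCalibrator() or setup(), as the autoware_tensorrt_yolox changes above demonstrate.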