TensorRT 10 API Usage and a Deployment Example
After TensorRT moved to the 10.x series, some of its interfaces changed. This article briefly introduces the interfaces used to deploy a model with TensorRT and walks through a deployment example. The overall workflow is the same as before: build the inference engine from an ONNX model, then load the engine and run inference.
1. Prepare the logger
Both the engine-building code and the inference code need a logger, which records warnings and errors produced during building and inference. This logger class has to be implemented by hand and must inherit from TensorRT's nvinfer1::ILogger.
My implementation is shown below. The header trtlogger.h:
#ifndef __LOGGER_HPP__
#define __LOGGER_HPP__

#include <NvInfer.h>
#include <string>
#include <stdarg.h>
#include <memory>

#define LOGF(...) trtlogger::Logger::__log_info(trtlogger::Level::FATAL, __VA_ARGS__)
#define LOGE(...) trtlogger::Logger::__log_info(trtlogger::Level::ERROR, __VA_ARGS__)
#define LOGW(...) trtlogger::Logger::__log_info(trtlogger::Level::WARN,  __VA_ARGS__)
#define LOG(...)  trtlogger::Logger::__log_info(trtlogger::Level::INFO,  __VA_ARGS__)
#define LOGV(...) trtlogger::Logger::__log_info(trtlogger::Level::VERB,  __VA_ARGS__)
#define LOGD(...) trtlogger::Logger::__log_info(trtlogger::Level::DEBUG, __VA_ARGS__)

#define DGREEN "\033[1;36m"
#define BLUE   "\033[1;34m"
#define PURPLE "\033[1;35m"
#define GREEN  "\033[1;32m"
#define YELLOW "\033[1;33m"
#define RED    "\033[1;31m"
#define CLEAR  "\033[0m"

namespace trtlogger{

enum class Level : int32_t{
    FATAL = 0,
    ERROR = 1,
    WARN  = 2,
    INFO  = 3,
    VERB  = 4,
    DEBUG = 5
};

class Logger : public nvinfer1::ILogger{
public:
    Logger();
    Logger(Level level);
    // callback invoked by TensorRT itself
    virtual void log(Severity severity, const char* msg) noexcept override;
    // formatted logging used by the LOGx macros
    static void __log_info(Level level, const char* format, ...);
    Severity get_severity(Level level);
    Level    get_level(Severity severity);

private:
    static Level m_level;
    Severity     m_severity;
};

std::shared_ptr<Logger> create_logger(Level level);

} // namespace trtlogger

#endif //__LOGGER_HPP__
The corresponding trtlogger.cpp:
#include "TrtLogger.hpp"
#include <NvInfer.h>
#include <cstdlib>using namespace std;namespace trtlogger {Level Logger::m_level = Level::INFO;Logger::Logger(Level level) {m_level = level;m_severity = get_severity(level);
}Logger::Severity Logger::get_severity(Level level) {switch (level) {case Level::FATAL: return Severity::kINTERNAL_ERROR;case Level::ERROR: return Severity::kERROR;case Level::WARN: return Severity::kWARNING;case Level::INFO: return Severity::kINFO;case Level::VERB: return Severity::kVERBOSE;default: return Severity::kVERBOSE;}
}Level Logger::get_level(Severity severity) {switch (severity) {case Severity::kINTERNAL_ERROR: return Level::FATAL;case Severity::kERROR: return Level::ERROR;case Severity::kWARNING: return Level::WARN;case Severity::kINFO: return Level::INFO;case Severity::kVERBOSE: return Level::VERB;default: return Level::FATAL;}
}void Logger::log (Severity severity, const char* msg) noexcept{/* 有的时候TensorRT给出的log会比较多并且比较细,所以我们选择将TensorRT的打印log的级别稍微约束一下- TensorRT的log级别如果是FATAL, ERROR, WARNING, 按照正常方式打印- TensorRT的log级别如果是INFO或者是VERBOSE的时候,只有当logger的level在大于VERBOSE的时候再打出*/if (severity <= get_severity(Level::WARN)|| m_level >= Level::DEBUG)__log_info(get_level(severity), "%s", msg);
}void Logger::__log_info(Level level, const char* format, ...) {char msg[1000];va_list args;va_start(args, format);int n = 0;switch (level) {case Level::DEBUG: n += snprintf(msg + n, sizeof(msg) - n, DGREEN "[debug]" CLEAR); break;case Level::VERB: n += snprintf(msg + n, sizeof(msg) - n, PURPLE "[verb]" CLEAR); break;case Level::INFO: n += snprintf(msg + n, sizeof(msg) - n, YELLOW "[info]" CLEAR); break;case Level::WARN: n += snprintf(msg + n, sizeof(msg) - n, BLUE "[warn]" CLEAR); break;case Level::ERROR: n += snprintf(msg + n, sizeof(msg) - n, RED "[error]" CLEAR); break;default: n += snprintf(msg + n, sizeof(msg) - n, RED "[fatal]" CLEAR); break;}n += vsnprintf(msg + n, sizeof(msg) - n, format, args);va_end(args);if (level <= m_level) fprintf(stdout, "%s\n", msg);if (level <= Level::ERROR) {fflush(stdout);exit(0);}
}shared_ptr<Logger> create_logger(Level level) {return make_shared<Logger>(level);
}} // namespace logger
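Before wiring the logger into TensorRT, it can be exercised on its own. The sketch below assumes the header above; the engine path string is only an example. The same logger object is later passed to createInferBuilder and createInferRuntime.

#include "trtlogger.h"

int main(){
    // constructing the logger sets the static threshold used by the macros
    auto logger = trtlogger::create_logger(trtlogger::Level::INFO);

    LOG("loading engine from %s", "model.engine");  // printed: INFO <= INFO
    LOGV("verbose detail");                         // suppressed: VERB > INFO
    return 0;
}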
2. Build the inference engine
Load the ONNX model and build the inference engine. This part changes little compared with TensorRT 8; the build boils down to the following steps:
1. Declare the core build objects: (logger), builder, network, config, parser
2. Configure optimization options: dynamic shape, calibration, DLA
3. Set the workspace size
4. Serialize the engine and save it to a file
Reusing the logger from the previous section, the complete code for building the inference engine with the TensorRT API is as follows:
// headers this snippet relies on
#include <fstream>
#include <iostream>
#include <memory>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include "trtlogger.h"

bool genEngine(std::string onnx_file_path, std::string save_engine_path, trtlogger::Level level, int maxbatch){

    auto logger = std::make_shared<trtlogger::Logger>(level);

    // create the builder
    auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(*logger));
    if(!builder){
        std::cout<<" (T_T)~~~, Failed to create builder."<<std::endl;
        return false;
    }

    // create the network definition
    auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(0U));
    if(!network){
        std::cout<<" (T_T)~~~, Failed to create network."<<std::endl;
        return false;
    }

    // create the builder config
    auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    if(!config){
        std::cout<<" (T_T)~~~, Failed to create config."<<std::endl;
        return false;
    }

    // create the ONNX parser; without it every layer would have to be built by hand
    auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, *logger));
    if(!parser){
        std::cout<<" (T_T)~~~, Failed to create parser."<<std::endl;
        return false;
    }

    // read the ONNX file and populate the network
    auto parsed = parser->parseFromFile(onnx_file_path.c_str(), 1);
    if(!parsed){
        std::cout<<" (T_T)~~~ ,Failed to parse onnx file."<<std::endl;
        return false;
    }

    {
        // configure the dynamic-shape optimization profile (min / opt / max batch)
        auto input      = network->getInput(0);
        auto input_dims = input->getDimensions();
        auto profile    = builder->createOptimizationProfile();

        input_dims.d[0] = 1;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);

        input_dims.d[0] = maxbatch;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
        config->addOptimizationProfile(profile);

        // optionally build with half precision
        // if(FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16);

        // prefer DLA with GPU fallback
        config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
        config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);

        // check DLA core availability
        int numDLACores = builder->getNbDLACores();
        if (numDLACores > 0) {
            std::cout << "DLA is available. Number of DLA cores: " << numDLACores << std::endl;
            int coreToUse = 0;   // use the first DLA core (change as needed)
            config->setDLACore(coreToUse);
            std::cout << "Using DLA core: " << coreToUse << std::endl;
        } else {
            std::cerr << "DLA not available on this platform, falling back to GPU." << std::endl;
            config->setDefaultDeviceType(nvinfer1::DeviceType::kGPU);
        }
    }

    // workspace limit; replaces setMaxWorkspaceSize in newer versions
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1 << 28);

    // build and serialize the engine
    auto plan = std::unique_ptr<nvinfer1::IHostMemory>(builder->buildSerializedNetwork(*network, *config));
    if(!plan){
        std::cout<<" (T_T)~~~, Failed to SerializedNetwork."<<std::endl;
        return false;
    }

    //! print the network inputs and outputs to verify them
    auto numInput = network->getNbInputs();
    std::cout<<"Number of model inputs: "<<numInput<<std::endl;
    for(auto i = 0; i < numInput; ++i){
        std::cout<<" model input "<<i<<":";
        auto mInputDims = network->getInput(i)->getDimensions();
        std::cout<<" ✨~ model input dims: "<<mInputDims.nbDims<<std::endl;
        for(int ii = 0; ii < mInputDims.nbDims; ++ii){
            std::cout<<" ✨^_^ model input dim"<<ii<<": "<<mInputDims.d[ii]<<std::endl;
        }
    }

    auto numOutput = network->getNbOutputs();
    std::cout<<"Number of model outputs: "<<numOutput<<std::endl;
    for(auto i = 0; i < numOutput; ++i){
        std::cout<<" model output "<<i<<":";
        auto mOutputDims = network->getOutput(i)->getDimensions();
        std::cout<<" ✨~ model output dims: "<<mOutputDims.nbDims<<std::endl;
        for(int jj = 0; jj < mOutputDims.nbDims; ++jj){
            std::cout<<" ✨^_^ model output dim"<<jj<<": "<<mOutputDims.d[jj]<<std::endl;
        }
    }

    // save the serialized engine to file
    std::ofstream engine_file(save_engine_path, std::ios::binary);
    if(!engine_file.good()){
        std::cout<<" (T_T)~~~, Failed to open engine file"<<std::endl;
        return false;
    }
    engine_file.write((char *)plan->data(), plan->size());
    engine_file.close();

    std::cout << " ~~Congratulations! 🎉🎉🎉~ Engine build success!!! ✨✨✨~~ " << std::endl;
    return true;
}
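A minimal call site might look like the sketch below; the file names and the maximum batch size of 4 are placeholders, not values taken from a real project:

int main(){
    // build an engine from an ONNX model with a dynamic batch of up to 4
    bool ok = genEngine("model.onnx",            // ONNX file to parse (placeholder name)
                        "model.engine",          // serialized engine to write (placeholder name)
                        trtlogger::Level::INFO,  // logging threshold
                        4);                      // kMAX batch of the optimization profile
    return ok ? 0 : -1;
}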
In TensorRT 10, the step that parses the ONNX model when building the inference engine is provided by the nvonnxparser dynamic library.
3. Connecting input data -- engine -- output data
The prepared data is handed to the engine, the engine runs inference on it, and the result comes back out: B = engine(A), where A is data in a format the model accepts and B is the inference result. The engine therefore has to be connected to both A and B. In the current version this binding is done with the context->setTensorAddress(name, buffers.getDeviceBuffer(name)) interface.
bool TrtModel::trtIOMemory() {

    m_inputDims  = m_context->getTensorShape("images");
    m_outputDims = m_context->getTensorShape("output0");

    // Alternative: query shapes generically from the recorded tensor names
    // for(size_t i = 0; i < IOName.size(); ++i){
    //     m_IODims.push_back(m_context->getTensorShape(IOName[i].c_str()));
    // }

    this->kInputH = m_inputDims.d[2];
    this->kInputW = m_inputDims.d[3];

    m_inputSize  = m_inputDims.d[0] * m_inputDims.d[1] * m_inputDims.d[2] * m_inputDims.d[3] * sizeof(float);
    m_outputSize = m_outputDims.d[0] * m_outputDims.d[1] * m_outputDims.d[2] * sizeof(float);

    // device buffers bound to the engine's input and output tensors
    checkRuntime(cudaMalloc(&buffers[0], m_inputSize));
    checkRuntime(cudaMalloc(&buffers[1], m_outputSize));

    // host/device buffers used around pre- and post-processing
    checkRuntime(cudaMallocHost(&m_inputMemory, m_inputSize));
    checkRuntime(cudaMallocHost(&m_outputMemory[0], m_outputSize));   // cpu output
    checkRuntime(cudaMalloc(&m_outputMemory[1], m_outputSize));
    checkRuntime(cudaMalloc(&m_outputMemory[2], (1 + kMaxNumOutputBbox * kNumBoxElement) * sizeof(float)));
    checkRuntime(cudaMallocHost(&m_outputMemory[3], (1 + kMaxNumOutputBbox * kNumBoxElement) * sizeof(float)));

    // bind the device buffers to the engine's named tensors
    m_context->setTensorAddress("images",  buffers[0]);
    m_context->setTensorAddress("output0", buffers[1]);

    // Alternative: bind generically by tensor name
    // for(size_t i = 0; i < IOName.size(); ++i){
    //     m_context->setTensorAddress(IOName[i].c_str(), buffers[i]);
    // }

    checkRuntime(cudaStreamCreate(&m_stream));

    return true;
}
This function allocates the memory for the model's inputs and outputs and then binds those buffers to the engine's input and output tensors.
4. Run inference
Inference needs three core objects, in addition to the loaded engine file and the logger: the runtime, the deserialized engine, and the execution context.
bool TrtModel::Runtime(std::string engine_file_path, trtlogger::Level level, int maxBatch){

    auto logger = std::make_shared<trtlogger::Logger>(level);

    // read the serialized engine file into memory
    std::ifstream engineFile(engine_file_path, std::ios::binary);
    long int fsize = 0;
    engineFile.seekg(0, engineFile.end);
    fsize = engineFile.tellg();
    engineFile.seekg(0, engineFile.beg);
    std::vector<char> engineString(fsize);
    engineFile.read(engineString.data(), fsize);
    if (engineString.size() == 0) {
        std::cout << "Failed getting serialized engine!" << std::endl;
        return false;
    }

    // create the runtime
    m_runtime.reset(nvinfer1::createInferRuntime(*logger));
    if(!m_runtime){
        std::cout<<" (T_T)~~~, Failed to create runtime."<<std::endl;
        return false;
    }

    // deserialize the engine
    m_engine.reset(m_runtime->deserializeCudaEngine(engineString.data(), fsize));
    if(!m_engine){
        std::cout<<" (T_T)~~~, Failed to deserialize."<<std::endl;
        return false;
    }

    // inspect the optimized engine's input and output tensors
    // int nbBindings = m_engine->getNbBindings();   // API before TensorRT 8.5
    auto num_tensors = m_engine->getNbIOTensors();   // API since TensorRT 8.5
    for(auto i = 0; i < num_tensors; ++i){
        std::string name = std::string(m_engine->getIOTensorName(i));   // tensor name
        auto shape = m_engine->getTensorShape(name.c_str());
        IOName.push_back(name);
        auto dtype = m_engine->getTensorDataType(name.c_str());         // tensor data type
        bool input = (m_engine->getTensorIOMode(name.c_str()) == nvinfer1::TensorIOMode::kINPUT);
        if(input){
            std::cout<<"input of model:"<<std::endl;
        }else{
            std::cout<<"output of model:"<<std::endl;
        }
        for(auto j = 0; j < shape.nbDims; ++j){
            std::cout<< j <<" dims: "<<shape.d[j]<<std::endl;
        }
    }

    // create the execution context
    m_context.reset(m_engine->createExecutionContext());
    if(!m_context){
        std::cout<<" (T_T)~~~, Failed to create ExecutionContext."<<std::endl;
        return false;
    }

    // set the actual input shape for the dynamic batch dimension
    auto input_dims = m_context->getTensorShape("images");
    input_dims.d[0] = maxBatch;
    m_context->setInputShape("images", input_dims);

    std::cout << " ~~Congratulations! 🎉🎉🎉~ create execution context success!!! ✨✨✨~~ " << std::endl;
    return true;
}
This code loads the serialized engine, creates the runtime with createInferRuntime, deserializes the engine with deserializeCudaEngine, and creates the execution context with createExecutionContext. Once everything is in place, a single call
bool status = this->m_context->enqueueV3(m_stream);
to enqueueV3 launches inference. The results are written to the bound output address, so post-processing can read them directly from that address.
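Putting the pieces together, one inference step over the bound buffers typically looks like the sketch below. The member name doInfer is hypothetical; the buffers, sizes and stream are the fields allocated in trtIOMemory above, and preprocessing into m_inputMemory is assumed to have happened already:

// hypothetical member function; names follow the fields used above
bool TrtModel::doInfer() {
    // copy the preprocessed input from pinned host memory to the bound device buffer
    checkRuntime(cudaMemcpyAsync(buffers[0], m_inputMemory, m_inputSize,
                                 cudaMemcpyHostToDevice, m_stream));

    // launch inference; the engine reads and writes the addresses set via setTensorAddress
    bool status = this->m_context->enqueueV3(m_stream);
    if(!status){
        std::cout<<" (T_T)~~~, Failed to enqueue inference."<<std::endl;
        return false;
    }

    // copy the raw output back to pinned host memory for post-processing (decoding, NMS, ...)
    checkRuntime(cudaMemcpyAsync(m_outputMemory[0], buffers[1], m_outputSize,
                                 cudaMemcpyDeviceToHost, m_stream));

    // wait for all work on this stream to finish before reading the results
    checkRuntime(cudaStreamSynchronize(m_stream));
    return true;
}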
Overall the deployment workflow is exactly the same as before; the API changes mainly affect how the engine's inputs and outputs are bound and how inference is launched.