TensorRT作为NVIDIA推出的高性能深度学习推理引擎,在工业部署中扮演着关键角色。本文将深入解析如何通过C++ API实现TensorRT推理全流程,从模型转换到性能优化,为需要低延迟、高吞吐量部署场景的开发者提供完整解决方案。
TensorRT的核心优化技术包括层融合(Layer Fusion)、精度校准(Precision Calibration)和动态张量内存管理。其运行时引擎由Builder和Runtime两部分组成:
// Create the TensorRT builder, network definition, and builder config.
// `logger` must be an nvinfer1::ILogger implementation; `flags` selects
// network creation options (e.g. kEXPLICIT_BATCH for ONNX models).
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(flags);
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
cpp复制auto parser = nvonnxparser::createParser(*network, logger);
parser->parseFromFile(model_path, static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));
cpp复制config->setMaxWorkspaceSize(1 << 30); // 1GB工作空间
if (builder->platformHasFastFp16()) {
config->setFlag(nvinfer1::BuilderFlag::kFP16);
}
// --- Phase 1: serialize a built engine so it can be cached on disk ---
// `engine` here is the ICudaEngine produced by the builder (e.g. via
// buildEngineWithConfig); serialization avoids rebuilding on every start.
nvinfer1::IHostMemory* serializedModel = engine->serialize();
// --- Phase 2 (a later run): deserialize the cached blob back into an engine ---
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(serializedData, serializedSize);
cpp复制nvinfer1::IExecutionContext* context = engine->createExecutionContext();
// 绑定输入输出缓冲区
void* buffers[inputIndex + outputIndex];
cudaMalloc(&buffers[inputIndex], inputSize);
cudaMalloc(&buffers[outputIndex], outputSize);
cpp复制cudaStream_t stream;
cudaStreamCreate(&stream);
context->enqueueV2(buffers, stream, nullptr);
cudaStreamSynchronize(stream);
// Benchmark synchronous execution. executeV2 blocks until the inference
// completes, so wall-clock time over `iterations` runs is a fair measure.
// NOTE(review): run a few warm-up iterations before timing — lazy CUDA
// context initialization and clock ramp-up inflate the first measurements.
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i) {
    context->executeV2(buffers);
}
auto end = std::chrono::high_resolution_clock::now();
| 参数 | 推荐值 | 影响范围 |
|---|---|---|
| Workspace Size | 1-2GB | 影响优化器可用内存 |
| Batch Size | 2的n次方 | 提升并行效率 |
| FP16模式 | 硬件支持时启用 | 提升50%+性能 |
输入维度必须与模型定义完全一致,包括batch维度
cpp复制cudaError_t err = cudaMalloc(&ptr, size);
if (err != cudaSuccess) {
logger.log(ILogger::Severity::kERROR, cudaGetErrorString(err));
}
cpp复制class Logger : public nvinfer1::ILogger {
void log(Severity severity, const char* msg) override {
if (severity <= Severity::kVERBOSE) {
std::cout << msg << std::endl;
}
}
};
cpp复制auto profile = builder->createOptimizationProfile();
profile->setDimensions(inputName, OptProfileSelector::kMIN, Dims4{1,3,224,224});
profile->setDimensions(inputName, OptProfileSelector::kOPT, Dims4{8,3,224,224});
config->addOptimizationProfile(profile);
在实际部署中,我发现合理设置CUDA Graph可以进一步提升10-15%的推理性能。通过cudaGraphLaunch捕获计算图后,能显著减少内核启动开销。对于固定输入尺寸的模型,这是值得尝试的优化手段。