YOLOv5 Windows环境下的C++部署(GPU)
YOLOv5 Windows环境下的C++部署(GPU)
文章目录
前言
最近在学习pytorch模型的c++部署,查阅网上资料时发现了很多优秀的博主写的详细的教程,但大部分是以前的版本,所以在此整理记录一下新版的yolov5 c++部署
1、环境介绍
windows环境:
pytorch 1.11.0、yolov5 v6.0
2、环境配置
打开VS2017,新建一个控制台应用程序c++_test。
由于libtorch只能在64位Windows上运行,因此需要将项目配置改为Release x64,后面所有的项目配置都以Release x64为准;如果需要调试版(Debug x64),按照本教程相同的步骤配置即可(注意此时需要改用Debug版本的第三方库)。
在项目中配置opencv和libtorch,依次选择项目、c++_test属性、VC++目录,包含目录中添加:
D:\opencv\build\include
D:\opencv\build\include\opencv2
D:\libtorch\libtorch-win-shared-with-deps-1.11.0+cu113\libtorch\include
D:\libtorch\libtorch-win-shared-with-deps-1.11.0+cu113\libtorch\include\torch\csrc\api\include
库目录中添加:
D:\opencv\build\x64\vc14\lib
D:\libtorch\libtorch-win-shared-with-deps-1.11.0+cu113\libtorch\lib
再依次选择链接器、输入,附加依赖项中添加:
D:\opencv\build\x64\vc14\lib 文件夹下面的opencv_world420.lib
D:\libtorch\libtorch-win-shared-with-deps-1.11.0+cu113\libtorch\lib 文件夹下面的所有lib文件。
注意vs的项目属性配置有release和debug两个版本,由于使用的第三方库是release版本的,所以项目的属性配置也需要是release版的
3、.torchscript.pt版本模型导出
TorchScript是PyTorch模型(nn.Module的子类)的中间表示,可以在高性能环境(例如C++,但不限于C++)中运行,并且可以通过Python进行使用和导出。导出代码如下所示(注意这里导出的是GPU版本:GPU版本的模型既可以使用GPU也可以使用CPU进行推理,而CPU版本仅支持使用CPU进行推理):
"""Exports a YOLOv5 *.pt model to ONNX and TorchScript formats
Usage:
$ export PYTHONPATH="$PWD" && python models/export.py --weights ./weights/yolov5s.pt --img 640 --batch 1
"""
import argparse
import sys
import time
sys.path.append('./') # to run '$ python *.py' files in subdirectories
import torch
import torch.nn as nn
import models
from models.experimental import attempt_load
from utils.activations import Hardswish, SiLU
from utils.general import set_logging, check_img_size
if __name__ == '__main__':
    # ---- Command-line options -------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='./last.pt', help='weights path')  # from yolov5/models/
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size')  # height, width
    # Accepted for command-line compatibility; this TorchScript-only script never reads it.
    parser.add_argument('--dynamic', action='store_true', help='dynamic ONNX axes')
    parser.add_argument('--batch-size', type=int, default=1, help='batch size')
    opt = parser.parse_args()
    opt.img_size *= 2 if len(opt.img_size) == 1 else 1  # a single value N becomes [N, N]
    print(opt)
    set_logging()
    t = time.time()

    # ---- Load the FP32 PyTorch model directly onto the GPU ----------------
    model = attempt_load(opt.weights, map_location=torch.device('cuda'))

    # ---- Checks -----------------------------------------------------------
    gs = int(max(model.stride))  # grid size (max stride)
    opt.img_size = [check_img_size(x, gs) for x in opt.img_size]  # verify img_size are gs-multiples

    # ---- Dummy input on the GPU, used for the dry run and for tracing ------
    img = torch.zeros(opt.batch_size, 3, *opt.img_size).to(device='cuda')

    model.eval()

    # ---- Update model -----------------------------------------------------
    for k, m in model.named_modules():
        m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility
        if isinstance(m, models.common.Conv):  # assign export-friendly activations
            if isinstance(m.act, nn.Hardswish):
                m.act = Hardswish()
            elif isinstance(m.act, nn.SiLU):
                m.act = SiLU()
        # elif isinstance(m, models.yolo.Detect):
        #     m.forward = m.forward_export  # assign forward (optional)

    # Leave the Detect() layer with export=False.
    # NOTE(review): presumably this keeps the traced model returning decoded
    # predictions, which is what the C++ post-processing consumes -- confirm
    # against models/yolo.py of the yolov5 version in use.
    model.model[-1].export = False
    model(img)  # dry run

    # ---- TorchScript export -----------------------------------------------
    try:
        print('\nStarting TorchScript export with torch %s...' % torch.__version__)
        f = opt.weights.replace('.pt', '.GPU_torchscript.pt')  # filename
        ts = torch.jit.trace(model, img)
        ts.save(f)
        print('TorchScript export success, saved as %s' % f)
    except Exception as e:
        print('TorchScript export failure: %s' % e)

    print('\nExport complete (%.2fs). Visualize with https://github.com/lutzroeder/netron.' % (time.time() - t))
4、c++中调用模型并进行推理
需要准备的文件:上一步导出的TorchScript模型(文件名形如yolov5s.GPU_torchscript.pt)和coco.names。coco.names是存放标签名称的文件,每行一个类别名。完整c++代码如下:
#include <time.h>

#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include <torch/script.h>
#include <torch/torch.h>

#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui_c.h>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
using namespace cv;
/// @brief Reads class labels from a text file, one label per line.
///
/// A trailing carriage return is removed from every line, so a file saved with
/// CRLF line endings yields clean labels even when the stream does not
/// translate line endings.
///
/// @param path Path of the label file (e.g. coco.names).
/// @return The labels in file order. Empty if the file could not be opened, in
///         which case an error message is written to std::cerr.
std::vector<std::string> LoadNames(const std::string& path)
{
    std::vector<std::string> class_names;

    std::ifstream infile(path);
    if (!infile.is_open()) {
        std::cerr << "Error loading the class names!\n";
        return class_names;
    }

    std::string line;
    while (std::getline(infile, line)) {
        // Drop the '\r' left behind by CRLF files; it would otherwise become
        // part of the label text.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        class_names.emplace_back(line);
    }
    // The stream is closed by its destructor.
    return class_names;
}
/// @brief Resizes an image to fit inside out_size while keeping its aspect
///        ratio, then pads the remainder with a constant gray border.
///
/// @param src      Input image.
/// @param dst      Receives the resized and padded image.
/// @param out_size Target size of the padded image.
/// @return {left padding, top padding, scale factor} -- the values needed to
///         map coordinates in dst back to src.
std::vector<float> LetterboxImage(const cv::Mat& src, cv::Mat& dst, const cv::Size& out_size)
{
    const float src_h = static_cast<float>(src.rows);
    const float src_w = static_cast<float>(src.cols);
    const float target_h = static_cast<float>(out_size.height);
    const float target_w = static_cast<float>(out_size.width);

    // The smaller of the two ratios guarantees that both sides fit.
    const float scale = std::min(target_w / src_w, target_h / src_h);

    const int resized_h = static_cast<int>(src_h * scale);
    const int resized_w = static_cast<int>(src_w * scale);
    cv::resize(src, dst, cv::Size(resized_w, resized_h));

    // Split the leftover space between both sides; when it is odd the extra
    // pixel goes to the bottom / right edge.
    const int gap_h = static_cast<int>(target_h) - resized_h;
    const int gap_w = static_cast<int>(target_w) - resized_w;
    const int top = gap_h / 2;
    const int down = (gap_h + 1) / 2;
    const int left = gap_w / 2;
    const int right = (gap_w + 1) / 2;

    cv::copyMakeBorder(dst, dst, top, down, left, right, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));

    return std::vector<float>{ static_cast<float>(left), static_cast<float>(top), scale };
}
// Column indices of one detection row after post-processing.
// Enumerators are numbered consecutively starting from 0.
enum Det
{
    tl_x = 0,   // column 0: top-left x
    tl_y,       // column 1: top-left y
    br_x,       // column 2: bottom-right x
    br_y,       // column 3: bottom-right y
    score,      // column 4: confidence score
    class_idx   // column 5: class index
};
// One detected object: its bounding box, confidence score and class index.
// Members carry default initializers so a default-constructed Detection never
// holds indeterminate values.
struct Detection
{
    cv::Rect bbox;          // bounding box in pixel coordinates
    float score{ 0.0f };    // confidence score of the detection
    int class_idx{ 0 };     // index into the list of class names
};
/// @brief Copies boxes and scores out of tensor accessors into the plain
///        containers consumed by the NMS step.
///
/// @param offset_boxes   Accessor whose rows hold box corners in the Det column order.
/// @param det            Accessor whose Det::score column holds the score of each row.
/// @param offset_box_vec Receives one cv::Rect per row of offset_boxes (appended).
/// @param score_vec      Receives one score per row of offset_boxes (appended).
void Tensor2Detection(const at::TensorAccessor<float, 2>& offset_boxes,
    const at::TensorAccessor<float, 2>& det,
    std::vector<cv::Rect>& offset_box_vec,
    std::vector<float>& score_vec)
{
    const auto row_count = offset_boxes.size(0);
    for (int row = 0; row < row_count; ++row) {
        const auto corners = offset_boxes[row];
        // Coordinates are converted from float to the integer type of cv::Point.
        const cv::Point top_left(corners[Det::tl_x], corners[Det::tl_y]);
        const cv::Point bottom_right(corners[Det::br_x], corners[Det::br_y]);
        offset_box_vec.emplace_back(top_left, bottom_right);
        score_vec.push_back(det[row][Det::score]);
    }
}
/// @brief Maps detection boxes from the letterboxed image back to the original
///        image, in place.
///
/// For every box the padding is subtracted, the result is divided by the
/// resize scale and then clipped to the image bounds.
///
/// @param data      Detections whose bbox is rewritten in place.
/// @param pad_w     Padding subtracted from x coordinates.
/// @param pad_h     Padding subtracted from y coordinates.
/// @param scale     Resize factor the coordinates are divided by.
/// @param img_shape Size of the original image, used as the clipping range.
void ScaleCoordinates(std::vector<Detection>& data, float pad_w, float pad_h,
    float scale, const cv::Size& img_shape)
{
    auto clip = [](float n, float lower, float upper)
    {
        return std::max(lower, std::min(n, upper));
    };

    const float max_x = static_cast<float>(img_shape.width);
    const float max_y = static_cast<float>(img_shape.height);

    for (auto& detection : data) {
        const cv::Point tl = detection.bbox.tl();
        const cv::Point br = detection.bbox.br();

        // Undo the padding, undo the scaling, then clip to the image.
        const float x1 = clip((tl.x - pad_w) / scale, 0.0f, max_x);
        const float y1 = clip((tl.y - pad_h) / scale, 0.0f, max_y);
        const float x2 = clip((br.x - pad_w) / scale, 0.0f, max_x);
        const float y2 = clip((br.y - pad_h) / scale, 0.0f, max_y);

        // cv::Rect stores integers, so the fractional part is dropped here.
        detection.bbox = cv::Rect(cv::Point(static_cast<int>(x1), static_cast<int>(y1)),
                                  cv::Point(static_cast<int>(x2), static_cast<int>(y2)));
    }
}
/// @brief Converts boxes from (center x, center y, width, height) to
///        (x1, y1, x2, y2) corner format.
///
/// @param x Tensor whose dimension 1 holds center x, center y, width, height
///          in columns 0..3.
/// @return A new tensor shaped like x with the corner coordinates written to
///         the Det::tl_x, Det::tl_y, Det::br_x and Det::br_y columns.
torch::Tensor xywh2xyxy(const torch::Tensor& x)
{
    const auto center_x = x.select(1, 0);
    const auto center_y = x.select(1, 1);
    const auto half_w = x.select(1, 2).div(2);
    const auto half_h = x.select(1, 3).div(2);

    auto y = torch::zeros_like(x);
    // select() returns a view into y, so copy_() writes through to y itself.
    y.select(1, Det::tl_x).copy_(center_x - half_w);
    y.select(1, Det::tl_y).copy_(center_y - half_h);
    y.select(1, Det::br_x).copy_(center_x + half_w);
    y.select(1, Det::br_y).copy_(center_y + half_h);
    return y;
}
/// @brief Filters raw network predictions by confidence, runs class-aware NMS
///        and maps the surviving boxes back to the original image.
///
/// Each prediction row is read as: center x, center y, width, height,
/// object confidence, followed by one score per class.
///
/// @param detections Network output indexed as [batch, prediction, attribute].
/// @param pad_w      Horizontal padding, forwarded to ScaleCoordinates.
/// @param pad_h      Vertical padding, forwarded to ScaleCoordinates.
/// @param scale      Resize factor, forwarded to ScaleCoordinates.
/// @param img_shape  Size of the original image; boxes are clipped to it.
/// @param conf_thres Minimum object confidence for a candidate; also passed to
///                   NMS as its score threshold.
/// @param iou_thres  Overlap threshold passed to NMS.
/// @return One vector of detections per image of the batch, in batch order.
///         An image without any detection contributes an empty vector, so
///         result[i] always belongs to batch image i.
std::vector<std::vector<Detection>> PostProcessing(const torch::Tensor& detections,
    float pad_w, float pad_h, float scale, const cv::Size& img_shape,
    float conf_thres, float iou_thres)
{
    // Number of leading non-class attributes: x, y, w, h, object confidence.
    constexpr int item_attr_size = 5;
    const int batch_size = static_cast<int>(detections.size(0));
    // number of classes, e.g. 80 for coco dataset
    const auto num_classes = detections.size(2) - item_attr_size;

    // get candidates which object confidence > threshold
    auto conf_mask = detections.select(2, 4).ge(conf_thres).unsqueeze(2);

    std::vector<std::vector<Detection>> output;
    output.reserve(batch_size);

    // iterating all images in the batch
    for (int batch_i = 0; batch_i < batch_size; batch_i++) {
        // apply constraints to get filtered detections for the current image
        auto det = torch::masked_select(detections[batch_i], conf_mask[batch_i]).view({ -1, num_classes + item_attr_size });

        // Nothing passed the confidence filter: record an empty result so the
        // output stays aligned with the batch index, then go to the next image.
        if (0 == det.size(0)) {
            output.emplace_back();
            continue;
        }

        // compute overall score = obj_conf * cls_conf, similar to x[:, 5:] *= x[:, 4:5]
        det.slice(1, item_attr_size, item_attr_size + num_classes) *= det.select(1, 4).unsqueeze(1);

        // box (center x, center y, width, height) to (x1, y1, x2, y2)
        torch::Tensor box = xywh2xyxy(det.slice(1, 0, 4));

        // [best class only] get the max class score of each row
        std::tuple<torch::Tensor, torch::Tensor> max_classes = torch::max(det.slice(1, item_attr_size, item_attr_size + num_classes), 1);

        // best class score and its index, as float column vectors
        auto max_conf_score = std::get<0>(max_classes).to(torch::kFloat).unsqueeze(1);
        auto max_conf_index = std::get<1>(max_classes).to(torch::kFloat).unsqueeze(1);

        // shape: n * 6, top-left x/y (0,1), bottom-right x/y (2,3), score(4), class index(5)
        det = torch::cat({ box.slice(1, 0, 4), max_conf_score, max_conf_index }, 1);

        // Class-aware NMS: shift every box by class_index * max_wh so that boxes
        // of different classes cannot overlap.
        constexpr int max_wh = 4096;
        auto c = det.slice(1, Det::class_idx, Det::class_idx + 1) * max_wh;
        auto offset_box = det.slice(1, 0, 4) + c;

        std::vector<cv::Rect> offset_box_vec;
        std::vector<float> score_vec;

        // copy data back to cpu
        auto offset_boxes_cpu = offset_box.cpu();
        auto det_cpu = det.cpu();
        const auto det_cpu_array = det_cpu.accessor<float, 2>();

        // use accessors to read the tensor elements
        Tensor2Detection(offset_boxes_cpu.accessor<float, 2>(), det_cpu_array, offset_box_vec, score_vec);

        // run NMS
        std::vector<int> nms_indices;
        cv::dnn::NMSBoxes(offset_box_vec, score_vec, conf_thres, iou_thres, nms_indices);

        std::vector<Detection> det_vec;
        det_vec.reserve(nms_indices.size());
        for (int index : nms_indices) {
            const auto b = det_cpu_array[index];
            Detection t;
            // The un-shifted box is used here, not the class-offset one.
            t.bbox = cv::Rect(cv::Point(b[Det::tl_x], b[Det::tl_y]),
                              cv::Point(b[Det::br_x], b[Det::br_y]));
            t.score = b[Det::score];
            t.class_idx = static_cast<int>(b[Det::class_idx]);
            det_vec.emplace_back(t);
        }

        ScaleCoordinates(det_vec, pad_w, pad_h, scale, img_shape);

        // save final detections for the current image
        output.push_back(std::move(det_vec));
    } // end of batch iterating

    return output;
}
/// @brief Draws the detections of the first batch image onto img.
///
/// @param img         Image that is drawn on in place.
/// @param detections  Per-image detection lists; only detections[0] is drawn.
/// @param class_names Label text per class index.
/// @param label       When true, a "<name> <score>" caption is drawn above each box.
/// @return img (the same image that was drawn on).
///
/// A class index without an entry in class_names is shown as its number
/// instead of indexing out of range.
cv::Mat Demo(cv::Mat& img,
    const std::vector<std::vector<Detection>>& detections,
    const std::vector<std::string>& class_names,
    bool label = true)
{
    if (detections.empty()) {
        return img;
    }

    for (const auto& detection : detections[0]) {
        const auto& box = detection.bbox;
        cv::rectangle(img, box, cv::Scalar(0, 0, 255), 2);

        if (!label) {
            continue;
        }

        // Guard the lookup: the names file may list fewer classes than the model predicts.
        const int idx = detection.class_idx;
        const bool known = idx >= 0 && idx < static_cast<int>(class_names.size());
        const std::string name = known ? class_names[idx] : std::to_string(idx);

        std::stringstream ss;
        ss << std::fixed << std::setprecision(2) << detection.score;
        const std::string s = name + " " + ss.str();

        const auto font_face = cv::FONT_HERSHEY_DUPLEX;
        const auto font_scale = 1.0;
        const int thickness = 1;
        int baseline = 0;
        const auto s_size = cv::getTextSize(s, font_face, font_scale, thickness, &baseline);

        // Filled background sitting directly on top of the box, then the caption itself.
        cv::rectangle(img,
            cv::Point(box.tl().x, box.tl().y - s_size.height - 5),
            cv::Point(box.tl().x + s_size.width, box.tl().y),
            cv::Scalar(0, 0, 255), -1);
        cv::putText(img, s, cv::Point(box.tl().x, box.tl().y - 5),
            font_face, font_scale, cv::Scalar(255, 255, 255), thickness);
    }
    return img;
}
int main()
{
torch::DeviceType device_type;
device_type = torch::kCUDA;
torch::Device device(device_type);
torch::jit::script::Module module;
module = torch::jit::load("D:\\pytorch\\c++_test\\x64\\Debug\\yolov5s.GPU_torchscript.pt", device); //加载模型
module.eval();
std::vector<std::string> class_names = LoadNames("D:\\pytorch\\c++_test\\x64\\Debug\\coco.names");//读取标签
if (class_names.empty()) {
return -1;
}
// set up threshold
float conf_thres = 0.4;
float iou_thres = 0.5;
VideoCapture video1(0);//打开笔记本自带摄像头(1)为外接摄像头
video1.set(CAP_PROP_FRAME_WIDTH, 1280);
video1.set(CAP_PROP_FRAME_HEIGHT, 720);
//读取视频帧率
double rate = video1.get(CAP_PROP_FPS);
std::cout << "rate: " << rate << std::endl;
//当前视频帧
Mat frame;
//每一帧之间的延时
int delay = 1000 / rate;
bool stop(false);
while (!stop)
{
double t = (double)cv::getTickCount();//开始计时
if (!video1.read(frame))
{
std::cout << "no video frame" << std::endl;
break;
}
cv::Mat img = frame;
//inference
torch::NoGradGuard no_grad;
cv::Mat img_input = img.clone();
std::vector<float> pad_info = LetterboxImage(img_input, img_input, cv::Size(640, 640));
const float pad_w = pad_info[0];
const float pad_h = pad_info[1];
const float scale = pad_info[2];
cv::cvtColor(img_input, img_input, cv::COLOR_BGR2RGB); // BGR -> RGB
//归一化需要是浮点类型
img_input.convertTo(img_input, CV_32FC3, 1.0f / 255.0f); // normalization 1/255
// 加载图像到设备
auto tensor_img = torch::from_blob(img_input.data, { 1, img_input.rows, img_input.cols, img_input.channels() }).to(device_type);
// BHWC -> BCHW
tensor_img = tensor_img.permute({ 0, 3, 1, 2 }).contiguous(); // BHWC -> BCHW (Batch, Channel, Height, Width)
std::vector<torch::jit::IValue> inputs;
// 在容器尾部添加一个元素,这个元素原地构造,不需要触发拷贝构造和转移构造
inputs.emplace_back(tensor_img);
//start = clock();
torch::jit::IValue output = module.forward(inputs);
// 解析结果
auto detections = output.toTuple()->elements()[0].toTensor();
auto result = PostProcessing(detections, pad_w, pad_h, scale, img.size(), conf_thres, iou_thres);
double endtime = (double)(t_stop - t_start) / CLOCKS_PER_SEC;
cv::Mat pre_img = Demo(img, result, class_names);
t = ((double)cv::getTickCount() - t) / cv::getTickFrequency();//结束计时
int fps = int(1.0 / t);//转换为帧率
std::cout << "FPS: " << fps << std::endl;//输出帧率
putText(pre_img, ("FPS: " + std::to_string(fps)), Point(0, 50), FONT_HERSHEY_COMPLEX, 0.5, Scalar(0, 0, 0));//输入到帧frame上
//cv::namedWindow("Result", cv::WINDOW_NORMAL);
cv::namedWindow("Result", cv::WINDOW_AUTOSIZE);
cv::imshow("Result", pre_img);
//waitKey()函数的作用是刷新imshow()展示的图片
if (waitKey(10) == 27)//27是键盘摁下esc时,计算机接收到的ascii码值
{
break;
}
}
video1.release();
return 0;
}
运行代码结果即为,对摄像头获取到的视频进行目标检测并显示实时帧率。
5、可能遇到的问题
1、“std”:不明确的符号
解决办法:项目->属性->C/C++->语言->符合模式->选择否
2、模型加载出错
解决办法:查看cuda是否可用,若torch::cuda::is_available()返回false,则在项目属性(Release)-链接器 – 命令行 – 其他选项贴入下面命令
/INCLUDE:"?ignore_this_library_placeholder@@YAHXZ"
有些博客提到输入以下命令,实测vs2017 torch::cuda::is_available()虽然返回true,但模型加载依然会失败
/INCLUDE:?warp_size@cuda@at@@YAHXZ
3、coco.names读取失败
解决办法:注意文件后缀是否为.names。也可以直接进入该链接下载.names文件:https://gitee.com/goodtn/libtorch-yolov5-gpu/tree/master
参考:
环境配置参考:https://qianbin.blog.csdn.net/article/details/102937131
代码参考:https://www.cnblogs.com/tensorrt/p/14614632.html
来源:qq_43349822