mdz/paddle/ppyoloe/3_deploy/Deps/modelzoo/et_device.hpp

992 lines
38 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <mutex>
#include <random>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#ifdef __linux__
#include <unistd.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <linux/fb.h>
#endif

#include "opencv2/opencv.hpp"
#include <spdlog/spdlog.h>
#include "icraft-xir/core/network.h"
#include "icraft-xir/core/data.h"
#include "icraft-xrt/dev/host_device.h"
#include "icraft-xrt/dev/buyi_device.h"
#include "icraft-backends/hostbackend/utils.h"
#include "modelzoo_utils.hpp"
using namespace icraft::xrt;
using namespace icraft::xir;
using namespace std::string_literals;
using namespace std::chrono;
using namespace std::chrono_literals;
// Enumeration of the pixel formats a camera input can deliver.
// hardResizePS() switches on this value to pick the bytes-per-pixel count and
// the format-related register writes.
enum camera_fmt {
RGB565, // hardResizePS() uses 2 bytes per pixel for this format
RGB, // hardResizePS() uses 3 bytes per pixel for this format
RGBA, // hardResizePS() uses 4 bytes per pixel for this format
YUV422, // hardResizePS() uses 2 bytes per pixel for this format
};
// Enumeration of the crop region used by the PL resize module.
// hardResizePS() and preprocess_plin() use it to decide which part of the
// camera image is kept when the camera size is not an exact multiple of the
// target size.
enum crop_position {
top_left, // keep the region anchored at (0, 0)
top_right, // keep the region anchored at the right edge, top row
bottom_left, // keep the region anchored at the left edge, bottom row
bottom_right, // keep the region anchored at the right and bottom edges
center, // split the unused margin evenly between both sides
};
// Runs non-maximum suppression on the FPGA NMS hardware block.
//
// Parameters:
//   device       - device providing the "udma" memory region and the register region.
//   nms_pre_data - flat array of boxes sorted by descending confidence; each box is
//                  five int16 values {x1, y1, x2, y2, class}.
//   nms_pre_idx  - for every sorted box, its index in the caller's unsorted arrays.
//   bbox_num     - number of boxes (the original note says the module accepts at
//                  most 5000 boxes; that limit is not enforced here).
//   iou          - IoU threshold, sent to the hardware as a Q15 fixed-point value.
//   base_addr    - register base address of the NMS block.
//
// Returns the entries of nms_pre_idx whose bit is set in the keep-mask that is read
// back from the udma buffer after the hardware reports done.
// Throws std::runtime_error when the inputs are inconsistent, the version register
// does not match, bbox_num is 0, or the done flag is not raised within 1000 ms.
std::vector<int> fpgaNms(icraft::xrt::Device& device,const std::vector<int16_t> & nms_pre_data, std::vector<int> nms_pre_idx,int bbox_num, const float& iou, uint64_t base_addr = 0x100001C00){
    // Every failure is both printed and thrown, exactly as before.
    const auto fail = [](const char* msg) {
        std::cout << msg << std::endl;
        throw std::runtime_error(msg);
    };

    if (nms_pre_data.size() != bbox_num * 5 || nms_pre_idx.size() != bbox_num) {
        fail("ERROR in FpgaNms :: The data for FpgaNms is error, Please check it!");
    }

    std::vector<int> nms_indices;
    auto nms_data_cptr = nms_pre_data.data();
    auto uregion_ = device.getMemRegion("udma");
    auto udma_chunk_ = uregion_.malloc(10e6);
    auto mapped_base = udma_chunk_->begin.addr();
    // 5 int16 values per box = 10 bytes per box.
    udma_chunk_.write(0, (char*)nms_data_cptr, bbox_num * 10);

    // Hardware NMS configuration: input boxes and output mask share the same buffer.
    float threshold_f = iou;
    uint64_t arbase = mapped_base;
    uint64_t awbase = mapped_base;

    // Check the hardware version register; a mismatch is reported as missing hardware.
    if (device.defaultRegRegion().read(base_addr + 0x008, true) != 0x23110200) {
        fail("ERROR in FpgaNms :: No NMS HardWare");
    }

    auto group_num = (uint64_t)ceilf((float)bbox_num / 16.f);
    if (group_num == 0) {
        fail("ERROR in FpgaNms :: group_num == 0");
    }
    auto last_araddr = arbase + group_num * 160 - 8;
    if (last_araddr < arbase) {
        fail("ERROR in FpgaNms :: last_araddr < arbase");
    }
    auto anchor_hpsize = (uint64_t)ceilf((float)bbox_num / 64.f);
    if (anchor_hpsize == 0) {
        fail("ERROR in FpgaNms :: anchor_hpsize == 0");
    }
    auto last_awaddr = awbase + anchor_hpsize * 8 - 8;
    if (last_awaddr < awbase) {
        fail("ERROR in FpgaNms :: last_awaddr < awbase");
    }
    auto threshold = (uint16_t)(threshold_f * pow(2, 15));

    // Configure registers (soft-reset pulse first, then addresses and sizes).
    device.defaultRegRegion().write(base_addr + 0x014, 1, true);
    device.defaultRegRegion().write(base_addr + 0x014, 0, true);
    device.defaultRegRegion().write(base_addr + 0x01C, arbase, true);
    device.defaultRegRegion().write(base_addr + 0x020, awbase, true);
    device.defaultRegRegion().write(base_addr + 0x024, last_araddr, true);
    device.defaultRegRegion().write(base_addr + 0x028, last_awaddr, true);
    device.defaultRegRegion().write(base_addr + 0x02C, group_num, true);
    device.defaultRegRegion().write(base_addr + 0x030, 0, true); // mode: 0 = suppress within a class, 1 = suppress across all classes
    device.defaultRegRegion().write(base_addr + 0x034, threshold, true);
    device.defaultRegRegion().write(base_addr + 0x038, anchor_hpsize, true);
    device.defaultRegRegion().write(base_addr + 0x0, 1, true); // start

    // Busy-poll the done register, giving up after 1000 ms.
    uint64_t reg_done;
    auto start = std::chrono::steady_clock::now();
    do {
        reg_done = device.defaultRegRegion().read(base_addr + 0x004, true);
        std::chrono::duration<double, std::milli> duration = std::chrono::steady_clock::now() - start;
        if (duration.count() > 1000) {
            fail("ERROR in FpgaNms :: NMS Timeout!!!");
        }
    } while (reg_done == 0);

    // One keep-bit per box. The buffer is sized from mask_size and owned by a
    // vector, so it can neither overflow nor leak (the previous code paired
    // new[] with scalar delete, which is undefined behaviour).
    uint64_t mask_size = (uint64_t)(ceilf((float)bbox_num / 8.f));
    std::vector<char> mask(mask_size);
    udma_chunk_.read(mask.data(), 0, mask_size);

    for (int i = 0; i < bbox_num; ++i) {
        const auto mask_byte = static_cast<uint8_t>(mask[i / 8]);
        // Bit (i % 8) of byte (i / 8) tells whether sorted box i is kept.
        if (((mask_byte >> (i % 8)) & 1u) != 0) {
            nms_indices.emplace_back(nms_pre_idx[i]);
        }
    }
    return nms_indices;
}
// Programs the ImageMake DMA registers for `img_tensor` and starts the transfer.
//
// The tensor is first moved to the "udma" memory region; the address of that copy,
// the read length, the last-beat shift and the channel count are then written to
// the register block at `base_addr`, followed by the start bit.
void fpgaDma(Tensor& img_tensor, Device& device,uint64_t base_addr = 0x1000C0000) {
    // Geometry is taken from the last three entries of the tensor shape.
    auto channels = img_tensor.dtype()->shape[-1];
    auto width = img_tensor.dtype()->shape[-2];
    auto height = img_tensor.dtype()->shape[-3];

    // Copy the host data into the udma region; the returned tensor owns that copy
    // (per the original note, its memory is managed by the tensor).
    auto udma_region = device.getMemRegion("udma");
    auto udma_tensor = img_tensor.to(udma_region);

    // Physical address of the udma copy.
    auto read_base = udma_tensor.data().addr();

    auto read_len = ((width * height - 1) / (24 / channels) + 1) * 3;
    auto last_shift = width * height - (read_len - 3) / 3 * (24 / channels);

    device.defaultRegRegion().write(base_addr + 0x4, read_base, true);
    device.defaultRegRegion().write(base_addr + 0x8, read_len, true);
    device.defaultRegRegion().write(base_addr + 0xC, last_shift, true);
    device.defaultRegRegion().write(base_addr + 0x10, channels, true);
    device.defaultRegRegion().write(base_addr + 0x1C, 1, true);
    device.defaultRegRegion().write(base_addr + 0x20, 0, true);
    // Writing 1 to the base register is the last step of the sequence.
    device.defaultRegRegion().write(base_addr, 1, true);
}
// Configures the warp-affine registers.
// M_inversed is the inverse of the 2x3 transform matrix: the four linear
// coefficients are scaled by 2^15 and the two translation terms by 2 before
// being truncated to integers and written to the register block at `base_addr`.
void fpgaWarpaffine(std::vector<std::vector<float>>& M_inversed, Device& device,uint64_t base_addr = 0x100002800) {
    const auto scale_linear = [](float v) { return int64_t(v * 32768.0); };
    const auto scale_offset = [](float v) { return int64_t(v * 2); };

    const std::vector<float>& row0 = M_inversed[0];
    const std::vector<float>& row1 = M_inversed[1];

    // Register offset / value pairs, listed in the order they must be written.
    const std::pair<uint64_t, int64_t> reg_writes[] = {
        {0x030, scale_linear(row0[0])},
        {0x034, scale_offset(row0[2])},
        {0x038, scale_linear(row1[1])},
        {0x03C, scale_offset(row1[2])},
        {0x044, scale_linear(row0[1])},
        {0x048, scale_linear(row1[0])},
    };
    for (const auto& reg : reg_writes) {
        device.defaultRegRegion().write(base_addr + reg.first, reg.second, true);
    }
}
// Runs the FPGA argmax2d block over a feature map and returns its result tensor.
//
// The function allocates valid_csize * 8 bytes in the "udma" region, validates the
// hardware version and the channel configuration, programs the registers at
// base_addr, starts the block and polls the done register (1000 ms timeout).
// The returned tensor is UInt8 with shape {1, 1, valid_csize, 8} (NHWC) and is
// backed directly by the udma chunk.
// NOTE(review): ICRAFT_LOG(EXCEPT) presumably raises; if it does not, execution
// continues past the failed checks - confirm against the logging library.
Tensor fpgaArgmax2d(Device& dev, int wsize, int hsize, int valid_csize, int csize,uint64_t arbase,uint64_t last_araddr,uint64_t base_addr = 0x100003000){
// Parameters
// arbase - start address
// last_araddr - address in PLDDR of the last layer's feature map (ftmp)
int w = wsize;
int h = hsize;
int c = valid_csize;
// Expected padded channel count: for c > 32 it is (c / 32 + 1) * 32, otherwise the
// next power of two >= c. It must equal the csize argument (checked below).
int csize_cal = (c > 32) ? ((c / 32 + 1) * 32) : static_cast<int>(std::pow(2,static_cast<uint32_t>(std::ceil(std::log2(c)))));
// Channels handled per unit: csize capped at 32.
int cu = (csize > 32) ? 32 : csize;
int ct = csize / cu;
int cu_araddr_num = ((w*h*cu)%64==0)?((w * h * cu) / 64 - 1):(w * h * cu) / 64;
int cu_flag = std::log2(cu);
int last_vld_cu = (c % cu == 0) ? (cu - 1) : (c % cu - 1);
int cu_size = w*h*cu;
// Register map, relative to base_addr.
const uint64_t ARGMAX2D_START = base_addr + 0x000;
const uint64_t ARGMAX2D_DONE = base_addr + 0x004;
const uint64_t ARGMAX2D_VER = base_addr + 0x008;
const uint64_t ARGMAX2D_TEST = base_addr + 0x00c;
const uint64_t ARGMAX2D_TIME_CNT = base_addr + 0x010;
const uint64_t ARGMAX2D_SOFT_RST = base_addr + 0x014;
const uint64_t ARGMAX2D_STATUS = base_addr + 0x018;
const uint64_t ARGMAX2D_ARBASE = base_addr + 0x01c;
const uint64_t ARGMAX2D_AWBASE = base_addr + 0x020;
const uint64_t ARGMAX2D_LAST_ARADDR = base_addr + 0x024;
const uint64_t ARGMAX2D_LAST_AWADDR = base_addr + 0x028;
const uint64_t ARGMAX2D_CU_ARADDR_NUM = base_addr + 0x02c;
const uint64_t ARGMAX2D_CU_FLAG = base_addr + 0x030;
const uint64_t ARGMAX2D_LAST_VLD_CU = base_addr + 0x034;
const uint64_t ARGMAX2D_CU_SIZE = base_addr + 0x038;
// Sleep between polls of the done register, passed to usleep() (microseconds).
const uint64_t ARGMAX2D_SLEEPTIME = 50;
// Allocate the argmax2d buffer on udmabuf and take its first/last physical addresses.
const uint64_t argmax2d_psbuf_size = valid_csize * 8;
auto argmax2d_pschunck = dev.getMemRegion("udma").malloc(argmax2d_psbuf_size, true);// auto free chunk
auto awbase = argmax2d_pschunck->begin.addr();
auto last_awaddr = awbase + argmax2d_psbuf_size;
// Parameter validity checks
uint32_t argmax_ver_rd = dev.defaultRegRegion().read(ARGMAX2D_VER, true);
// uint32_t argmax_ver_rt = 0x24051200;
// uint32_t argmax_ver_rt = 0x24071800;
uint32_t argmax_ver_rt = 0x24073000;
if (argmax_ver_rd != argmax_ver_rt) {
std::cout << "Error in FpgaArgma2d: Argmax2d HardWare Version Mismatch! Read Version is " << argmax_ver_rd << ", Right version is" << argmax_ver_rt << std::endl;
ICRAFT_LOG(EXCEPT) << "Error in FpgaArgma2d :: No Argmax2d HardWare Or Version mismatch";
}
if (csize_cal != csize) {
std::cout << "Error in FpgaArgma2d: csize input is" << csize << "calculated csize is " << csize_cal << std::endl;
ICRAFT_LOG(EXCEPT) << "Error in FpgaArgma2d: csize input err";
}
if ((w * h * cu) % 64 != 0) {
std::cout << "Error in FpgaArgma2d: (w * h * cu) % 64 != 0, argmax2d hardop not support!" << std::endl;
ICRAFT_LOG(EXCEPT) << "Error in FpgaArgma2d: (w * h * cu) % 64 != 0, argmax2d hardop not support";
}
// Debug only
// std::cout<<"csize_cal ="<<csize_cal<<" cu ="<<cu<<" cu_araddr_num ="<<cu_araddr_num<<" last_vld_cu ="<<last_vld_cu<<" cu_size ="<<cu_size<<" cu_flag ="<<cu_flag<<std::endl;
// Configure the registers
dev.defaultRegRegion().write(ARGMAX2D_ARBASE, arbase, true);
dev.defaultRegRegion().write(ARGMAX2D_LAST_ARADDR, last_araddr, true);
dev.defaultRegRegion().write(ARGMAX2D_AWBASE, awbase, true);
dev.defaultRegRegion().write(ARGMAX2D_LAST_AWADDR, last_awaddr, true);
dev.defaultRegRegion().write(ARGMAX2D_CU_ARADDR_NUM, cu_araddr_num, true);
dev.defaultRegRegion().write(ARGMAX2D_CU_FLAG, cu_flag, true);
dev.defaultRegRegion().write(ARGMAX2D_LAST_VLD_CU, last_vld_cu, true);
dev.defaultRegRegion().write(ARGMAX2D_CU_SIZE, cu_size, true);
dev.defaultRegRegion().write(ARGMAX2D_START, 1, true);
// Poll the done flag
unsigned int argmax2d_done = 0;
auto start = std::chrono::steady_clock::now();
do {
#ifndef _WIN32
usleep(ARGMAX2D_SLEEPTIME);
#endif
argmax2d_done = dev.defaultRegRegion().read(ARGMAX2D_DONE, true);
std::chrono::duration<double, std::milli> duration = std::chrono::steady_clock::now() - start;
if (duration.count() > 1000) {
std::cout << "Error in FpgaArgma2d :: Argmax2d Timeout" << std::endl;
ICRAFT_LOG(EXCEPT) << "Error in FpgaArgma2d :: Argmax2d Timeout";
}
} while (argmax2d_done == 0);
// Read the FPGA timing counter (the derived value below is not used further)
unsigned int argmax2d_time_cnt = dev.defaultRegRegion().read(ARGMAX2D_TIME_CNT, true);
double argmax2d_hard_time = (argmax2d_time_cnt * 5) / 1000000.0; // unit: ms (per the original note)
// std::cout << "argmax2d_hard_time = " << argmax2d_hard_time << std::endl;
// Fetch the coordinates of each channel's maximum and build the output tensor
auto ofm_layout = Layout::NHWC();
icraft::xir::TensorType output_type;
// icraft::xir::Array<int64_t> output_dim = { 1,1,c,8 };
icraft::xir::Array<IntImm> output_dim = { 1,1,c,8 };
//auto data = std::shared_ptr<uint8_t[]>(new uint8_t[c*8]);
//argmax2d_pschunck.read((char*)data.get(), 0, c * 8);
output_type = TensorType(xir::IntegerType::UInt8(), output_dim, ofm_layout);
auto output_tensor = Tensor(output_type, argmax2d_pschunck, 0);// the result is exposed straight from the udma buffer
return output_tensor;
}
/**
 * nms_hard: hardware (FPGA) non-maximum suppression for detection post-processing.
 *
 * Notes carried over from the original documentation:
 *  - with about 500 final detections nms_hard takes about 0.638 ms;
 *  - with about 100 final detections nms_hard takes about 0.297 ms;
 *  - below roughly 30 final detections nms_soft is faster than nms_hard;
 *  - the caller must already have filtered the boxes by confidence threshold;
 *  - it suits the post-processing of most YOLO-family models and delegates the
 *    suppression itself to fpgaNms().
 *
 * @param box_list   candidate boxes.
 * @param score_list confidence of each candidate (same indexing as box_list).
 * @param id_list    class id of each candidate (same indexing as box_list).
 * @param iou        IoU threshold forwarded to fpgaNms().
 * @param device     device forwarded to fpgaNms().
 * @param max_nms    at most this many highest-scoring candidates are sent to hardware.
 * @return {class id, score, box} for every candidate kept by the hardware.
 * @throws std::invalid_argument if score_list is longer than box_list or id_list
 *         (this used to read past the end of those vectors).
 */
std::vector<std::tuple<int, float, cv::Rect2f>> nms_hard(std::vector<cv::Rect2f>& box_list, std::vector<float>& score_list, std::vector<int>& id_list, const float& iou, icraft::xrt::Device& device, int max_nms = 3000) {
    std::vector<std::tuple<int, float, cv::Rect2f>> num_res;
    if (box_list.size() == 0) return num_res;

    // Every score index is later used to index box_list and id_list.
    if (score_list.size() > box_list.size() || score_list.size() > id_list.size()) {
        throw std::invalid_argument("ERROR in nms_hard :: score_list is longer than box_list or id_list");
    }

    // (score, original index) pairs, ordered by descending score; ties keep input order.
    std::vector<std::pair<float, int> > score_index_vec;
    score_index_vec.reserve(score_list.size());
    for (size_t i = 0; i < score_list.size(); ++i) {
        score_index_vec.emplace_back(score_list[i], static_cast<int>(i));
    }
    std::stable_sort(score_index_vec.begin(), score_index_vec.end(),
        [](const std::pair<float, int>& pair1, const std::pair<float, int>& pair2) {return pair1.first > pair2.first; });

    int box_num = static_cast<int>(score_index_vec.size());
    if (box_num > max_nms) {
        box_num = max_nms;
    }

    std::vector<int16_t> nms_pre_data;
    std::vector<int> nms_pre_idx;
    if (box_num > 0) {
        nms_pre_data.reserve(static_cast<size_t>(box_num) * 5);
        nms_pre_idx.reserve(static_cast<size_t>(box_num));
    }
    for (int i = 0; i < box_num; ++i) {
        const int idx = score_index_vec[i].second;
        const cv::Rect2f& box = box_list[idx];
        // Only the top-left corner is clamped to zero, as before.
        auto x1 = box.tl().x;
        if (x1 < 0) x1 = 0;
        auto y1 = box.tl().y;
        if (y1 < 0) y1 = 0;
        const auto x2 = box.br().x;
        const auto y2 = box.br().y;
        // Hardware record layout: {x1, y1, x2, y2, class} as int16.
        nms_pre_data.push_back((int16_t)x1);
        nms_pre_data.push_back((int16_t)y1);
        nms_pre_data.push_back((int16_t)x2);
        nms_pre_data.push_back((int16_t)y2);
        nms_pre_data.push_back((int16_t)id_list[idx]);
        nms_pre_idx.push_back(idx);
    }

    // fpgaNms returns indices into the original (unsorted) input vectors.
    const std::vector<int> nms_indices = fpgaNms(device, nms_pre_data, std::move(nms_pre_idx), box_num, iou);
    num_res.reserve(nms_indices.size());
    for (auto idx : nms_indices) {
        num_res.emplace_back(id_list[idx], score_list[idx], box_list[idx]);
    }
    return num_res;
}
// Starts the ImageMake DMA for `img_tensor` when the network has an ImageMake stage.
// On Windows builds nothing is done when `run_sim` is set; on other platforms
// `run_sim` is not consulted.
void dmaInit(const bool& run_sim, const bool& has_ImageMake, Tensor& img_tensor, Device& device) {
    if (!has_ImageMake) {
        return;
    }
#ifdef _WIN32
    if (run_sim) {
        return;
    }
#endif
    fpgaDma(img_tensor, device);
}
// Re-initialises the ImageMake operation on the first backend of `session` and then
// starts the ImageMake DMA for `img_tensor`. Does nothing when the network has no
// ImageMake stage; on Windows builds it also does nothing when `run_sim` is set.
void dma_imk_Init(const bool& run_sim, const bool& has_ImageMake, Operation& ImageMake_ ,Tensor& img_tensor, Device& device,Session &session) {
    if (!has_ImageMake) {
        return;
    }
#ifdef _WIN32
    if (run_sim) {
        return;
    }
#endif
    // The operation is initialised before the DMA registers are programmed.
    session->backends[0].cast<BuyiBackend>().initOp(ImageMake_);
    fpgaDma(img_tensor, device);
}
// Returns log(1 / thr_f1 - 1) / log(1 / thr_f2 - 1), i.e. the ratio between the
// logit magnitudes of the two thresholds.
//
// Both thresholds must lie strictly inside (0, 1). thr_f2 must additionally not be
// 0.5, because log(1 / 0.5 - 1) is 0 and the ratio would be a division by zero.
// Throws std::invalid_argument when either condition is violated (NaN inputs are
// rejected as well).
float calculate_scale(double thr_f1, double thr_f2) {
    // Written as a positive test so that NaN fails it.
    const auto in_open_unit_interval = [](double v) { return v > 0.0 && v < 1.0; };
    if (!in_open_unit_interval(thr_f1) || !in_open_unit_interval(thr_f2)) {
        throw std::invalid_argument("Both thr_f1 and thr_f2 must be in the range (0, 1) to avoid division by zero.");
    }

    const double denominator = std::log(1.0 / thr_f2 - 1.0);
    if (denominator == 0.0) {
        throw std::invalid_argument("thr_f2 must not be 0.5: log(1 / thr_f2 - 1) is zero, so the scale is undefined.");
    }

    const double numerator = std::log(1.0 / thr_f1 - 1.0);
    return static_cast<float>(numerator / denominator);
}
// Synchronises the DetPost operation with a new confidence threshold `conf`.
//
// When `conf` differs from netinfo.thr_f, every entry of netinfo.data_thr is
// multiplied by calculate_scale(conf, netinfo.thr_f), truncated to int64 and stored
// in the operation's "data_thr" attribute; netinfo.thr_f is updated to `conf`.
// Exceptions raised while rewriting the attribute are printed to std::cerr and
// swallowed. The DetPost operation is re-initialised on the first backend in
// every case.
void updateDetpost(NetInfo& netinfo, Session& session, float conf) {
    // Handle to the DetPost operation.
    Operation detpost_op = netinfo.DetPost_;

    const bool conf_changed = (netinfo.thr_f != conf);
    if (conf_changed) {
        // Thresholds currently stored on the operation.
        Array<int64_t> quant_thr = detpost_op->getAttr("data_thr").cast<Array<int64_t>>();

        const float new_conf = conf;
        const float old_conf = netinfo.thr_f;
        const float ratio = calculate_scale(new_conf, old_conf);

        // Record the new confidence before rewriting the thresholds.
        netinfo.thr_f = conf;

        try {
            for (int k = 0; k < quant_thr.size(); k++) {
                netinfo.data_thr[k] *= ratio;
                quant_thr.set(k, static_cast<int64_t>(netinfo.data_thr[k]));
            }
            detpost_op.setAttr("data_thr", quant_thr);
        }
        catch (const std::exception& e) {
            std::cerr << "Error: " << e.what() << std::endl;
        }
    }

    // Re-initialise DetPost so the backend picks up the attribute.
    session->backends[0].cast<BuyiBackend>().initOp(detpost_op);
}
//-------------------------------------//
// PLin
//-------------------------------------//
// Configures the hard-resize registers at `base_addr` for a camera image of
// CAMERA_WIDTH x CAMERA_HEIGHT that is decimated to FRAME_WIDTH x FRAME_HEIGHT.
//
// The integer strides are CAMERA_* / FRAME_*; the region that is an exact multiple
// of the frame size is selected according to `crop`, and `fmt` decides the
// bytes-per-pixel count plus the format-related register writes.
// Throws std::invalid_argument when a frame dimension is not positive (this used
// to be an integer division by zero) or when `crop` is not a known crop_position
// (this used to read uninitialised coordinates). Nothing is written to the device
// in either case.
void hardResizePS(BuyiDevice dev, const int CAMERA_WIDTH, const int CAMERA_HEIGHT,
    const int FRAME_WIDTH, const int FRAME_HEIGHT,
    camera_fmt fmt, crop_position crop, uint64_t base_addr = 0x40080000)
{
    if (FRAME_WIDTH <= 0 || FRAME_HEIGHT <= 0) {
        throw std::invalid_argument("ERROR in hardResizePS :: FRAME_WIDTH and FRAME_HEIGHT must be positive");
    }

    // Integer decimation factors and the camera area they cover exactly.
    const int ws = CAMERA_WIDTH / FRAME_WIDTH;
    const int hs = CAMERA_HEIGHT / FRAME_HEIGHT;
    const int IMG_W = ws * FRAME_WIDTH;
    const int IMG_H = hs * FRAME_HEIGHT;

    // Inclusive corners (x0, y0)-(x1, y1) of the cropped region.
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (crop)
    {
    case crop_position::center:
        x0 = (CAMERA_WIDTH - IMG_W) / 2;
        y0 = (CAMERA_HEIGHT - IMG_H) / 2;
        x1 = CAMERA_WIDTH - x0 - 1;
        y1 = CAMERA_HEIGHT - y0 - 1;
        break;
    case crop_position::top_left:
        x0 = 0;
        y0 = 0;
        x1 = IMG_W - 1;
        y1 = IMG_H - 1;
        break;
    case crop_position::top_right:
        x0 = CAMERA_WIDTH - IMG_W;
        y0 = 0;
        x1 = CAMERA_WIDTH - 1;
        y1 = IMG_H - 1;
        break;
    case crop_position::bottom_left:
        x0 = 0;
        y0 = CAMERA_HEIGHT - IMG_H;
        x1 = IMG_W - 1;
        y1 = CAMERA_HEIGHT - 1;
        break;
    case crop_position::bottom_right:
        x0 = CAMERA_WIDTH - IMG_W;
        y0 = CAMERA_HEIGHT - IMG_H;
        x1 = CAMERA_WIDTH - 1;
        y1 = CAMERA_HEIGHT - 1;
        break;
    default:
        throw std::invalid_argument("ERROR in hardResizePS :: unknown crop_position");
    }

    dev.defaultRegRegion().write(base_addr + 0x18, 1);
    dev.defaultRegRegion().write(base_addr + 0x5c, x0 << 16 | x1);
    dev.defaultRegRegion().write(base_addr + 0x60, y0 << 16 | y1);
    dev.defaultRegRegion().write(base_addr + 0x64, CAMERA_WIDTH << 16 | CAMERA_HEIGHT);
    dev.defaultRegRegion().write(base_addr + 0x68, ws << 4 | hs);

    // Bytes per pixel of the selected format, plus format-specific registers.
    // NOTE(review): the RGB case does not touch register 0x78 while RGB565 and
    // RGBA clear it - confirm this is intentional.
    int image_fmt_channel = 4;
    switch (fmt)
    {
    case camera_fmt::RGB565:
        dev.defaultRegRegion().write(base_addr + 0x78, 0);
        image_fmt_channel = 2;
        break;
    case camera_fmt::RGB:
        image_fmt_channel = 3;
        break;
    case camera_fmt::RGBA:
        dev.defaultRegRegion().write(base_addr + 0x78, 0);
        image_fmt_channel = 4;
        break;
    case camera_fmt::YUV422:
        image_fmt_channel = 2;
        dev.defaultRegRegion().write(base_addr + 0x7c, FRAME_WIDTH);
        dev.defaultRegRegion().write(base_addr + 0x78, 1);
        break;
    default:
        break;
    }

    std::cout << "Hard Resize PS, x0={"<< x0 <<"}, y0={"<< y0 <<"}, x1={"<< x1 <<"}, y1={"<< y1 <<
        "}, stride x={"<< ws <<"}, stride y={"<< hs <<"}, resize channel={"<< image_fmt_channel <<"}" <<std::endl;

    // Output frame size expressed in 8-byte units.
    dev.defaultRegRegion().write(base_addr + 0x6c, FRAME_WIDTH * FRAME_HEIGHT * image_fmt_channel / 8);
}
// Writes the PL hard-resize configuration to the register block at `base_addr`:
// an enable word, the x/y strides, the inclusive crop corners and the full camera
// size, in that order, and then prints the configuration to stdout.
void hardResizePL(BuyiDevice device, int x0, int y0, int x1, int y1, int RATIO_W, int RATIO_H, int CAMERA_WIDTH, int CAMERA_HEIGHT,
    uint64_t base_addr = 0x40080000)
{
    struct RegWrite {
        uint64_t offset;
        int value;
    };
    // Listed in the order the registers are written.
    const RegWrite sequence[] = {
        {0x18, 1},
        {0x20, RATIO_W},        // stride along x
        {0x24, RATIO_H},        // stride along y
        {0x28, x0},             // start x0, within 0~FRAME_W
        {0x2C, y0},             // start y0, within 0~FRAME_H
        {0x30, x1},             // end x1, within 0~FRAME_W
        {0x34, y1},             // end y1, within 0~FRAME_H
        {0x38, CAMERA_WIDTH},   // total image length along x (FRAME_W)
        {0x3C, CAMERA_HEIGHT},  // total image length along y (FRAME_H)
    };
    for (const RegWrite& reg : sequence) {
        device.defaultRegRegion().write(base_addr + reg.offset, reg.value);
    }

    std::cout << "Hard Resize PL, x0={"<< x0 <<"}, y0={"<< y0 <<"}, x1={"<< x1 <<"}, y1={"<< y1 <<"},stride x={"<< RATIO_W <<"}, stride y={"<< RATIO_H <<"}" <<std::endl;
}
// Derives the PL hard-resize configuration for a CAMERA_WIDTH x CAMERA_HEIGHT image
// decimated to NET_W x NET_H and applies it through hardResizePL().
//
// Returns {RATIO_W, RATIO_H, BIAS_W, BIAS_H}: the integer strides and half of the
// camera margin that is not covered by stride * net size.
// Throws std::invalid_argument when a network dimension is not positive (this used
// to be an integer division by zero) or when `crop` is not a known crop_position
// (this used to pass uninitialised coordinates to the hardware).
std::tuple<int, int, int, int > preprocess_plin(BuyiDevice device,
    const int CAMERA_WIDTH, const int CAMERA_HEIGHT,
    const int NET_W, const int NET_H,
    crop_position crop,
    uint64_t base_addr = 0x40080000)
{
    if (NET_W <= 0 || NET_H <= 0) {
        throw std::invalid_argument("ERROR in preprocess_plin :: NET_W and NET_H must be positive");
    }

    const int RATIO_W = CAMERA_WIDTH / NET_W;
    const int RATIO_H = CAMERA_HEIGHT / NET_H;
    // Camera area that is an exact multiple of the network input size.
    const int IMG_W = RATIO_W * NET_W;
    const int IMG_H = RATIO_H * NET_H;
    const int BIAS_W = (CAMERA_WIDTH - IMG_W) / 2;
    const int BIAS_H = (CAMERA_HEIGHT - IMG_H) / 2;

    // Inclusive corners (x0, y0)-(x1, y1) of the cropped region.
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (crop)
    {
    case crop_position::center:
        x0 = (CAMERA_WIDTH - IMG_W) / 2;
        y0 = (CAMERA_HEIGHT - IMG_H) / 2;
        x1 = CAMERA_WIDTH - x0 - 1;
        y1 = CAMERA_HEIGHT - y0 - 1;
        break;
    case crop_position::top_left:
        x0 = 0;
        y0 = 0;
        x1 = IMG_W - 1;
        y1 = IMG_H - 1;
        break;
    case crop_position::top_right:
        x0 = CAMERA_WIDTH - IMG_W;
        y0 = 0;
        x1 = CAMERA_WIDTH - 1;
        y1 = IMG_H - 1;
        break;
    case crop_position::bottom_left:
        x0 = 0;
        y0 = CAMERA_HEIGHT - IMG_H;
        x1 = IMG_W - 1;
        y1 = CAMERA_HEIGHT - 1;
        break;
    case crop_position::bottom_right:
        x0 = CAMERA_WIDTH - IMG_W;
        y0 = CAMERA_HEIGHT - IMG_H;
        x1 = CAMERA_WIDTH - 1;
        y1 = CAMERA_HEIGHT - 1;
        break;
    default:
        throw std::invalid_argument("ERROR in preprocess_plin :: unknown crop_position");
    }

    hardResizePL(device, x0, y0, x1, y1, RATIO_W, RATIO_H, CAMERA_WIDTH, CAMERA_HEIGHT, base_addr);
    return { RATIO_W, RATIO_H, BIAS_W,BIAS_H };
}
// Helpers that drive the PLDDR-to-PLDDR DMA engine through its register block.
namespace PLDDRMemRegion {
    // pl_ddr dma register map
    const uint64_t PLDDR_DMA_BASE = 0x100041000;
    const uint64_t PLDDR_DMA_START = 0x04 + PLDDR_DMA_BASE;
    const uint64_t PLDDR_DMA_READ_BOTTOM = 0x18 + PLDDR_DMA_BASE;
    const uint64_t PLDDR_DMA_READ_TOP = 0x1C + PLDDR_DMA_BASE;
    const uint64_t PLDDR_DMA_WRITE_BOTTOM = 0x20 + PLDDR_DMA_BASE;
    const uint64_t PLDDR_DMA_WRITE_TOP = 0x24 + PLDDR_DMA_BASE;
    const uint64_t PLDDR_DMA_STATUS = 0x84 + PLDDR_DMA_BASE;
    // Values of the status register
    const uint32_t PLDDR_DMA_ST_MASK_1 = 0b0000; // success
    const uint32_t PLDDR_DMA_ST_MASK_2 = 0b0011; // rdma err
    const uint32_t PLDDR_DMA_ST_MASK_3 = 0b1100; // wdma err
    const uint32_t PLDDR_DMA_ST_MASK_4 = 0b1111; // both wdma and rdma err
    const uint32_t PLDDR_DMA_ST_MASK_5 = 0b0001; // rdma un-done
    const uint32_t PLDDR_DMA_ST_MASK_6 = 0b0100; // wdma un-done
    const uint32_t PLDDR_DMA_ST_MASK_7 = 0b0101; // both wdma and rdma un-done
    const uint32_t PLDDR_DMA_ST_MASK_8 = 0b1101; // wdma err, rdma un-done
    const uint32_t PLDDR_DMA_ST_MASK_9 = 0b0111; // wdma un-done, rdma err
    const uint32_t PLDDR_DMA_ST_HIT = 0b1111;

    // True when the status word equals `mask` exactly (not a bitwise test).
    bool statusHit(uint32_t status, uint32_t mask) {
        return status == mask;
    }

    // Polls the DMA status register until it reads "success" or `timeout_ms` elapses.
    // Returns {finished, last status read, nanoseconds from `start` to completion}.
    // The duration stays -1 when the transfer did not finish in time.
    std::tuple<bool, uint64_t, int64_t> waitPLDMADone(int timeout_ms, const std::chrono::steady_clock::time_point& start, icraft::xrt::Device device) {
        uint64_t status = device.defaultRegRegion().read(PLDDR_DMA_STATUS, true);
        int64_t duration = -1;
        bool ret = utils::WaitUntil([&status, &start, &duration, &device]() {
            status = device.defaultRegRegion().read(PLDDR_DMA_STATUS, true);
            //ICRAFT_LOG(INFO).append("internal status: {:#x}", status);
            if (statusHit(status, PLDDR_DMA_ST_MASK_1)) {
                auto finish = std::chrono::steady_clock::now();
                // Convert explicitly: the clock's native tick is not guaranteed
                // to be a nanosecond, and the caller logs this value as "ns".
                duration = std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count();
                return true;
            }
            return false;
            }, std::chrono::milliseconds(timeout_ms)
        );
        //ICRAFT_LOG(INFO).append("return status: {:#x}, duration: {}", status, duration);
        return { ret, status, duration };
    }

    // Copies data from one PLDDR range to another using the PLDDR DMA engine.
    //
    // [read_bottom, read_top] is the source range and [write_bottom, write_top] the
    // destination range. Per the original notes the caller must align the data
    // itself, with each "top" being bottom + byte_size - 64 (64-byte multiples).
    // Waits up to 1000 ms for completion and reports any other final status
    // through ICRAFT_LOG(EXCEPT, 1301..1309).
    void Plddr_memcpy(uint64_t read_bottom, uint64_t read_top, uint64_t write_bottom, uint64_t write_top, icraft::xrt::Device& device) {
        ICRAFT_LOG(INFO).append("Begin plddr memcpy...");

        // The mutex must outlive the call to serialise concurrent transfers; a
        // per-call local mutex (as before) never blocks anyone.
        static std::mutex plddr_dma_mutex_;
        std::lock_guard<std::mutex> plddr_dma_lock(plddr_dma_mutex_);

        // write reg: [r_b, r_t] -> [w_b, w_t]
        device.defaultRegRegion().write(PLDDR_DMA_READ_BOTTOM, read_bottom, true);   // base address of the input data
        device.defaultRegRegion().write(PLDDR_DMA_READ_TOP, read_top, true);         // end address of the input data
        device.defaultRegRegion().write(PLDDR_DMA_WRITE_BOTTOM, write_bottom, true); // base address of the output data
        device.defaultRegRegion().write(PLDDR_DMA_WRITE_TOP, write_top, true);       // end address of the output data

        // Read the registers back for the log.
        uint64_t aa = device.defaultRegRegion().read(PLDDR_DMA_READ_BOTTOM, true);
        uint64_t bb = device.defaultRegRegion().read(PLDDR_DMA_READ_TOP, true);
        uint64_t cc = device.defaultRegRegion().read(PLDDR_DMA_WRITE_BOTTOM, true);
        uint64_t dd = device.defaultRegRegion().read(PLDDR_DMA_WRITE_TOP, true);
        ICRAFT_LOG(INFO)
            .append("read_form: {}, read_to: {}, write_from: {}, write_to: {}",
                aa, bb, cc, dd);

        // launch plddr dma
        auto start = std::chrono::steady_clock::now();
        uint64_t ee = device.defaultRegRegion().read(PLDDR_DMA_STATUS, true); // after launch, an all-zero status means done
        ICRAFT_LOG(INFO).append("begin status: {:#x}", ee);

        // Pulse the start register to launch the transfer.
        device.defaultRegRegion().write(PLDDR_DMA_START, 1, true);
        device.defaultRegRegion().write(PLDDR_DMA_START, 0, true);

        auto [done, status, duration] = waitPLDMADone(1000, start, device);
        ICRAFT_LOG(INFO).append("(inner) PLDDR_PLDDR DMA time cost: {}ns", duration);
        if (!done) {
            if (statusHit(status, PLDDR_DMA_ST_MASK_2))
                ICRAFT_LOG(EXCEPT, 1301).append("Unexpected launch of RDMA when RDMA is running, while WDMA is running well.");
            else if (statusHit(status, PLDDR_DMA_ST_MASK_3))
                ICRAFT_LOG(EXCEPT, 1302).append("Unexpected launch of WDMA when WDMA is running, while RDMA is running well.");
            else if (statusHit(status, PLDDR_DMA_ST_MASK_4))
                ICRAFT_LOG(EXCEPT, 1303).append("Unexpected launches of both WDMA and RDMA when they are running.");
            else if (statusHit(status, PLDDR_DMA_ST_MASK_5))
                ICRAFT_LOG(EXCEPT, 1304).append("RDMA is un-done, while WDMA running well.");
            else if (statusHit(status, PLDDR_DMA_ST_MASK_6))
                ICRAFT_LOG(EXCEPT, 1305).append("WDMA is un-done, while RDMA running well.");
            else if (statusHit(status, PLDDR_DMA_ST_MASK_7))
                ICRAFT_LOG(EXCEPT, 1306).append("Both WDMA and RDMA are un-done.");
            else if (statusHit(status, PLDDR_DMA_ST_MASK_8))
                ICRAFT_LOG(EXCEPT, 1307).append("Unexpected launch of WDMA and RDMA is un-done.");
            else if (statusHit(status, PLDDR_DMA_ST_MASK_9))
                ICRAFT_LOG(EXCEPT, 1308).append("Unexpected launch of RDMA and WDMA is un-done.");
            else
                ICRAFT_LOG(EXCEPT, 1309).append("Unkown status of PLDDR DMA, which is {:#x}.", status);
        }
    }
}
#ifdef __linux__
// Polls `check` until it returns true, sleeping 50 microseconds after each failed
// attempt. Returns true as soon as `check` succeeds and false once more than
// `timeout` has elapsed since the call started. A timeout that is not greater than
// zero disables the deadline, so the call only returns when `check` succeeds.
template<typename predicate, typename Rep, typename Period>
bool WaitUntil(predicate check, std::chrono::duration<Rep, Period> timeout) {
    const auto began = std::chrono::steady_clock::now();
    const bool has_deadline = timeout > std::chrono::duration<Rep, Period>::zero();
    for (;;) {
        if (check()) {
            return true;
        }
        usleep(50);
        if (has_deadline && std::chrono::steady_clock::now() - began > timeout) {
            return false;
        }
    }
}
// Thin wrapper around the camera capture registers of a BuyiDevice.
// A frame is requested with take(), its completion is awaited with wait() and the
// pixels are copied out of the memory chunk with get().
class Camera {
public:
    Camera() = default;

    // `buffer_size` is the number of bytes get() copies per frame; the control
    // registers live at fixed offsets from `base_addr`.
    Camera(BuyiDevice device, uint64_t buffer_size, uint64_t base_addr = 0x40080000)
        : device_(device), buffer_size_(buffer_size), base_addr_(base_addr),
          take_addr_(base_addr + 0x04), write_addr_(base_addr + 0x50), done_addr_(base_addr + 0x58)
    {
    }

    // Copies buffer_size_ bytes from the start of `memchunk` into `frame`.
    void get(int8_t* frame, const MemChunk& memchunk) const {
        memchunk.read((char*)frame, 0, buffer_size_);
    }

    // Requests one frame to be captured into `memchunk`: the chunk's start address
    // (shifted right by 3) goes to the write-address register, then the take
    // register is set.
    void take(const MemChunk& memchunk) const {
        device_.defaultRegRegion().write(write_addr_, memchunk->begin.addr() >> 3);
        device_.defaultRegRegion().write(take_addr_, 1);
    }

    // Waits up to `wait_time_ms` milliseconds for the capture to finish.
    // Returns true only when the done bit (0x1) is set and the error bit (0x4) is
    // clear. The timeout argument used to be ignored (always 100 ms); the default
    // keeps that value. A non-positive timeout polls the register exactly once,
    // because WaitUntil() would otherwise wait without a deadline.
    bool wait(int wait_time_ms = 100) const {
        bool error = false;
        bool done = false;
        const auto poll = [&]() -> bool {
            auto camera_done = device_.defaultRegRegion().read(done_addr_);
            error = camera_done & 0x4;
            done = camera_done & 0x1;
            return done;
        };
        if (wait_time_ms > 0) {
            WaitUntil(poll, std::chrono::milliseconds(wait_time_ms));
        }
        else {
            poll();
        }
        return !error && done;
    }

private:
    BuyiDevice device_;
    uint64_t buffer_size_ = 0;
    // Zero-initialised so a default-constructed Camera holds no indeterminate values.
    uint64_t base_addr_ = 0;
    uint64_t take_addr_ = 0;
    uint64_t write_addr_ = 0;
    uint64_t done_addr_ = 0;
};
/**
 * HDMI display wrapper.
 * Per the original notes: used on the "wukong" board, the input data is RGB565
 * and the frame size is 1920*1080.
 */
class Display_pHDMI_RGB565 {
public:
    Display_pHDMI_RGB565() = default;

    // `buffer_size` is the number of bytes copied per frame and `chunck` the memory
    // chunk the frame is staged in.
    Display_pHDMI_RGB565(BuyiDevice device, uint64_t buffer_size, MemChunk chunck)
        : device_(device), buffer_size_(buffer_size), chunck_(chunck)
    {
    }

    // Copies buffer_size_ bytes from `frame` to the start of the chunk, then writes
    // the chunk's start address shifted right by 3 to DISPLAY_READ_ADDR.
    void show(int8_t* frame) const {
        char* const frame_bytes = (char*)frame;
        chunck_.write(0, frame_bytes, buffer_size_);

        const auto chunk_addr = chunck_->begin.addr();
        device_.defaultRegRegion().write(DISPLAY_READ_ADDR, chunk_addr >> 3);
    }

private:
    BuyiDevice device_;
    uint64_t buffer_size_ = 0;
    MemChunk chunck_;
    // Register that receives the address of the frame to display.
    const static auto DISPLAY_READ_ADDR = 0x40080054;
};
/**
 * HDMI display wrapper on top of a Linux framebuffer device.
 * Per the original notes: used on the demov1 board (exposed as a framebuffer
 * driver), the input data is RGBA and the frame size is 1920*1080.
 *
 * The object owns the file descriptor and the memory mapping; it is movable but
 * not copyable so the mapping is released exactly once. If opening or mapping the
 * device fails, the failure is printed (as before) and the object is left empty:
 * getPtr() returns nullptr and every drawing call is a no-op.
 */
class Display_sHDMI_RGBA {
public:
    Display_sHDMI_RGBA() = default;

    // Opens framebuffer device `dev`, queries its fixed and variable screen info,
    // maps xres * yres * bits_per_pixel / 8 bytes and clears the screen.
    Display_sHDMI_RGBA(const char* dev)
    {
        fd_ = open(dev, O_RDWR);
        if (fd_ < 0) {
            printf("open device [%s] failed:%s\n", dev, strerror(errno));
            return;
        }
        if (ioctl(fd_, FBIOGET_FSCREENINFO, &fix) < 0) {
            printf("read fb device fscreeninfo failed:%s\n", strerror(errno));
            release();
            return;
        }
        if (ioctl(fd_, FBIOGET_VSCREENINFO, &var) < 0) {
            printf("read fb device vscreeninfo failed:%s\n", strerror(errno));
            release();
            return;
        }
        mem_size_ = var.xres * var.yres * var.bits_per_pixel / 8; /* size of the mapping */
        // mmap reports failure with MAP_FAILED, not with a null pointer.
        void* mapped = mmap(NULL, mem_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0);
        if (mapped == MAP_FAILED) {
            printf("fb device mmap failed:%s\n", strerror(errno));
            release();
            return;
        }
        ptr_buf = static_cast<uint8_t*>(mapped);
        memset(ptr_buf, 0, mem_size_); // clear the screen
    }

    ~Display_sHDMI_RGBA() {
        release();
    }

    // Copying would unmap and close the same resources twice.
    Display_sHDMI_RGBA(const Display_sHDMI_RGBA&) = delete;
    Display_sHDMI_RGBA& operator=(const Display_sHDMI_RGBA&) = delete;

    Display_sHDMI_RGBA(Display_sHDMI_RGBA&& other) noexcept
        : ptr_buf(other.ptr_buf), fd_(other.fd_), mem_size_(other.mem_size_), fix(other.fix), var(other.var)
    {
        other.ptr_buf = nullptr;
        other.fd_ = -1;
        other.mem_size_ = 0;
    }

    Display_sHDMI_RGBA& operator=(Display_sHDMI_RGBA&& other) noexcept {
        if (this != &other) {
            release();
            ptr_buf = other.ptr_buf;
            fd_ = other.fd_;
            mem_size_ = other.mem_size_;
            fix = other.fix;
            var = other.var;
            other.ptr_buf = nullptr;
            other.fd_ = -1;
            other.mem_size_ = 0;
        }
        return *this;
    }

    // Copies one full frame (the whole mapping size) from `frame` to the screen.
    void show(int8_t* frame) const {
        if (ptr_buf == nullptr || frame == nullptr) return;
        memcpy(ptr_buf, frame, mem_size_);
    }

    // The four draw_* calls copy var.yres rows of var.xres / 2 bytes each from
    // `frame`, advancing the destination by var.xres bytes per row; they differ only
    // in the starting byte offset inside the mapping.
    // NOTE(review): offsets and strides are byte counts that ignore bits_per_pixel -
    // confirm this matches the intended quadrant layout.
    void draw_top_left(int8_t* frame) {
        blit_rows(0, frame);
    }
    void draw_top_right(int8_t* frame) {
        blit_rows(var.xres / 2, frame);
    }
    void draw_bottom_left(int8_t* frame) {
        blit_rows(var.yres * var.xres / 2, frame);
    }
    void draw_bottom_right(int8_t* frame) {
        blit_rows(var.yres * var.xres / 2 + var.xres / 2, frame);
    }

    // Writes one 32-bit colour value at pixel (x, y). Coordinates outside the
    // screen, or a write that would not fit in the mapping, are ignored.
    void draw_pixel(int x, int y, uint32_t color)
    {
        if (ptr_buf == nullptr || x < 0 || y < 0) return;
        if (static_cast<uint32_t>(x) >= var.xres || static_cast<uint32_t>(y) >= var.yres) return;
        /* byte offset of the pixel inside the mapping */
        const size_t offset = static_cast<size_t>(x) * var.bits_per_pixel / 8
            + static_cast<size_t>(y) * var.xres * var.bits_per_pixel / 8;
        if (offset + sizeof(color) > mem_size_) return;
        memcpy(ptr_buf + offset, &color, sizeof(color)); /* ARGB32 format */
    }

    // Fills the whole screen with `color`.
    void fill_pixel(uint32_t color)
    {
        if (ptr_buf == nullptr) return;
        // Row-major traversal touches the mapping sequentially.
        for (uint32_t y = 0; y < var.yres; y++)
        {
            for (uint32_t x = 0; x < var.xres; x++)
            {
                draw_pixel(static_cast<int>(x), static_cast<int>(y), color);
            }
        }
    }

    // Start of the mapped framebuffer, or nullptr when no device is mapped.
    uint8_t* getPtr() const { return ptr_buf; }

private:
    // Unmaps the buffer and closes the descriptor; safe to call repeatedly.
    void release() noexcept {
        if (ptr_buf != nullptr) {
            munmap(ptr_buf, mem_size_);
            ptr_buf = nullptr;
        }
        if (fd_ >= 0) {
            close(fd_);
            fd_ = -1;
        }
        mem_size_ = 0;
    }

    // Shared body of the draw_* calls; refuses to write outside the mapping.
    void blit_rows(size_t start_offset, const int8_t* frame) {
        if (ptr_buf == nullptr || frame == nullptr || var.yres == 0) return;
        const size_t row_bytes = var.xres / 2;
        const size_t stride = var.xres;
        const size_t end = start_offset + (static_cast<size_t>(var.yres) - 1) * stride + row_bytes;
        if (end > mem_size_) return;
        uint8_t* dst = ptr_buf + start_offset;
        for (uint32_t row = 0; row < var.yres; ++row) {
            memcpy(dst, frame, row_bytes);
            dst += stride;
            frame += row_bytes;
        }
    }

    uint8_t* ptr_buf = nullptr;
    int fd_ = -1;
    size_t mem_size_ = 0;
    struct fb_fix_screeninfo fix {};
    struct fb_var_screeninfo var {}; /* framebuffer device information */
};
// A rectangular view into a cv::Mat together with the row/column bounds it was
// cut from. The view is taken with rowRange(startrow, endrow) followed by
// colRange(startcol, endcol) on the source matrix.
class DisplayRange {
public:
    DisplayRange(int startrow, int endrow, int startcol, int endcol, const cv::Mat& mat)
        : startrow_(startrow),
          endrow_(endrow),
          startcol_(startcol),
          endcol_(endcol),
          mat_(mat.rowRange(startrow, endrow).colRange(startcol, endcol))
    {
    }

    // The sub-matrix view.
    const cv::Mat& mat() const { return mat_; }

    // Bounds the view was created with.
    const int startrow() const { return startrow_; }
    const int endrow() const { return endrow_; }
    const int startcol() const { return startcol_; }
    const int endcol() const { return endcol_; }

private:
    int startrow_;
    int endrow_;
    int startcol_;
    int endcol_;
    cv::Mat mat_;
};
// Renders a fixed number of text progress bars, one per line, redrawing all of
// them in place with ANSI escape sequences on every update.
class ProgressPrinter {
public:
    // `line` is the number of progress lines; negative values are treated as 0.
    ProgressPrinter(int line = 0) : line_(line > 0 ? line : 0) {
        this->lines_ = std::vector<std::string>(static_cast<size_t>(line_));
    }

    // Updates line `line_index` to "<pre_info> <progress>/<total_n>[bar] pp.pp% <last_info>"
    // and redraws every line. The bar is 50 characters wide. Calls with an
    // out-of-range `line_index` are ignored; `total_n == 0` is shown as 0%.
    void print(int line_index, int progress, int total_n, std::string pre_info, std::string last_info) {
        // `>=`: an index equal to the size is already past the end.
        if (line_index < 0 || static_cast<size_t>(line_index) >= this->lines_.size()) return;

        const float fraction = (total_n != 0) ? float(progress) / float(total_n) : 0.0f;
        const double percent = fraction * 100.0;

        // Position of the '>' head; bounded before the cast so that huge ratios
        // cannot overflow int.
        const double head_pos = percent / 2.0;
        int prog;
        if (head_pos >= 50.0) prog = 50;
        else if (head_pos <= -1.0) prog = -1;
        else prog = static_cast<int>(head_pos);

        auto full_info = pre_info + " " + std::to_string(progress) + "/" + std::to_string(total_n) + "[";
        for (int i = 0; i < 50; ++i) {
            if (i < prog)
                full_info += "=";
            else if (i == prog)
                full_info += ">";
            else
                full_info += " ";
        }
        full_info += "]";

        char percent_text[64];
        snprintf(percent_text, sizeof(percent_text), " %.2f%% ", percent);
        full_info += percent_text;
        full_info += last_info;

        std::string to_topline = "\033[" + std::to_string(line_) + "A";

        // The shared line buffer is updated under the same lock as the redraw.
        std::lock_guard<std::mutex> prt_lock(prt_mutex_);
        this->lines_[static_cast<size_t>(line_index)] = full_info;
        std::cout << "\033[?25l" << "\033[K" << to_topline << "\033[0m" << "\r";
        for (const auto& line : this->lines_) {
            std::cout << "\033[K" << line << '\n';
        }
    }

private:
    int line_;
    std::vector<std::string> lines_;
    std::mutex prt_mutex_;
    bool first_print_ = true;
};
#endif