打开微信,使用扫一扫进入页面后,点击右上角菜单,
点击“发送给朋友”或“分享到朋友圈”完成分享
本篇接http://forum.cambricon.com/index.php?m=content&c=index&a=show&catid=33&id=678
我们继续以几个深度学习常用算子为例来学习BANG C算子开发过程
本节以softmax算子为例,介绍 CUDA 算子转 BANG C 算子开发过程的一些基本思路。
本样例以输入数据格式为HWC,进行softmax的维度为C为例,实现的部分代码如下,完整代码参见/path/to/code。
获取Channel方向最大值
# CUDA Kernel code:
// Computes, for each spatial position, the maximum value across the channel
// dimension. data is laid out as (spatial_dim, channels); out receives
// spatial_dim maxima. Grid-stride loop: any launch config covers all pixels.
// Fix: the parameter type was garbled as "floSat" — restored to float.
__global__ void kernel_channel_max(const int channels,
const int spatial_dim, const float* data, float* out) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < spatial_dim;
index += blockDim.x * gridDim.x) {
float maxval = -FLT_MAX;
for (int c = 0; c < channels; ++c) {
maxval = max(data[index * channels + c], maxval);
}
out[index] = maxval;
}}# BANG C Kernel code:
// BANG C counterpart of kernel_channel_max: per-pixel maximum over channels.
__mlu_func__ void kernel_channel_max(int channels, int spatial_dim, half* data, half* out) {
// Treat the (channels, height, width) input as (channels, 1, height * width)
// and use the maxpool primitive to take the maximum along the channel axis.
__bang_maxpool(out, data, spatial_dim, channels, 1, channels, 1, 1, 1);}减去最大值
# CUDA Kernel code:
// Subtracts each pixel's channel maximum from every element, in place.
// channel_max holds one value per pixel; data is (spatial_dim, channels).
__global__ void kernel_channel_subtract(const int channels,
const int spatial_dim, const float* channel_max, float* data) {
const int total = channels * spatial_dim;
const int stride = blockDim.x * gridDim.x;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride) {
// Every group of `channels` consecutive elements shares one maximum.
const int pixel = i / channels;
data[i] = data[i] - channel_max[pixel];
}}# BANG C Kernel code:
// BANG C counterpart of kernel_channel_subtract: in-place subtraction of the
// per-pixel maximum.
__mlu_func__ void kernel_channel_subtract(int channels, int spatial_dim,
half* channel_max, half* data) {
// Cycle-subtract: channel_max (length spatial_dim) is applied repeatedly
// across the channels * spatial_dim elements of data.
// NOTE(review): this assumes data was transposed to channel-major by the
// caller (see the entry function) — confirm before reusing elsewhere.
__bang_cycle_sub(data, data, channel_max, channels * spatial_dim, spatial_dim);}计算自然对数
# CUDA Kernel code:
// Element-wise natural exponential: out[i] = exp(data[i]) over count floats.
__global__ void kernel_exp(const int count, const float* data, float* out) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) {
// Fix: use the float overload expf to avoid silent promotion to double.
out[index] = expf(data[index]);
}}# BANG C Kernel code:
// BANG C counterpart of kernel_exp: element-wise exponential on count halves.
__mlu_func__ void kernel_exp(int count, half* data, half* out) {
// After the max subtraction every input is <= 0, so the exp-of-nonpositive
// variant can be used for better precision.
__bang_active_exp_less_0(out, data, count);}Channel方向求和
# CUDA Kernel code:
// Accumulates the channel values of each pixel into channel_sum.
// data is (spatial_dim, channels); channel_sum receives spatial_dim totals.
__global__ void kernel_channel_sum(const int channels,
const int spatial_dim, const float* data, float* channel_sum) {
const int stride = blockDim.x * gridDim.x;
for (int pixel = blockIdx.x * blockDim.x + threadIdx.x; pixel < spatial_dim; pixel += stride) {
// Point at this pixel's contiguous run of channel values.
const float* row = data + pixel * channels;
float total = 0;
for (int c = 0; c < channels; ++c) {
total += row[c];
}
channel_sum[pixel] = total;
}}# BANG C Kernel code:
// BANG C counterpart of kernel_channel_sum: per-pixel sum over channels.
__mlu_func__ void kernel_channel_sum(int channels, int spatial_dim, half* data, half* channel_sum) {
// Same trick as the max reduction: a sumpool with a (channels, 1) window
// adds up the channel axis in one call.
__bang_sumpool(channel_sum, data, spatial_dim, channels, 1, channels, 1, 1, 1);}Channel方向归一化
# CUDA Kernel code:
// Normalizes every element by its pixel's channel sum, in place.
// channel_sum holds one divisor per pixel; data is (spatial_dim, channels).
__global__ void kernel_channel_div(const int channels,
const int spatial_dim, const float* channel_sum, float* data) {
const int total = channels * spatial_dim;
const int stride = blockDim.x * gridDim.x;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride) {
// All channels of one pixel share a single divisor.
data[i] /= channel_sum[i / channels];
}}# BANG C Kernel code:
// BANG C counterpart of kernel_channel_div: divide by the per-pixel sum.
__mlu_func__ void kernel_channel_div(int channels, int spatial_dim, half* channel_sum, half* data) {
// Implemented as reciprocal + cycle-multiply; note this overwrites
// channel_sum with 1 / channel_sum.
__bang_active_recip(channel_sum, channel_sum, spatial_dim);
__bang_cycle_mul(data, data, channel_sum, channels * spatial_dim, spatial_dim);}Entry函数
# CUDA entry code:
// Host-side entry: channel-wise softmax over an HWC tensor on the GPU.
// input/output are device pointers of size channels * height * width.
// Fixes vs. the original listing:
//  - the scratch buffer was declared as a host-side runtime-sized
//    `__shared__ int` array (invalid) and passed where float* is expected;
//    replaced with a float device buffer via cudaMalloc/cudaFree;
//  - the launch configurations were lost in extraction (`<<>>`); restored
//    with an explicit ceil-div grid;
//  - output was used in place by kernel_channel_subtract without ever being
//    seeded from input; added the device-to-device copy.
void Softmax(float* input, float* output, int channels, int height, int width) {
int spatial_dim = height * width;
int size = spatial_dim * channels;
float* temp_data = NULL;
cudaMalloc(&temp_data, spatial_dim * sizeof(float));
int threads = 256;
int blocks = (size + threads - 1) / threads;
kernel_channel_max<<<blocks, threads>>>(channels, spatial_dim, input, temp_data);
cudaMemcpy(output, input, size * sizeof(float), cudaMemcpyDeviceToDevice);
kernel_channel_subtract<<<blocks, threads>>>(channels, spatial_dim, temp_data, output);
kernel_exp<<<blocks, threads>>>(size, output, output);
kernel_channel_sum<<<blocks, threads>>>(channels, spatial_dim, output, temp_data);
kernel_channel_div<<<blocks, threads>>>(channels, spatial_dim, temp_data, output);
cudaFree(temp_data);}# BANG C entry code:
// MLU entry: channel-wise softmax over an HWC half tensor.
// Tiles height * width into chunks that fit NRAM, transposes each tile to
// channel-major, runs the kernel helpers above, and transposes back on
// copy-out. Channels are padded to a multiple of 64 for alignment.
__mlu_entry__ void Softmax(half* input, half* output, int channels, int height, int width) {
__nram__ half nram_buffer[NRAM_HALF_SIZE];
// Softmax runs along the channel axis, so tiling keeps the channel
// dimension whole and splits only the spatial dimension.
int channels_up = ALIGIN_UP(channels, 64);
// Two equally sized buffers are needed (the transpose cannot run in place)
// and spatial_dim must also be aligned; both constraints bound the tile
// size and therefore the loop count.
int spatial_dim = NRAM_HALF_SIZE / channels_up / 2;
int spatial_dim_down = ALIGN_DOWN_TO(spatial_dim, 64);
int loop_time = height * width / spatial_dim_down;
half* src = &nram_buffer[0];
half* trans_data = src + spatial_dim_down * channels_up;
for (int i = 0; i < loop_time; i++) {
half* input_loop = ((half*) input) + i * spatial_dim_down * channels;
half* output_loop = ((half*) output) + i * spatial_dim_down * channels;
// Stride copy-in: each row of `channels` values lands in a channels_up
// slot (padding expected to be zero), repeated spatial_dim_down - 1
// more times.
__memcpy(src, input_loop, channels * sizeof(half), GDRAM2NRAM,
channels_up * sizeof(half), channels * sizeof(half), spatial_dim_down - 1);
// Reshape (spatial_dim_down, channels_up) -> (channels_up, spatial_dim_down)
// for the vector ops; __bang_transpose cannot work in place, hence the
// second buffer trans_data of the same size as src.
__bang_transpose(trans_data, src, spatial_dim_down, channels_up);
// src's contents are no longer needed, so it is reused as scratch space.
kernel_channel_max(channels, spatial_dim_down, trans_data, src);
kernel_channel_subtract(channels, spatial_dim_down, src, trans_data);
kernel_exp(spatial_dim_down * channels, trans_data, trans_data);
kernel_channel_sum(channels, spatial_dim_down, trans_data, src);
kernel_channel_div(channels, spatial_dim_down, src, trans_data);
// The result sits in trans_data; transpose back before copying out.
__bang_transpose(src, trans_data, channels_up, spatial_dim_down);
__memcpy(output_loop, src, channels * sizeof(half), NRAM2GDRAM, channels * sizeof(half),
channels_up * sizeof(half), spatial_dim_down - 1);
}
// If the total does not divide evenly, process the remaining rows.
int spatial_dim_remain = height * width % spatial_dim_down;
if (spatial_dim_remain != 0) {
int spatial_dim_remain_up = ALIGIN_UP(spatial_dim_remain, 64);
half* input_remain = ((half*) input) + loop_time * spatial_dim_down * channels;
half* output_remain = ((half*) output) + loop_time * spatial_dim_down * channels;
// NOTE(review): only spatial_dim_remain rows are copied in, but the
// transpose below covers spatial_dim_remain_up rows — confirm the padded
// region of src is zeroed beforehand.
__memcpy(src, input_remain, channels * sizeof(half), GDRAM2NRAM, channels_up * sizeof(half),
channels * sizeof(half), spatial_dim_remain - 1);
__bang_transpose(trans_data, src, spatial_dim_remain_up, channels_up);
kernel_channel_max(channels, spatial_dim_remain_up, trans_data, src);
kernel_channel_subtract(channels, spatial_dim_remain_up, src, trans_data);
kernel_exp(spatial_dim_remain_up * channels, trans_data, trans_data);
kernel_channel_sum(channels, spatial_dim_remain_up, trans_data, src);
kernel_channel_div(channels, spatial_dim_remain_up, src, trans_data);
// NOTE(review): the main loop transposes with channels_up here, but this
// branch passes channels — verify the asymmetry is intentional.
__bang_transpose(src, trans_data, channels, spatial_dim_remain_up);
__memcpy(output_remain, src, channels * sizeof(half), NRAM2GDRAM, channels * sizeof(half),
channels_up * sizeof(half), spatial_dim_remain - 1);
}}本节以l2loss算子为例,介绍 CUDA 算子转 BANG C 算子开发过程的一些基本思路。
CUDA Kernel code:
// Per-block L2 loss: each block reduces its threads' squared differences in
// shared memory and writes 0.5 * partial_sum to out[blockIdx.x].
// Requires blockDim.x to be a power of two for the tree reduction, and
// dynamic shared memory of blockDim.x * sizeof(float).
// Fix: the extraction collapsed several lines ("sdata[tid] = (i 0; ...");
// reconstructed per the standard CUDA-samples shared-memory reduction:
// guarded load, barrier, then halving-stride tree reduction.
__global__ void _Cu_l2loss(const float* a, const float* b, int n, float* out) {
cg::thread_block cta = cg::this_thread_block();
// Template argument <float> restored (lost in extraction).
float *sdata = SharedMemory<float>();
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
// Guard the global loads so out-of-range threads contribute zero.
float error = (i < n) ? a[i] - b[i] : 0.0f;
error = error * error;
sdata[tid] = error;
cg::sync(cta);
// Tree reduction over shared memory; barrier between every level.
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
cg::sync(cta);
}
if (tid == 0) {
out[blockIdx.x] = 0.5f * sdata[0];
}}BANG C Kernel code:
// MLU entry for L2 loss over half data, split across taskDim cores.
// NOTE(review): the #define lines below were fused onto one line by the web
// extraction (newlines lost); re-split them when restoring the original
// source. ALIGN_DOWN_TO as written rounds exact multiples DOWN one step
// ((x-1)/num*num) — confirm against the original.
#define NFU_ALIGN_SIZE 128#define DDR_ALIGN_MAP3 (1024 * 16)#define MAX_NRAM_SIZE (1024 * NFU_ALIGN_SIZE * 3)#define SIZE_NRAM_BUF ((MAX_NRAM_SIZE) / 2)#define ALIGIN_UP(x, num) (((x - 1) / num + 1) * num)#define ALIGN_DOWN_TO(x, num) (((x - 1) / num) * num)__mlu_entry__ void kernelL2loss(half *input_a,
half *input_b,
half *output,
const uint32_t size) {
// Two NRAM buffers for the a/b tiles plus one SRAM buffer used to exchange
// per-core partial sums.
__nram__ char nram_buffer_a[SIZE_NRAM_BUF/2];
__nram__ char nram_buffer_b[SIZE_NRAM_BUF/2];
__mlu_shared__ char sram_buffer[SIZE_NRAM_BUF];
uint32_t ALIGN_T_SIZE = NFU_ALIGN_SIZE / sizeof(half);
uint32_t num_total = size;
// Even split across cores; the remainder is handled by the last core below.
uint32_t num_per_core = num_total / taskDim;
uint32_t rem_for_all = num_total % taskDim;
uint32_t dealsize = (DDR_ALIGN_MAP3 / sizeof(half));
dealsize = ALIGN_DOWN_TO(dealsize, ALIGN_T_SIZE);
half *nram_a = (half *)nram_buffer_a;
half *nram_b = (half *)nram_buffer_b;
half *sram_sum = (half *)sram_buffer;
half sum = 0.0;
uint32_t repeat = num_per_core / dealsize;
uint32_t rem = num_per_core % dealsize;
uint32_t align_rem = ALIGIN_UP(rem, ALIGN_T_SIZE);
// for every taskId
half * _addr_a = (half *)input_a + taskId * num_per_core;
half * _addr_b = (half *)input_b + taskId * num_per_core;
// Loop compute NFU aligned part
// NOTE(review): the body of this loop — and the remainder-tile handling up
// to the "if (rem_for_all > 0)" check — was lost in extraction; the line
// below ("for(int i=0; i 0) {") is not valid code. Recover the copy-in /
// sub / mul / reduce sequence from the linked full source.
for(int i=0; i 0) {
if (taskId == taskDim - 1) {
// The last core handles the elements that did not divide evenly,
// scalar-by-scalar directly from GDRAM.
for(int i=0; i<rem_for_all; i++) {
half err = input_a[num_per_core * taskDim + i] - input_b[num_per_core * taskDim + i];
err = err * err;
sum = sum + err;
}
}
}
// Publish this core's partial sum, then synchronize the cluster before the
// final core reads all partials.
sram_sum[taskId] = sum;
__sync_cluster();
// Handle sram_sum & copy out
if (taskId == taskDim -1) {
__nramset(nram_a, taskDim, (half)0.0);
__memcpy(nram_a, sram_sum, taskDim * sizeof(half), SRAM2NRAM);
// Serial accumulation of the taskDim partial sums into nram_a[0].
for(int i = 1; i < taskDim; i++) {
nram_a[0] = nram_a[0] + nram_a[i];
}
__memcpy(output, nram_a, 1 * sizeof(half), NRAM2GDRAM);
// Apply the 0.5 factor of the L2 loss on the copied-out scalar.
output[0] = output[0] * 0.5;
}}本节会以boxencode算子为例,介绍 CUDA 算子转BANG C算子开发过程的一些基本思路。 本样例以输入数据格式为CHW,实现的部分代码如下,完整代码参见/path/to/code。
获取box坐标、类别和置信度
# CUDA Kernel code:
// Decodes ARM/ODM box regressions against prior boxes (CHW layout) and
// writes xmin/ymin/xmax/ymax into slots 0-3 of each 6-float norm_box entry.
// Fixes vs. the original listing: static_cast template arguments (<float>)
// restored (lost in extraction); exp -> expf to stay in float.
__global__ void kernel_box_encode(int w, int h, int channel, const float *arm_loc,
const float *odm_loc, const float *prior_box, float *norm_box) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < w * h * channel;
index += blockDim.x * gridDim.x) {
int priorn = channel / 4;
int boxstep = w * h;
int chanelStep = w * h * 4;
int pix = w * h;
// Decompose the flat index into (channel, y, x); each prior owns 4 channels.
int _c = index / pix;
int _h = (index % pix) / w;
int _w = (index % pix) % w;
int _p = _c / 4;
int i = _c % 4;
int box_index = _p * pix + _h * w + _w;
// Priors are (cx, cy, w, h) quadruples per pixel per prior; the matching
// variance block follows at offset w * h * channel.
int prior_index = ((_h * w + _w) * priorn * 4) + (_p * 4);
int v_index = prior_index + w * h * channel;
float pcx = prior_box[prior_index];
float pcy = prior_box[prior_index + 1];
float pw = prior_box[prior_index + 2];
float ph = prior_box[prior_index + 3];
int arm_index;
float arm_cx, arm_cy, arm_w, arm_h, odm_cx, odm_cy, odm_w, odm_h;
arm_index = _p * chanelStep + _h * w + _w;
arm_cx = arm_loc[arm_index + 0 * boxstep];
arm_cy = arm_loc[arm_index + 1 * boxstep];
arm_w = arm_loc[arm_index + 2 * boxstep];
arm_h = arm_loc[arm_index + 3 * boxstep];
odm_cx = odm_loc[arm_index + 0 * boxstep];
odm_cy = odm_loc[arm_index + 1 * boxstep];
odm_w = odm_loc[arm_index + 2 * boxstep];
odm_h = odm_loc[arm_index + 3 * boxstep];
// First decode stage: apply the ARM deltas to the prior box.
float decode_bbox_center_x = prior_box[v_index] * arm_cx * pw + pcx;
float decode_bbox_center_y = prior_box[v_index + 1] * arm_cy * ph + pcy;
float decode_bbox_width = expf(prior_box[v_index + 2] * arm_w) * pw;
float decode_bbox_height = expf(prior_box[v_index + 3] * arm_h) * ph;
// Second decode stage: refine with the ODM deltas.
float _decode_bbox_center_x = prior_box[v_index] * odm_cx * decode_bbox_width + decode_bbox_center_x;
float _decode_bbox_center_y = prior_box[v_index + 1] * odm_cy * decode_bbox_height + decode_bbox_center_y;
float _decode_bbox_width = expf(prior_box[v_index + 2] * odm_w) * decode_bbox_width;
float _decode_bbox_height = expf(prior_box[v_index + 3] * odm_h) * decode_bbox_height;
switch (i)
{
//0:xmin, 1:ymin, 2:xmax, 3:ymax
case 0:
norm_box[box_index * 6 + 0] = static_cast<float>(_decode_bbox_center_x - _decode_bbox_width / 2.f);
break;
case 1:
norm_box[box_index * 6 + 1] = static_cast<float>(_decode_bbox_center_y - _decode_bbox_height / 2.f);
break;
case 2:
norm_box[box_index * 6 + 2] = static_cast<float>(_decode_bbox_center_x + _decode_bbox_width / 2.f);
break;
case 3:
norm_box[box_index * 6 + 3] = static_cast<float>(_decode_bbox_center_y + _decode_bbox_height / 2.f);
break;
default:
break;
}
}}
// Picks the class id and score for each prior: if the ARM objectness score
// is below objThresh the box is forced to background (id 0, score 1);
// otherwise the best-scoring ODM class wins. Results go to slots 4/5.
__global__ void kernel_box_score(int w, int h, int channel, int classn, const float *arm_conf,
const float *odm_conf, const float objThresh, float *norm_box) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < w * h * channel / 4;
index += blockDim.x * gridDim.x) {
int armChanelStep = w * h * 2;
int odmChanelStep = w * h * classn;
int classStep = w * h;
int pix = w * h;
int _c = index / pix;
int _h = (index % pix) / w;
int _w = (index % pix) % w;
int armIndex = _c * armChanelStep + _h * w + _w;
int odmIndex = _c * odmChanelStep + _h * w + _w;
int boxIndex = _c * pix + _h * w + _w;
float maxScore = 0;
float maxid = 0;
if (arm_conf[armIndex + classStep] < objThresh) {
maxScore = 1;
maxid = 0;
}
else {
// Reconstructed loop (comparisons were lost in extraction): scan all
// classes and keep the highest ODM confidence.
for (int c = 0; c < classn; ++c) {
if (odm_conf[odmIndex + c * classStep] > maxScore) {
maxScore = odm_conf[odmIndex + c * classStep];
maxid = c;
}
}
}
norm_box[boxIndex * 6 + 4] = static_cast<float>(maxid);
norm_box[boxIndex * 6 + 5] = static_cast<float>(maxScore);
}}# BANG C Kernel code:
// MLU counterpart of the two CUDA kernels above: decodes boxes and scores in
// one pass. Data is loaded channel-major, split into per-field vectors with
// __bang_collect, decoded with vector ops, and written back to norm_box as
// (num_boxes, 6) = {xmin, ymin, xmax, ymax, class, score}.
// Fix: the template parameter list was lost in extraction; restored below.
// NOTE(review): several intrinsics here operate on half explicitly
// (__nramset, __bang_short2half, the (half) casts), so in practice T must be
// half — confirm before instantiating with another type.
template <typename T>
__mlu_func__ void box_encode_kernel(T* arm_loc, T* odm_loc, T* arm_conf, T* odm_conf, T* prior_box, T* mask, T* nram_buffer,
T* norm_box, int src_w, int src_h, int src_c, int classn, float objThresh) {
T* arm_loc_n = &nram_buffer[0];
int total_num = src_w * src_h * src_c;
int priorn = src_c / 4;
int src_hw_up = ALIGIN_UP(src_w * src_h, 64);
int total_num_up = src_hw_up * src_c;
// Carve the NRAM buffer into regions via pointer offsets.
T* arm_conf_n = arm_loc_n + total_num_up;
T* mask1 = arm_conf_n + src_hw_up * priorn;
T* mask2 = mask1 + total_num_up;
T* mask3 = mask2 + total_num_up;
T* mask4 = mask3 + total_num_up;
T* odm_loc_n = mask4 + total_num_up;
T* odm_conf_n = odm_loc_n + total_num_up;
T* prior_box_n = odm_conf_n + src_c * src_hw_up;
T* arm_cx = prior_box_n + total_num_up * 2;
T* arm_cy = arm_cx + total_num_up / 4;
T* arm_w = arm_cy + total_num_up / 4;
T* arm_h = arm_w + total_num_up / 4;
T* odm_cx = arm_h + total_num_up / 4;
T* odm_cy = odm_cx + total_num_up / 4;
T* odm_w = odm_cy + total_num_up / 4;
T* odm_h = odm_w + total_num_up / 4;
T* pcx = odm_h + total_num_up / 4;
T* pcy = pcx + total_num_up / 4;
T* pw = pcy + total_num_up / 4;
T* ph = pw + total_num_up / 4;
T* pv1 = ph + total_num_up / 4;
T* pv2 = pv1 + total_num_up / 4;
T* pv3 = pv2 + total_num_up / 4;
T* pv4 = pv3 + total_num_up / 4;
// Stride-copy arm_conf and odm_conf on-chip for alignment. Only the second
// ARM class is ever used, so only that plane is copied.
__memcpy(arm_conf_n, arm_conf + src_w * src_h, src_w * src_h * sizeof(T), GDRAM2NRAM,
src_hw_up * sizeof(T), 2 * src_w * src_h * sizeof(T), priorn - 1);
__memcpy(odm_conf_n, odm_conf, src_w * src_h * sizeof(T), GDRAM2NRAM,
src_hw_up * sizeof(T), src_w * src_h * sizeof(T), src_c - 1);
// Stride copies convert arm_loc / odm_loc from CHW to HWC on the fly, which
// makes the later data selection and arithmetic easier.
for (int i = 0; i < src_c; i++) {
__memcpy(arm_loc_n + i, arm_loc + i * src_w * src_h, sizeof(T), GDRAM2NRAM,
src_c * sizeof(T), sizeof(T), src_w * src_h - 1);
__memcpy(odm_loc_n + i, odm_loc + i * src_w * src_h, sizeof(T), GDRAM2NRAM,
src_c * sizeof(T), sizeof(T), src_w * src_h - 1);
}
// mask actually holds mask1..mask4, used to pick out fields below. Since it
// can be generated ahead of time, it is built on the CPU and passed in.
__memcpy(mask1, mask, total_num_up * 4 * sizeof(T), GDRAM2NRAM);
__memcpy(prior_box_n, prior_box, total_num * sizeof(T), GDRAM2NRAM);
__memcpy(prior_box_n + total_num_up, prior_box + total_num, total_num * sizeof(T), GDRAM2NRAM);
// __bang_collect picks elements by mask. mask1 is total_num_up / 4 copies
// of {1,0,0,0}, mask2..mask4 analogously, so arm_loc, odm_loc and
// prior_box are split into their four interleaved fields.
__bang_collect(arm_cx, arm_loc_n, mask1, total_num_up);
__bang_collect(arm_cy, arm_loc_n, mask2, total_num_up);
__bang_collect(arm_w, arm_loc_n, mask3, total_num_up);
__bang_collect(arm_h, arm_loc_n, mask4, total_num_up);
__bang_collect(odm_cx, odm_loc_n, mask1, total_num_up);
__bang_collect(odm_cy, odm_loc_n, mask2, total_num_up);
__bang_collect(odm_w, odm_loc_n, mask3, total_num_up);
__bang_collect(odm_h, odm_loc_n, mask4, total_num_up);
__bang_collect(pcx, prior_box_n, mask1, total_num_up);
__bang_collect(pcy, prior_box_n, mask2, total_num_up);
__bang_collect(pw, prior_box_n, mask3, total_num_up);
__bang_collect(ph, prior_box_n, mask4, total_num_up);
__bang_collect(pv1, prior_box_n + total_num_up, mask1, total_num_up);
__bang_collect(pv2, prior_box_n + total_num_up, mask2, total_num_up);
__bang_collect(pv3, prior_box_n + total_num_up, mask3, total_num_up);
__bang_collect(pv4, prior_box_n + total_num_up, mask4, total_num_up);
// decode_bbox_center_x; arm_cx is no longer needed, so reuse it as storage.
__bang_mul(arm_cx, arm_cx, pw, total_num_up / 4);
__bang_mul(arm_cx, arm_cx, pv1, total_num_up / 4);
__bang_add(arm_cx, arm_cx, pcx, total_num_up / 4);
// decode_bbox_center_y, stored in arm_cy (no longer needed).
__bang_mul(arm_cy, arm_cy, ph, total_num_up / 4);
__bang_mul(arm_cy, arm_cy, pv2, total_num_up / 4);
__bang_add(arm_cy, arm_cy, pcy, total_num_up / 4);
// decode_bbox_width, stored in arm_w (no longer needed).
__bang_mul(arm_w, arm_w, pv3, total_num_up / 4);
__bang_active_exp(arm_w, arm_w, total_num_up / 4);
__bang_mul(arm_w, arm_w, pw, total_num_up / 4);
// decode_bbox_height, stored in arm_h (no longer needed).
__bang_mul(arm_h, arm_h, pv4, total_num_up / 4);
__bang_active_exp(arm_h, arm_h, total_num_up / 4);
__bang_mul(arm_h, arm_h, ph, total_num_up / 4);
// _decode_bbox_center_x, stored in odm_cx (no longer needed).
__bang_mul(odm_cx, odm_cx, arm_w, total_num_up / 4);
__bang_mul(odm_cx, odm_cx, pv1, total_num_up / 4);
__bang_add(odm_cx, odm_cx, arm_cx, total_num_up / 4);
// _decode_bbox_center_y, stored in odm_cy (no longer needed).
__bang_mul(odm_cy, odm_cy, arm_h, total_num_up / 4);
__bang_mul(odm_cy, odm_cy, pv2, total_num_up / 4);
__bang_add(odm_cy, odm_cy, arm_cy, total_num_up / 4);
// _decode_bbox_width, stored in odm_w (no longer needed).
__bang_mul(odm_w, odm_w, pv3, total_num_up / 4);
__bang_active_exp(odm_w, odm_w, total_num_up / 4);
__bang_mul(odm_w, odm_w, arm_w, total_num_up / 4);
// _decode_bbox_height, stored in odm_h (no longer needed).
__bang_mul(odm_h, odm_h, pv4, total_num_up / 4);
__bang_active_exp(odm_h, odm_h, total_num_up / 4);
__bang_mul(odm_h, odm_h, arm_h, total_num_up / 4);
// Final x/y/w/h; prior_box data is no longer needed, so the results are
// written into prior_box_n (kept contiguous for the copy-out).
__bang_mul_const(odm_w, odm_w, (T)0.5, total_num_up / 4);
__bang_sub(prior_box_n, odm_cx, odm_w, total_num_up / 4);
__bang_mul_const(odm_h, odm_h, (T)0.5, total_num_up / 4);
__bang_sub(prior_box_n + total_num_up / 4, odm_cy, odm_h, total_num_up / 4);
__bang_add(prior_box_n + 2 * total_num_up / 4, odm_cx, odm_w, total_num_up / 4);
__bang_add(prior_box_n + 3 * total_num_up / 4, odm_cy, odm_h, total_num_up / 4);
// Class and confidence come next; zero their destination slots first.
__bang_write_zero(prior_box_n + total_num_up, 2 * total_num_up / 4);
// Reuse mask1 to build a 64-element objThresh vector and compare it
// cyclically against arm_conf_n (result is a 0/1 mask).
__nramset(mask1, 64, (half) objThresh);
__bang_cycle_lt(arm_conf_n, arm_conf_n, mask1, src_hw_up * priorn, 64);
// __bang_maxpool_index yields the best class per position; odm_conf_n is
// effectively (priorn, classn, src_hw_up), hence the (classn, 1) kernel.
half* index_ = prior_box_n + 4 * (total_num_up / 4);
unsigned short* index_us = (unsigned short* )index_;
__bang_maxpool_index(index_us, odm_conf_n, src_hw_up, priorn, classn, 1, classn, 1, 1);
// Convert the class indices to half (into mask2) for later arithmetic.
__bang_short2half(mask2, (short*)index_us, src_hw_up * priorn);
// __bang_maxpool yields the best confidence per position.
__bang_maxpool(mask3, odm_conf_n, src_hw_up, priorn, classn, 1, classn, 1, 1);
// The ARM result takes priority over ODM: mask1 = 1 - arm_flag
// (via add -1 then negate), which zeroes ODM class/score where ARM fired,
// then the ARM flag itself (class 0, score 1) is added back in.
__bang_add_const(mask1, arm_conf_n, (half) -1, src_hw_up * priorn);
__bang_mul_const(mask1, mask1, (half) -1, src_hw_up * priorn);
__bang_mul(mask2, mask2, mask1, src_hw_up * priorn);
__bang_mul(mask3, mask3, mask1, src_hw_up * priorn);
__bang_add(arm_conf_n, arm_conf_n, mask3, src_hw_up * priorn);
// Gather the final class/score planes into prior_box_n, converting CHW to
// HWC with stride copies.
for (int i = 0; i < priorn; i++) {
__memcpy(prior_box_n + 4 * (total_num_up / 4) + i, mask2 + i * src_hw_up, sizeof(T),
NRAM2NRAM, priorn * sizeof(T), sizeof(T), src_w * src_h - 1);
__memcpy(prior_box_n + 5 * (total_num_up / 4) + i, arm_conf_n + i * src_hw_up, sizeof(T),
NRAM2NRAM, priorn * sizeof(T), sizeof(T), src_w * src_h - 1);
}
// Copy out: the result is (6, num_boxes); stride copies transpose it to
// (num_boxes, 6).
for (int i = 0; i < 6; i++) {
__memcpy(norm_box + i, prior_box_n + i * (total_num_up / 4), sizeof(T), NRAM2GDRAM,
6 * sizeof(T), sizeof(T), total_num / 4 - 1);
}}热门帖子
精华帖子