This post continues from http://forum.cambricon.com/index.php?m=content&c=index&a=show&catid=33&id=678
We continue to use several operators commonly seen in deep learning as examples to study the BANG C operator development process.
This section uses the softmax operator as an example to introduce some basic ideas for porting a CUDA operator to a BANG C operator.
The sample assumes the input data is in HWC format and that softmax is computed along the C dimension. Part of the implementation is shown below; see /path/to/code for the complete code.
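Before porting, it helps to keep a plain CPU reference of the computation at hand. The sketch below (the function name and float precision are mine, not part of the sample) computes softmax along C for HWC data and can be used to check the MLU results.

#include <cfloat>
#include <cmath>

// Reference softmax over the C dimension of HWC data (illustrative, float precision).
// The MLU kernels below compute the same thing in half precision on tiled data.
void softmax_hwc_reference(const float* input, float* output,
                           int channels, int height, int width) {
  int spatial_dim = height * width;
  for (int s = 0; s < spatial_dim; ++s) {
    const float* in = input + s * channels;
    float* out = output + s * channels;
    float maxval = -FLT_MAX;
    for (int c = 0; c < channels; ++c) maxval = std::fmax(maxval, in[c]);
    float sum = 0.f;
    for (int c = 0; c < channels; ++c) {
      out[c] = std::exp(in[c] - maxval);  // inputs to exp are <= 0
      sum += out[c];
    }
    for (int c = 0; c < channels; ++c) out[c] /= sum;
  }
}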
Get the maximum along the channel dimension
# CUDA Kernel code:
__global__ void kernel_channel_max(const int channels, const int spatial_dim,
                                   const float* data, float* out) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < spatial_dim;
       index += blockDim.x * gridDim.x) {
    float maxval = -FLT_MAX;
    for (int c = 0; c < channels; ++c) {
      maxval = max(data[index * channels + c], maxval);
    }
    out[index] = maxval;
  }
}
# BANG C Kernel code:
__mlu_func__ void kernel_channel_max(int channels, int spatial_dim, half* data, half* out) {
  // View the input of shape (channels, height, width) as (channels, 1, height * width),
  // then use the maxpool primitive to obtain the per-position maximum.
  __bang_maxpool(out, data, spatial_dim, channels, 1, channels, 1, 1, 1);
}
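As a mental model of what the maxpool call above is standing in for, the CPU sketch below computes the per-position channel maximum on channel-major (channels x spatial_dim) data, which is the layout the entry function below prepares via transpose. The exact parameter semantics of __bang_maxpool are described in the BANG C manual; this sketch only illustrates the intended result.

#include <cmath>

// Per-position maximum over the channel dimension of channel-major data (illustrative).
void channel_max_reference(const float* data, float* out,
                           int channels, int spatial_dim) {
  for (int s = 0; s < spatial_dim; ++s) {
    float maxval = data[s];  // c == 0
    for (int c = 1; c < channels; ++c)
      maxval = std::fmax(maxval, data[c * spatial_dim + s]);
    out[s] = maxval;
  }
}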
Subtract the maximum
# CUDA Kernel code:
__global__ void kernel_channel_subtract(const int channels, const int spatial_dim,
                                        const float* channel_max, float* data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < channels * spatial_dim;
       index += blockDim.x * gridDim.x) {
    int s = index / channels;
    data[index] -= channel_max[s];
  }
}
# BANG C Kernel code:
__mlu_func__ void kernel_channel_subtract(int channels, int spatial_dim, half* channel_max, half* data) {
  __bang_cycle_sub(data, data, channel_max, channels * spatial_dim, spatial_dim);
}
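Here __bang_cycle_sub acts as a broadcast subtraction: the cycle variants repeat the shorter operand (channel_max, spatial_dim elements) cyclically over the longer one (see the BANG C manual for the exact signature). On channel-major data this amounts to the CPU sketch below (illustrative only).

// Subtract the per-position maximum from every channel row (channel-major layout).
void channel_subtract_reference(float* data, const float* channel_max,
                                int channels, int spatial_dim) {
  for (int i = 0; i < channels * spatial_dim; ++i)
    data[i] -= channel_max[i % spatial_dim];
}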
Compute the exponential
# CUDA Kernel code:
__global__ void kernel_exp(const int count, const float* data, float* out) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count;
       index += blockDim.x * gridDim.x) {
    out[index] = exp(data[index]);
  }
}
# BANG C Kernel code:
__mlu_func__ void kernel_exp(int count, half* data, half* out) {
  // All inputs to the exponential are less than or equal to 0 (the channel maximum has
  // already been subtracted), so the interface below can be used for better precision.
  __bang_active_exp_less_0(out, data, count);
}
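The precision remark in the comment relies on the previous step: once the per-position maximum has been subtracted, every input to the exponential is non-positive, so the result stays in (0, 1] and cannot overflow in half precision. A minimal CPU check of this invariant (illustrative):

#include <cassert>
#include <cmath>

// Sanity check: after max-subtraction every element is <= 0, so exp(x) lies in (0, 1].
void check_nonpositive_then_exp(float* data, int count) {
  for (int i = 0; i < count; ++i) {
    assert(data[i] <= 0.0f);
    data[i] = std::exp(data[i]);  // result in (0, 1]
  }
}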
Sum along the channel dimension
# CUDA Kernel code:
__global__ void kernel_channel_sum(const int channels, const int spatial_dim,
                                   const float* data, float* channel_sum) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < spatial_dim;
       index += blockDim.x * gridDim.x) {
    float sum = 0;
    for (int c = 0; c < channels; ++c) {
      sum += data[index * channels + c];
    }
    channel_sum[index] = sum;
  }
}
# BANG C Kernel code:
__mlu_func__ void kernel_channel_sum(int channels, int spatial_dim, half* data, half* channel_sum) {
  // Similar to getting the maximum: use the sumpool primitive to sum over channels.
  __bang_sumpool(channel_sum, data, spatial_dim, channels, 1, channels, 1, 1, 1);
}
Normalize along the channel dimension
# CUDA Kernel code:
__global__ void kernel_channel_div(const int channels, const int spatial_dim,
                                   const float* channel_sum, float* data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < channels * spatial_dim;
       index += blockDim.x * gridDim.x) {
    int s = index / channels;
    data[index] /= channel_sum[s];
  }
}
# BANG C Kernel code:
__mlu_func__ void kernel_channel_div(int channels, int spatial_dim, half* channel_sum, half* data) {
  __bang_active_recip(channel_sum, channel_sum, spatial_dim);
  __bang_cycle_mul(data, data, channel_sum, channels * spatial_dim, spatial_dim);
}
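The division in the CUDA kernel is expressed here as a reciprocal of the per-position sums followed by a cyclic multiply. On channel-major data this is equivalent to the CPU sketch below (illustrative):

// Normalize each channel row by the per-position sum (channel-major layout).
void channel_div_reference(float* data, float* channel_sum,
                           int channels, int spatial_dim) {
  for (int s = 0; s < spatial_dim; ++s)
    channel_sum[s] = 1.0f / channel_sum[s];       // __bang_active_recip
  for (int i = 0; i < channels * spatial_dim; ++i)
    data[i] *= channel_sum[i % spatial_dim];      // __bang_cycle_mul
}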
Entry function
# CUDA entry code:
void Softmax(float* input, float* output, int channels, int height, int width) {
  int spatial_dim = height * width;
  int size = spatial_dim * channels;
  // Temporary buffer of spatial_dim floats for the per-position max / sum. The original
  // post declared this with __shared__ inside host code, which does not compile, so a
  // device allocation is used here instead.
  float* temp_data;
  cudaMalloc(&temp_data, spatial_dim * sizeof(float));
  // The launch configurations were lost in the original post; typical placeholder values
  // are used below.
  int block = 256;
  int grid = (size + block - 1) / block;
  kernel_channel_max<<<grid, block>>>(channels, spatial_dim, input, temp_data);
  kernel_channel_subtract<<<grid, block>>>(channels, spatial_dim, temp_data, output);
  kernel_exp<<<grid, block>>>(size, output, output);
  kernel_channel_sum<<<grid, block>>>(channels, spatial_dim, output, temp_data);
  kernel_channel_div<<<grid, block>>>(channels, spatial_dim, temp_data, output);
  cudaFree(temp_data);
}
# BANG C entry code:
__mlu_entry__ void Softmax(half* input, half* output, int channels, int height, int width) {
  __nram__ half nram_buffer[NRAM_HALF_SIZE];
  // Softmax is computed along the channels dimension, so the data is also split with
  // the channels dimension as the basic unit.
  int channels_up = ALIGIN_UP(channels, 64);
  // The kernel functions need two equally sized buffers (the raw data has to be
  // transposed), and spatial_dim also has to be aligned; both constraints together
  // determine the number of loop iterations.
  int spatial_dim = NRAM_HALF_SIZE / channels_up / 2;
  int spatial_dim_down = ALIGN_DOWN_TO(spatial_dim, 64);
  int loop_time = height * width / spatial_dim_down;
  half* src = &nram_buffer[0];
  half* trans_data = src + spatial_dim_down * channels_up;
  for (int i = 0; i < loop_time; i++) {
    half* input_loop = ((half*) input) + i * spatial_dim_down * channels;
    half* output_loop = ((half*) output) + i * spatial_dim_down * channels;
    // The padded part has to be filled, so each group of channels elements is copied
    // into a channels_up-sized slot (zero padding for alignment), and this is repeated
    // spatial_dim_down - 1 more times.
    __memcpy(src, input_loop, channels * sizeof(half), GDRAM2NRAM,
             channels_up * sizeof(half), channels * sizeof(half), spatial_dim_down - 1);
    // Transpose the data from (spatial_dim_down, channels_up) to
    // (channels_up, spatial_dim_down) for the later computations. __bang_transpose does
    // not support in-place operation, so a second buffer trans_data of the same size as
    // src is needed.
    __bang_transpose(trans_data, src, spatial_dim_down, channels_up);
    // The data in src is no longer needed, so its space can be reused.
    kernel_channel_max(channels, spatial_dim_down, trans_data, src);
    kernel_channel_subtract(channels, spatial_dim_down, src, trans_data);
    kernel_exp(spatial_dim_down * channels, trans_data, trans_data);
    kernel_channel_sum(channels, spatial_dim_down, trans_data, src);
    kernel_channel_div(channels, spatial_dim_down, src, trans_data);
    // The final result is in trans_data; transpose it back before copying out.
    __bang_transpose(src, trans_data, channels_up, spatial_dim_down);
    __memcpy(output_loop, src, channels * sizeof(half), NRAM2GDRAM,
             channels * sizeof(half), channels_up * sizeof(half), spatial_dim_down - 1);
  }
  // If the total amount of data does not divide evenly, compute the remaining part.
  int spatial_dim_remain = height * width % spatial_dim_down;
  if (spatial_dim_remain != 0) {
    int spatial_dim_remain_up = ALIGIN_UP(spatial_dim_remain, 64);
    half* input_remain = ((half*) input) + loop_time * spatial_dim_down * channels;
    half* output_remain = ((half*) output) + loop_time * spatial_dim_down * channels;
    __memcpy(src, input_remain, channels * sizeof(half), GDRAM2NRAM,
             channels_up * sizeof(half), channels * sizeof(half), spatial_dim_remain - 1);
    __bang_transpose(trans_data, src, spatial_dim_remain_up, channels_up);
    kernel_channel_max(channels, spatial_dim_remain_up, trans_data, src);
    kernel_channel_subtract(channels, spatial_dim_remain_up, src, trans_data);
    kernel_exp(spatial_dim_remain_up * channels, trans_data, trans_data);
    kernel_channel_sum(channels, spatial_dim_remain_up, trans_data, src);
    kernel_channel_div(channels, spatial_dim_remain_up, src, trans_data);
    // The back transpose uses channels_up here (the original post wrote channels, which
    // does not match the channels_up row stride of the copy-out below).
    __bang_transpose(src, trans_data, channels_up, spatial_dim_remain_up);
    __memcpy(output_remain, src, channels * sizeof(half), NRAM2GDRAM,
             channels * sizeof(half), channels_up * sizeof(half), spatial_dim_remain - 1);
  }
}
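The entry function above relies on a few helper macros that are not shown in this excerpt. ALIGIN_UP and ALIGN_DOWN_TO match the definitions given in the l2loss example later in this post; NRAM_HALF_SIZE is the size of the NRAM buffer in half elements, and the value below is only an illustrative placeholder that has to be chosen according to the NRAM capacity of the target MLU.

// Helper macros assumed by the entry function above. ALIGIN_UP / ALIGN_DOWN_TO follow
// the definitions shown in the l2loss example below; NRAM_HALF_SIZE is illustrative.
#define ALIGIN_UP(x, num)     (((x - 1) / num + 1) * num)
#define ALIGN_DOWN_TO(x, num) (((x - 1) / num) * num)
#define NRAM_HALF_SIZE        (128 * 1024)  // number of half elements, placeholder value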
This section uses the l2loss operator as an example to introduce some basic ideas for porting a CUDA operator to a BANG C operator.
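The operator itself is simple: out = 0.5 * sum((a[i] - b[i])^2). A plain CPU reference (names and float precision are mine) is useful as a baseline when porting:

// Reference L2 loss (illustrative): 0.5 * sum((a[i] - b[i])^2). The MLU kernel below
// computes the same quantity in half precision, split across taskDim cores.
float l2loss_reference(const float* a, const float* b, int n) {
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) {
    float err = a[i] - b[i];
    sum += err * err;
  }
  return 0.5f * sum;
}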
# CUDA Kernel code:
__global__ void _Cu_l2loss(const float* a, const float* b, int n, float* out) {
  cg::thread_block cta = cg::this_thread_block();
  // SharedMemory<T> is the dynamic shared-memory helper used in the CUDA reduction sample.
  float* sdata = SharedMemory<float>();
  int tid = threadIdx.x;
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  float error = (i < n) ? (a[i] - b[i]) : 0.0f;
  error = error * error;
  sdata[tid] = (i < n) ? error : 0.0f;
  cg::sync(cta);
  // Tree reduction in shared memory.
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) {
      sdata[tid] += sdata[tid + s];
    }
    cg::sync(cta);
  }
  if (tid == 0) {
    out[blockIdx.x] = 0.5 * sdata[0];
  }
}
# BANG C Kernel code:
#define NFU_ALIGN_SIZE 128
#define DDR_ALIGN_MAP3 (1024 * 16)
#define MAX_NRAM_SIZE (1024 * NFU_ALIGN_SIZE * 3)
#define SIZE_NRAM_BUF ((MAX_NRAM_SIZE) / 2)
#define ALIGIN_UP(x, num) (((x - 1) / num + 1) * num)
#define ALIGN_DOWN_TO(x, num) (((x - 1) / num) * num)

__mlu_entry__ void kernelL2loss(half *input_a, half *input_b, half *output, const uint32_t size) {
  __nram__ char nram_buffer_a[SIZE_NRAM_BUF / 2];
  __nram__ char nram_buffer_b[SIZE_NRAM_BUF / 2];
  __mlu_shared__ char sram_buffer[SIZE_NRAM_BUF];
  uint32_t ALIGN_T_SIZE = NFU_ALIGN_SIZE / sizeof(half);
  uint32_t num_total = size;
  uint32_t num_per_core = num_total / taskDim;
  uint32_t rem_for_all = num_total % taskDim;
  uint32_t dealsize = (DDR_ALIGN_MAP3 / sizeof(half));
  dealsize = ALIGN_DOWN_TO(dealsize, ALIGN_T_SIZE);
  half *nram_a = (half *)nram_buffer_a;
  half *nram_b = (half *)nram_buffer_b;
  half *sram_sum = (half *)sram_buffer;
  half sum = 0.0;
  uint32_t repeat = num_per_core / dealsize;
  uint32_t rem = num_per_core % dealsize;
  uint32_t align_rem = ALIGIN_UP(rem, ALIGN_T_SIZE);
  // Input offsets for this taskId.
  half *_addr_a = (half *)input_a + taskId * num_per_core;
  half *_addr_b = (half *)input_b + taskId * num_per_core;
  // Loop over the NFU-aligned chunks handled by this core.
  // NOTE: the body of this loop and the handling of the per-core remainder were lost in
  // the original forum post; the code below is a reconstruction of the intended
  // computation using the same primitives as the rest of the kernel.
  for (int i = 0; i < repeat; i++) {
    __memcpy(nram_a, _addr_a + i * dealsize, dealsize * sizeof(half), GDRAM2NRAM);
    __memcpy(nram_b, _addr_b + i * dealsize, dealsize * sizeof(half), GDRAM2NRAM);
    __bang_sub(nram_a, nram_a, nram_b, dealsize);  // a - b
    __bang_mul(nram_a, nram_a, nram_a, dealsize);  // (a - b)^2
    for (int j = 0; j < dealsize; j++) {
      sum = sum + nram_a[j];
    }
  }
  // Per-core remainder (rem elements, padded up to align_rem), reconstructed likewise.
  if (rem > 0) {
    __nramset(nram_a, align_rem, (half)0.0);
    __nramset(nram_b, align_rem, (half)0.0);
    __memcpy(nram_a, _addr_a + repeat * dealsize, rem * sizeof(half), GDRAM2NRAM);
    __memcpy(nram_b, _addr_b + repeat * dealsize, rem * sizeof(half), GDRAM2NRAM);
    __bang_sub(nram_a, nram_a, nram_b, align_rem);
    __bang_mul(nram_a, nram_a, nram_a, align_rem);
    for (int j = 0; j < rem; j++) {
      sum = sum + nram_a[j];
    }
  }
  // Elements that do not divide evenly across cores are handled by the last core.
  if (rem_for_all > 0) {
    if (taskId == taskDim - 1) {
      for (int i = 0; i < rem_for_all; i++) {
        half err = input_a[num_per_core * taskDim + i] - input_b[num_per_core * taskDim + i];
        err = err * err;
        sum = sum + err;
      }
    }
  }
  sram_sum[taskId] = sum;
  __sync_cluster();
  // Reduce sram_sum across cores and copy the result out.
  if (taskId == taskDim - 1) {
    __nramset(nram_a, taskDim, (half)0.0);
    __memcpy(nram_a, sram_sum, taskDim * sizeof(half), SRAM2NRAM);
    for (int i = 1; i < taskDim; i++) {
      nram_a[0] = nram_a[0] + nram_a[i];
    }
    __memcpy(output, nram_a, 1 * sizeof(half), NRAM2GDRAM);
    output[0] = output[0] * 0.5;
  }
}
This section uses the boxencode operator as an example to introduce some basic ideas for porting a CUDA operator to a BANG C operator. The sample assumes the input data is in CHW format. Part of the implementation is shown below; see /path/to/code for the complete code.
Get the box coordinates, class and confidence
# CUDA Kernel code:
__global__ void kernel_box_encode(int w, int h, int channel, const float *arm_loc,
                                  const float *odm_loc, const float *prior_box,
                                  float *norm_box) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < w * h * channel;
       index += blockDim.x * gridDim.x) {
    int priorn = channel / 4;
    int boxstep = w * h;
    int chanelStep = w * h * 4;
    int pix = w * h;
    int _c = index / pix;
    int _h = (index % pix) / w;
    int _w = (index % pix) % w;
    int _p = _c / 4;
    int i = _c % 4;
    int box_index = _p * pix + _h * w + _w;
    int prior_index = ((_h * w + _w) * priorn * 4) + (_p * 4);
    int v_index = prior_index + w * h * channel;
    float pcx = prior_box[prior_index];
    float pcy = prior_box[prior_index + 1];
    float pw = prior_box[prior_index + 2];
    float ph = prior_box[prior_index + 3];
    int arm_index;
    float arm_cx, arm_cy, arm_w, arm_h, odm_cx, odm_cy, odm_w, odm_h;
    arm_index = _p * chanelStep + _h * w + _w;
    arm_cx = arm_loc[arm_index + 0 * boxstep];
    arm_cy = arm_loc[arm_index + 1 * boxstep];
    arm_w = arm_loc[arm_index + 2 * boxstep];
    arm_h = arm_loc[arm_index + 3 * boxstep];
    odm_cx = odm_loc[arm_index + 0 * boxstep];
    odm_cy = odm_loc[arm_index + 1 * boxstep];
    odm_w = odm_loc[arm_index + 2 * boxstep];
    odm_h = odm_loc[arm_index + 3 * boxstep];
    float decode_bbox_center_x = prior_box[v_index] * arm_cx * pw + pcx;
    float decode_bbox_center_y = prior_box[v_index + 1] * arm_cy * ph + pcy;
    float decode_bbox_width = exp(prior_box[v_index + 2] * arm_w) * pw;
    float decode_bbox_height = exp(prior_box[v_index + 3] * arm_h) * ph;
    float _decode_bbox_center_x =
        prior_box[v_index] * odm_cx * decode_bbox_width + decode_bbox_center_x;
    float _decode_bbox_center_y =
        prior_box[v_index + 1] * odm_cy * decode_bbox_height + decode_bbox_center_y;
    float _decode_bbox_width = exp(prior_box[v_index + 2] * odm_w) * decode_bbox_width;
    float _decode_bbox_height = exp(prior_box[v_index + 3] * odm_h) * decode_bbox_height;
    switch (i) {  // 0: xmin, 1: ymin, 2: xmax, 3: ymax
      case 0:
        norm_box[box_index * 6 + 0] = static_cast<float>(_decode_bbox_center_x - _decode_bbox_width / 2.);
        break;
      case 1:
        norm_box[box_index * 6 + 1] = static_cast<float>(_decode_bbox_center_y - _decode_bbox_height / 2.);
        break;
      case 2:
        norm_box[box_index * 6 + 2] = static_cast<float>(_decode_bbox_center_x + _decode_bbox_width / 2.);
        break;
      case 3:
        norm_box[box_index * 6 + 3] = static_cast<float>(_decode_bbox_center_y + _decode_bbox_height / 2.);
        break;
      default:
        break;
    }
  }
}

__global__ void kernel_box_score(int w, int h, int channel, int classn,
                                 const float *arm_conf, const float *odm_conf,
                                 const float objThresh, float *norm_box) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < w * h * channel / 4;
       index += blockDim.x * gridDim.x) {
    int armChanelStep = w * h * 2;
    int odmChanelStep = w * h * classn;
    int classStep = w * h;
    int pix = w * h;
    int _c = index / pix;
    int _h = (index % pix) / w;
    int _w = (index % pix) % w;
    int armIndex = _c * armChanelStep + _h * w + _w;
    int odmIndex = _c * odmChanelStep + _h * w + _w;
    int boxIndex = _c * pix + _h * w + _w;
    float maxScore = 0;
    float maxid = 0;
    if (arm_conf[armIndex + classStep] < objThresh) {
      maxScore = 1;
      maxid = 0;
    } else {
      // Search for the highest-scoring class (the loop bounds were garbled in the
      // original post and have been reconstructed here).
      for (int c = 0; c < classn; ++c) {
        if (odm_conf[odmIndex + c * classStep] > maxScore) {
          maxScore = odm_conf[odmIndex + c * classStep];
          maxid = c;
        }
      }
    }
    norm_box[boxIndex * 6 + 4] = static_cast<float>(maxid);
    norm_box[boxIndex * 6 + 5] = static_cast<float>(maxScore);
  }
}
# BANG C Kernel code:
template <typename T>
__mlu_func__ void box_encode_kernel(T* arm_loc, T* odm_loc, T* arm_conf, T* odm_conf,
                                    T* prior_box, T* mask, T* nram_buffer, T* norm_box,
                                    int src_w, int src_h, int src_c, int classn,
                                    float objThresh) {
  T* arm_loc_n = &nram_buffer[0];
  int total_num = src_w * src_h * src_c;
  int priorn = src_c / 4;
  int src_hw_up = ALIGIN_UP(src_w * src_h, 64);
  int total_num_up = src_hw_up * src_c;
  // Pre-allocate the required buffers by offsetting pointers into nram_buffer.
  T* arm_conf_n = arm_loc_n + total_num_up;
  T* mask1 = arm_conf_n + src_hw_up * priorn;
  T* mask2 = mask1 + total_num_up;
  T* mask3 = mask2 + total_num_up;
  T* mask4 = mask3 + total_num_up;
  T* odm_loc_n = mask4 + total_num_up;
  T* odm_conf_n = odm_loc_n + total_num_up;
  T* prior_box_n = odm_conf_n + src_c * src_hw_up;
  T* arm_cx = prior_box_n + total_num_up * 2;
  T* arm_cy = arm_cx + total_num_up / 4;
  T* arm_w = arm_cy + total_num_up / 4;
  T* arm_h = arm_w + total_num_up / 4;
  T* odm_cx = arm_h + total_num_up / 4;
  T* odm_cy = odm_cx + total_num_up / 4;
  T* odm_w = odm_cy + total_num_up / 4;
  T* odm_h = odm_w + total_num_up / 4;
  T* pcx = odm_h + total_num_up / 4;
  T* pcy = pcx + total_num_up / 4;
  T* pw = pcy + total_num_up / 4;
  T* ph = pw + total_num_up / 4;
  T* pv1 = ph + total_num_up / 4;
  T* pv2 = pv1 + total_num_up / 4;
  T* pv3 = pv2 + total_num_up / 4;
  T* pv4 = pv3 + total_num_up / 4;
  // For data alignment, copy arm_conf and odm_conf to NRAM with strided copies. Note
  // that only the second class of arm_conf is used, so only that class is copied.
  __memcpy(arm_conf_n, arm_conf + src_w * src_h, src_w * src_h * sizeof(T), GDRAM2NRAM,
           src_hw_up * sizeof(T), 2 * src_w * src_h * sizeof(T), priorn - 1);
  __memcpy(odm_conf_n, odm_conf, src_w * src_h * sizeof(T), GDRAM2NRAM,
           src_hw_up * sizeof(T), src_w * src_h * sizeof(T), src_c - 1);
  // With strided copies, arm_loc_n and odm_loc_n are converted from CHW to HWC while
  // being copied in, which makes the later data selection and computation easier.
  for (int i = 0; i < src_c; i++) {
    __memcpy(arm_loc_n + i, arm_loc + i * src_w * src_h, sizeof(T), GDRAM2NRAM,
             src_c * sizeof(T), sizeof(T), src_w * src_h - 1);
    __memcpy(odm_loc_n + i, odm_loc + i * src_w * src_h, sizeof(T), GDRAM2NRAM,
             src_c * sizeof(T), sizeof(T), src_w * src_h - 1);
  }
  // mask actually contains the data of mask1, mask2, mask3 and mask4, which are used to
  // pick out data for further computation. Since the masks can be generated in advance,
  // they are generated on the CPU and then copied to NRAM.
  __memcpy(mask1, mask, total_num_up * 4 * sizeof(T), GDRAM2NRAM);
  __memcpy(prior_box_n, prior_box, total_num * sizeof(T), GDRAM2NRAM);
  __memcpy(prior_box_n + total_num_up, prior_box + total_num, total_num * sizeof(T), GDRAM2NRAM);
  // Use __bang_collect to pick out the data. mask1 consists of total_num_up / 4 groups
  // of {1, 0, 0, 0}, and mask2, mask3, mask4 follow the same pattern, so the data of
  // arm_loc, odm_loc and prior_box can be split apart.
  __bang_collect(arm_cx, arm_loc_n, mask1, total_num_up);
  __bang_collect(arm_cy, arm_loc_n, mask2, total_num_up);
  __bang_collect(arm_w, arm_loc_n, mask3, total_num_up);
  __bang_collect(arm_h, arm_loc_n, mask4, total_num_up);
  __bang_collect(odm_cx, odm_loc_n, mask1, total_num_up);
  __bang_collect(odm_cy, odm_loc_n, mask2, total_num_up);
  __bang_collect(odm_w, odm_loc_n, mask3, total_num_up);
  __bang_collect(odm_h, odm_loc_n, mask4, total_num_up);
  __bang_collect(pcx, prior_box_n, mask1, total_num_up);
  __bang_collect(pcy, prior_box_n, mask2, total_num_up);
  __bang_collect(pw, prior_box_n, mask3, total_num_up);
  __bang_collect(ph, prior_box_n, mask4, total_num_up);
  __bang_collect(pv1, prior_box_n + total_num_up, mask1, total_num_up);
  __bang_collect(pv2, prior_box_n + total_num_up, mask2, total_num_up);
  __bang_collect(pv3, prior_box_n + total_num_up, mask3, total_num_up);
  __bang_collect(pv4, prior_box_n + total_num_up, mask4, total_num_up);
  // Compute decode_bbox_center_x; arm_cx is no longer needed, so store the result there.
  __bang_mul(arm_cx, arm_cx, pw, total_num_up / 4);
  __bang_mul(arm_cx, arm_cx, pv1, total_num_up / 4);
  __bang_add(arm_cx, arm_cx, pcx, total_num_up / 4);
  // Compute decode_bbox_center_y; arm_cy is no longer needed, so store the result there.
  __bang_mul(arm_cy, arm_cy, ph, total_num_up / 4);
  __bang_mul(arm_cy, arm_cy, pv2, total_num_up / 4);
  __bang_add(arm_cy, arm_cy, pcy, total_num_up / 4);
  // Compute decode_bbox_width; arm_w is no longer needed, so store the result there.
  __bang_mul(arm_w, arm_w, pv3, total_num_up / 4);
  __bang_active_exp(arm_w, arm_w, total_num_up / 4);
  __bang_mul(arm_w, arm_w, pw, total_num_up / 4);
  // Compute decode_bbox_height; arm_h is no longer needed, so store the result there.
  __bang_mul(arm_h, arm_h, pv4, total_num_up / 4);
  __bang_active_exp(arm_h, arm_h, total_num_up / 4);
  __bang_mul(arm_h, arm_h, ph, total_num_up / 4);
  // Compute _decode_bbox_center_x; odm_cx is no longer needed, so store the result there.
  __bang_mul(odm_cx, odm_cx, arm_w, total_num_up / 4);
  __bang_mul(odm_cx, odm_cx, pv1, total_num_up / 4);
  __bang_add(odm_cx, odm_cx, arm_cx, total_num_up / 4);
  // Compute _decode_bbox_center_y; odm_cy is no longer needed, so store the result there.
  __bang_mul(odm_cy, odm_cy, arm_h, total_num_up / 4);
  __bang_mul(odm_cy, odm_cy, pv2, total_num_up / 4);
  __bang_add(odm_cy, odm_cy, arm_cy, total_num_up / 4);
  // Compute _decode_bbox_width; odm_w is no longer needed, so store the result there.
  __bang_mul(odm_w, odm_w, pv3, total_num_up / 4);
  __bang_active_exp(odm_w, odm_w, total_num_up / 4);
  __bang_mul(odm_w, odm_w, arm_w, total_num_up / 4);
  // Compute _decode_bbox_height; odm_h is no longer needed, so store the result there.
  __bang_mul(odm_h, odm_h, pv4, total_num_up / 4);
  __bang_active_exp(odm_h, odm_h, total_num_up / 4);
  __bang_mul(odm_h, odm_h, arm_h, total_num_up / 4);
  // Compute the final x, y, w, h; prior_box_n is no longer needed, so store the results there.
  __bang_mul_const(odm_w, odm_w, (T)0.5, total_num_up / 4);
  __bang_sub(prior_box_n, odm_cx, odm_w, total_num_up / 4);
  __bang_mul_const(odm_h, odm_h, (T)0.5, total_num_up / 4);
  __bang_sub(prior_box_n + total_num_up / 4, odm_cy, odm_h, total_num_up / 4);
  __bang_add(prior_box_n + 2 * total_num_up / 4, odm_cx, odm_w, total_num_up / 4);
  __bang_add(prior_box_n + 3 * total_num_up / 4, odm_cy, odm_h, total_num_up / 4);
  // Compute the class and confidence. They need to be copied out later, so the results
  // are placed in contiguous memory for easier copying.
  __bang_write_zero(prior_box_n + total_num_up, 2 * total_num_up / 4);
  // Reuse the mask1 space to generate 64 copies of objThresh and compare with arm_conf_n.
  __nramset(mask1, 64, (half)objThresh);
  __bang_cycle_lt(arm_conf_n, arm_conf_n, mask1, src_hw_up * priorn, 64);
  // Use __bang_maxpool_index to get the class from odm_conf. Note that the actual shape
  // of odm_conf is (priorn, classn, src_hw_up), so a kernel of shape (classn, 1) is used.
  half* index_ = prior_box_n + 4 * (total_num_up / 4);
  unsigned short* index_us = (unsigned short*)index_;
  __bang_maxpool_index(index_us, odm_conf_n, src_hw_up, priorn, classn, 1, classn, 1, 1);
  // Use the mask2 space to convert the obtained class indices to half for later computation.
  __bang_short2half(mask2, (short*)index_us, src_hw_up * priorn);
  // Use __bang_maxpool to get the confidence from odm_conf.
  __bang_maxpool(mask3, odm_conf_n, src_hw_up, priorn, classn, 1, classn, 1, 1);
  // The arm_conf result takes priority over the odm_conf result, so the arm_conf result
  // is used as a mask to overwrite the odm_conf result.
  __bang_add_const(mask1, arm_conf_n, (half)-1, src_hw_up * priorn);
  __bang_mul_const(mask1, mask1, (half)-1, src_hw_up * priorn);
  __bang_mul(mask2, mask2, mask1, src_hw_up * priorn);
  __bang_mul(mask3, mask3, mask1, src_hw_up * priorn);
  __bang_add(arm_conf_n, arm_conf_n, mask3, src_hw_up * priorn);
  // Gather the final results into prior_box_n. Note that strided copies are used to
  // convert the results from CHW to HWC.
  for (int i = 0; i < priorn; i++) {
    __memcpy(prior_box_n + 4 * (total_num_up / 4) + i, mask2 + i * src_hw_up, sizeof(T),
             NRAM2NRAM, priorn * sizeof(T), sizeof(T), src_w * src_h - 1);
    __memcpy(prior_box_n + 5 * (total_num_up / 4) + i, arm_conf_n + i * src_hw_up, sizeof(T),
             NRAM2NRAM, priorn * sizeof(T), sizeof(T), src_w * src_h - 1);
  }
  // Finally copy the results out. The current result shape is (6, num_boxes), so strided
  // copies are used to convert it to (num_boxes, 6).
  for (int i = 0; i < 6; i++) {
    __memcpy(norm_box + i, prior_box_n + i * (total_num_up / 4), sizeof(T), NRAM2GDRAM,
             6 * sizeof(T), sizeof(T), total_num / 4 - 1);
  }
}
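The kernel expects the collect masks to be generated on the host and passed in, as noted in the comments above: mask1 through mask4 are stored back to back, and mask k keeps every fourth element starting at offset k. One possible host-side sketch is shown below; the host element type and the conversion to half before copying to the device are assumptions, and the full sample may differ in detail.

#include <vector>

// Build the four collect masks back to back: mask1 = {1,0,0,0, 1,0,0,0, ...},
// mask2 selects offset 1, and so on. Convert to half before copying to the device.
std::vector<float> generate_collect_masks(int total_num_up) {
  std::vector<float> mask(4 * total_num_up, 0.0f);
  for (int k = 0; k < 4; ++k)
    for (int i = k; i < total_num_up; i += 4)
      mask[k * total_num_up + i] = 1.0f;
  return mask;
}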