打开微信,使用扫一扫进入页面后,点击右上角菜单,
点击“发送给朋友”或“分享到朋友圈”完成分享
本系列文章介绍优化YOLOv4网络的整个流程,涵盖了 BangC 算子开发,TensorFlow 自定义算子的注册及添加,PB模型的算子替换和性能精度验证。
实验环境:
MLU270 v1.4.0 release
Cambricon-Tensorflow 1.14
目录
1.PB模型的分析
2.Mish bangC算子的开发、精度验证与性能测试
3.TensorFlow 自定义算子的集成
4.PB网络模型的大算子替换
5.单算子与整体网络的精度验证
PB模型的分析
通过 darknet yolov4 weights 转换成 pb 模型后,利用开源模型可视化软件Netron查看,其中mish 算子的构成如下:

在 yolov4 中,需要运用到 72 个 mish 算子,而TF中暂时没有mish算子的实现,只能用 Exp+Log1p+Tanh+Mul 进行拼凑,然而在MLU270/220的架构下,这种拼凑算子形式会在IO上损耗资源,影响性能,所以全部替换成Mish大算子会有可观的性能提升。
Mish bangC算子的开发、精度验证与性能测试
设计原理:
1. Mish 算子属于激活算子,输入输出均为1维,所以可以先将数据做平坦化处理,再在 BangC 里面进行数据分块运算;
2. Mish 算子的公式为 mish(x) = x · tanh(ln(1 + eˣ)),bangc中可以使用 __bang_active_exp、__bang_add_const、__bang_active_log、__bang_active_tanh、__bang_mul 进行拼凑;


3. 数据分块计算
假设输入为长度为1000000的数据,自定义的数据分块尺寸为12800,先进行的是每个ipu core 的分块,将数据分成16+1份,其中16份的长度相同,剩下的对128向上取整;
然后是对每个ipu core 上的那份数据进行分块,每份长度为12800,剩下的对128向上取整;
这时候完成了数据分块,先对ipu core 上每份12800大小的数据进行多核计算,然后是对每个ipu core 上余下的进行计算,最后是对16份ipu core 分块时余下的进行单核计算;
代码实现:
BangC Mish 算子实现和单元测试,主要对单算子的实现、精度调试和性能评估:
// Mish activation kernel for MLU270: out[i] = x[i] * tanh(ln(1 + exp(x[i]))).
//
// NOTE(review): the published listing was mangled by HTML extraction — every
// '<' was stripped and the per-core main loop body was lost. The loop below is
// reconstructed from the algorithm described in the article text: the input is
// split evenly across taskDim IPU cores, each core processes its share in
// MISH_SIZE-element chunks, and remainders are computed on lengths aligned up
// to 128 (NRAM vector ops require 128-element alignment) but copied back with
// the exact element count.
//
// Params:
//   input_data_  - GDRAM pointer to fp16 input, length input_size
//   output_data_ - GDRAM pointer to fp16 output, length input_size
//   input_size   - total number of elements (flattened 1-D)
__mlu_entry__ void MishKernel_MLU270(half* input_data_, half* output_data_, int input_size){
  struct timeval start;
  struct timeval end;
  gettimeofday(&start, NULL); // on-device timing of the whole kernel
  __nram__ half nram_input[MISH_SIZE];
  __nram__ half tmp0[MISH_SIZE];
  uint32_t num_total = input_size;
  uint32_t num_per_core = num_total / taskDim;            // even share per IPU core
  uint32_t rem_for_all = num_total % taskDim;             // global tail, handled by core 0
  uint32_t align_rem_for_all = alignUp(rem_for_all, 128); // compute length, 128-aligned
  uint32_t dealsize = MISH_SIZE;                          // chunk size per iteration
  uint32_t repeat = num_per_core / dealsize;
  uint32_t rem = num_per_core % dealsize;                 // per-core tail
  uint32_t align_rem = alignUp(rem, 128);

  // Full-size chunks owned by this core (multi-core phase).
  for (int i = 0; i < repeat; i++){
    __memcpy(nram_input, input_data_ + taskId * num_per_core + i * dealsize,
             dealsize * sizeof(half), GDRAM2NRAM);
    // mish(x) = x * tanh(log(1 + exp(x)))
    __bang_active_exp(tmp0, nram_input, dealsize);
    __bang_add_const(tmp0, tmp0, 1.0, dealsize);
    __bang_active_log(tmp0, tmp0, dealsize);
    __bang_active_tanh(tmp0, tmp0, dealsize);
    __bang_mul(tmp0, nram_input, tmp0, dealsize);
    __memcpy(output_data_ + taskId * num_per_core + i * dealsize, tmp0,
             dealsize * sizeof(half), NRAM2GDRAM);
  }

  // Per-core remainder (< dealsize): compute on the aligned length,
  // copy back only the valid elements.
  if (rem > 0){
    __memcpy(nram_input, input_data_ + taskId * num_per_core + repeat * dealsize,
             rem * sizeof(half), GDRAM2NRAM);
    __bang_active_exp(tmp0, nram_input, align_rem);
    __bang_add_const(tmp0, tmp0, 1.0, align_rem);
    __bang_active_log(tmp0, tmp0, align_rem);
    __bang_active_tanh(tmp0, tmp0, align_rem);
    __bang_mul(tmp0, nram_input, tmp0, align_rem);
    __memcpy(output_data_ + taskId * num_per_core + repeat * dealsize, tmp0,
             rem * sizeof(half), NRAM2GDRAM);
  }

  // Global remainder from the taskDim split: single-core pass on core 0.
  // (The original listing also called __bang_active_relu here with the wrong
  // length `dealsize`; relu after softplus is a no-op since log(1+exp(x)) >= 0,
  // so it is dropped.)
  if (rem_for_all > 0){
    if (taskId == 0){
      __memcpy(nram_input, input_data_ + taskDim * num_per_core,
               rem_for_all * sizeof(half), GDRAM2NRAM);
      __bang_active_exp(tmp0, nram_input, align_rem_for_all);
      __bang_add_const(tmp0, tmp0, 1.0, align_rem_for_all);
      __bang_active_log(tmp0, tmp0, align_rem_for_all);
      __bang_active_tanh(tmp0, tmp0, align_rem_for_all);
      __bang_mul(tmp0, nram_input, tmp0, align_rem_for_all);
      __memcpy(output_data_ + taskDim * num_per_core, tmp0,
               rem_for_all * sizeof(half), NRAM2GDRAM);
    }
  }
  __sync_all(); // barrier across all task cores before reporting time
  gettimeofday(&end, NULL);
  // BUG FIX: the original subtracted only tv_usec, which wraps whenever the
  // kernel crosses a second boundary; include tv_sec in the difference.
  uint32_t time_usec =
      (uint32_t)((end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec));
  printf("Hardware Total Time: %u us\n", time_usec);
}
int main() {
const int data_count = DATA_COUNT*BATCH_SIZE;
int batch_num_ = BATCH_SIZE;
int core_num_ = NUM_MULTICORE;
float* data = (float*)malloc(data_count * sizeof(float));
float* cpu_out = (float*)malloc(data_count * sizeof(float));
for (int i = 0; i<DATA_COUNT;i++ ){
data[i] = (rand() % 100) / 50.0; //随机生成数据
}
mish_cpu(data, cpu_out, data_count); //计算CPU
cnrtInit(0); //初始化设备
cnrtDev_t dev;
cnrtGetDeviceHandle(&dev, 0);
cnrtSetCurrentDevice(dev);
cnrtQueue_t pQueue;
cnrtCreateQueue(&pQueue);
cnrtDim3_t dim = {NUM_MULTICORE,1,1};
cnrtFunctionType_t c;
switch (core_num_) {
case 1:
c = CNRT_FUNC_TYPE_BLOCK;
printf("task type = BLOCK\n");
break;
case 4:
c = CNRT_FUNC_TYPE_UNION1;
printf("task type = UNION1\n");
break;
case 16:
c = CNRT_FUNC_TYPE_UNION4;
printf("task type = UNION4\n");
break;
default:
exit(-1);
}
std::vector input_data;
std::vector output_data;
half *data_mlu, *out_data;
CNRT_CHECK(cnrtMalloc((void**)&data_mlu, data_count * sizeof(half)));
CNRT_CHECK(cnrtMalloc((void**)&out_data, data_count * sizeof(half)));
cnrtMemcpyFloatToHalf(data_mlu, data, data_count);
cnrtNotifier_t Notifier_start;
cnrtNotifier_t Notifier_end;
cnrtCreateNotifier(&Notifier_start);
cnrtCreateNotifier(&Notifier_end);
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
cnrtPlaceNotifier(Notifier_start, pQueue);
MishKernel_MLU270<<>>(data_mlu,out_data,data_count);//MLU执行运算
cnrtPlaceNotifier(Notifier_end, pQueue);
CNRT_CHECK(cnrtSyncQueue(pQueue));
gettimeofday(&end, NULL);
float time_use = ((end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec))/1000.0;
printf("time use: %.3f ms\n", time_use);
float* output_tmp = (float*)malloc(data_count * sizeof(float));
cnrtMemcpyHalfToFloat(output_tmp, (uint16_t *)out_data, data_count);
printf("MSE:%f \n", calc_mse(cpu_out, output_tmp, data_count));//计算MSE精度
FILE* mluOutputFile = fopen("./mluoutput.txt", "w");
FILE* cpuOutputFile = fopen("./cpuoutput.txt", "w");
for (int i = 0; i < data_count; i++) {
fprintf(mluOutputFile, "%f\n", output_tmp[i]);
fprintf(cpuOutputFile, "%f\n", cpu_out[i]);
}
fclose(mluOutputFile);
fclose(cpuOutputFile);
CNRT_CHECK(cnrtFree(data_mlu));
CNRT_CHECK(cnrtFree(out_data));
CNRT_CHECK(cnrtDestroyQueue(pQueue));
cnrtDestroyNotifier(&Notifier_start);
cnrtDestroyNotifier(&Notifier_end);
cnrtDestroy();
free(data);
free(output_tmp);
free(cpu_out);
}运行结果:

热门帖子
精华帖子