Continuing from the previous post, BANG C Programming Basics: http://forum.cambricon.com/index.php?m=content&c=index&a=show&catid=33&id=677
In this post I will walk through how several operators commonly used in deep learning can be implemented in BANG C.
Taking the relu operator as the first example, here are some basic ideas for converting a CUDA kernel into a BANG C kernel.
CUDA Kernel code:
__global__ void _cuda_relu(const float* a, int total, float* b) {
  int tid = threadIdx.x + blockDim.x * blockIdx.x;
  int skip = blockDim.x * gridDim.x;
  for (int i = tid; i < total; i += skip) {
    float v = a[i];
    b[i] = max(v, float(0));
  }
}
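For reference, a host-side launch of this kernel might look like the sketch below. The wrapper name launchRelu and the grid/block sizes are illustrative assumptions, not part of the original post; because the kernel uses a grid-stride loop, the grid does not have to cover every element.

#include <cuda_runtime.h>

// Hypothetical host-side wrapper: copies data to the device, launches
// _cuda_relu, and copies the result back.
void launchRelu(const float* host_in, float* host_out, int total) {
  float *dev_in = nullptr, *dev_out = nullptr;
  cudaMalloc(&dev_in, total * sizeof(float));
  cudaMalloc(&dev_out, total * sizeof(float));
  cudaMemcpy(dev_in, host_in, total * sizeof(float), cudaMemcpyHostToDevice);

  // 256 threads x 128 blocks is just one reasonable configuration;
  // the grid-stride loop handles any remaining elements.
  _cuda_relu<<<128, 256>>>(dev_in, total, dev_out);

  cudaMemcpy(host_out, dev_out, total * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(dev_in);
  cudaFree(dev_out);
}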
BANG C Kernel code:
__mlu_entry__ void kernelRelu(half *X, half *Y, int size) {
  __nram__ half buffer[NRAM_HALF_SIZE];
  // The vector instruction operates on an element count aligned up to 64.
  int size_up = ALIGIN_UP(size, 64);
  // Copy the input from GDRAM into on-chip NRAM.
  __memcpy(buffer, X, size * sizeof(half), GDRAM2NRAM);
  // Apply relu in place on NRAM.
  __bang_active_relu(buffer, buffer, size_up);
  // Copy the result back from NRAM to GDRAM.
  __memcpy(Y, buffer, size * sizeof(half), NRAM2GDRAM);
}
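The ALIGIN_UP macro is not defined in the post, and NRAM_HALF_SIZE (like MAX_NRAM_SIZE further below) is assumed to be a compile-time constant sized to the NRAM capacity. Assuming ALIGIN_UP simply rounds a count up to the nearest multiple of the alignment, a plausible definition would be:

// Assumed definition: round x up to the nearest multiple of align (e.g. 64).
#define ALIGIN_UP(x, align) (((x) + (align) - 1) / (align) * (align))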
The abs operator illustrates the same CUDA-to-BANG C conversion process.
CUDA Kernel code:
__global__ void kernel_abs(const int count, const float* data, float* out) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < count;
       index += blockDim.x * gridDim.x) {
    out[index] = abs(data[index]);
  }
}
BANG C Kernel code:
__mlu_func__ void kernel_abs(int count, half* data, half* out) {
  __nram__ half nram_buffer[NRAM_HALF_SIZE];
  half* zeros = &nram_buffer[0];
  __bang_write_zero(zeros, 64);
  half* input = zeros + 64;
  int count_up = ALIGIN_UP(count, 64);
  half* mask = input + count_up;
  __memcpy(input, data, count * sizeof(half), GDRAM2NRAM);
  // Method 1:
  if (1) {
    // Build a mask marking which input elements are greater than 0
    __bang_cycle_gt(mask, input, zeros, count_up, 64);
    // Turn the 1s in the mask into 2s
    __bang_mul_const(mask, mask, (half) 2, count_up);
    // Subtract 1 so the 0s in the mask become -1 and the 2s become 1
    __bang_sub_const(mask, mask, (half) 1, count_up);
    // Multiply the mask with the original input to obtain the final result
    __bang_mul(input, input, mask, count_up);
  }
  // Method 2:
  else {
    // Multiply the original data by -1
    __bang_mul_const(mask, input, (half) -1, count_up);
    // Take the elementwise maximum of the original and negated data,
    // which is the final result
    __bang_maxequal(input, mask, input, count_up);
  }
  __memcpy(out, input, count * sizeof(half), NRAM2GDRAM);
}
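To see why Method 1 works, the same sign-mask arithmetic can be written out in scalar form on the host. This is only a reference sketch; the function name abs_reference is an assumption, not part of the post.

// Scalar reference for Method 1: (x > 0) becomes +1/-1 after the *2 and -1
// steps, and multiplying it back into x yields |x|.
// For x == 0 the mask is -1, but (-1) * 0 is still 0, so the result is correct.
void abs_reference(const float* data, float* out, int count) {
  for (int i = 0; i < count; ++i) {
    float gt   = (data[i] > 0.0f) ? 1.0f : 0.0f;  // mirrors __bang_cycle_gt
    float mask = gt * 2.0f - 1.0f;                // mirrors __bang_mul_const + __bang_sub_const
    out[i] = data[i] * mask;                      // mirrors __bang_mul
  }
}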
Finally, the clip operator provides one more example of converting a CUDA kernel into a BANG C kernel.
CUDA Kernel code:
__global__ void _cuda_clip(const float* a, int total, float minValue, float maxValue, float* b) {
  int tid = threadIdx.x + blockDim.x * blockIdx.x;
  int skip = blockDim.x * gridDim.x;
  for (int i = tid; i < total; i += skip) {
    float v = a[i];
    v = max(v, minValue);
    v = min(v, maxValue);
    b[i] = v;
  }
}
BANG C Kernel code:
// A function marked __mlu_func__ is an inline BANG C function and can only be called from the MLU side.
__mlu_func__ void clipForward(half *nram_in, half *nram_out, const int32_t num_align, const half min, const half max) {
  // Fill nram_out with the clip minimum, then use __bang_maxequal to raise
  // every value in nram_in that is smaller than min up to min.
  __nramset(nram_out, num_align, min);
  __bang_maxequal(nram_in, nram_in, nram_out, num_align);
  // Fill nram_out with the clip maximum, then use __bang_minequal to lower
  // every value in nram_in that is larger than max down to max;
  // the final result ends up in nram_out.
  __nramset(nram_out, num_align, max);
  __bang_minequal(nram_out, nram_in, nram_out, num_align);
}

__mlu_entry__ void kernelClip(half *input, half *output, const uint32_t size, const float min, const float max) {
  __nram__ char buffer[MAX_NRAM_SIZE];
  // Split the NRAM buffer into two halves: one for the input, one for the scratch/result.
  half* nram_in = (half *)buffer;
  half* nram_out = (half *)(buffer + MAX_NRAM_SIZE / 2);
  int size_up = ALIGIN_UP(size, 64);
  __memcpy(nram_in, input, size * sizeof(half), GDRAM2NRAM);
  clipForward(nram_in, nram_out, size_up, (half)min, (half)max);
  __memcpy(output, nram_out, size * sizeof(half), NRAM2GDRAM);
}
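As a sanity check on the two-step max-then-min approach, the same logic can be written as a scalar host-side reference. The function name clip_reference is an illustrative assumption.

// Scalar reference for clip: first raise values below min, then lower
// values above max, mirroring the __bang_maxequal / __bang_minequal steps.
void clip_reference(const float* in, float* out, int count, float min_v, float max_v) {
  for (int i = 0; i < count; ++i) {
    float v = in[i];
    v = (v > min_v) ? v : min_v;  // corresponds to __bang_maxequal against min
    v = (v < max_v) ? v : max_v;  // corresponds to __bang_minequal against max
    out[i] = v;
  }
}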