打开微信,使用扫一扫进入页面后,点击右上角菜单,
点击“发送给朋友”或“分享到朋友圈”完成分享
接上回BANG C编程基础: http://forum.cambricon.com/index.php?m=content&c=index&a=show&catid=33&id=677
接下来我就具体介绍几个深度学习中常用的算子如何用BANG C实现。
以relu算子为例,介绍 CUDA 算子转 BANG C 算子开发过程的一些基本思路。
CUDA Kernel code:
// Grid-stride ReLU kernel: b[i] = max(a[i], 0) for i in [0, total).
// The grid-stride loop makes the kernel correct for any launch
// configuration, including a single-block debug run.
//   a:     input array (device pointer)
//   total: number of elements
//   b:     output array (device pointer)
__global__ void _cuda_relu(const float* a, int total, float* b) {
  int tid = threadIdx.x + blockDim.x * blockIdx.x;
  int skip = blockDim.x * gridDim.x;  // total threads in the grid
  for (int i = tid; i < total; i += skip) {
    float v = a[i];
    // fmaxf is the single-precision device math function; it avoids the
    // int/double overload-resolution pitfalls of a plain max() call.
    b[i] = fmaxf(v, 0.0f);
  }
}
BANG C Kernel code:
// BANG C ReLU entry kernel: stages `size` half elements from GDRAM into
// on-chip NRAM, applies the vectorized ReLU primitive in place, and copies
// the results back out.
// NOTE(review): there is no check that size fits in NRAM_HALF_SIZE — this
// assumes the caller bounds `size` appropriately; confirm.
__mlu_entry__ void kernelRelu(half *X, half *Y, int size) {
__nram__ half buffer[NRAM_HALF_SIZE];
// Element count rounded up to a multiple of 64 for the vector op; the
// tail elements of `buffer` beyond `size` are uninitialized but their
// outputs are never copied back.
int size_up = ALIGIN_UP(size, 64);
__memcpy(buffer, X, size * sizeof(half), GDRAM2NRAM);
__bang_active_relu(buffer, buffer, size_up);
// Only the first `size` results are written back to GDRAM.
__memcpy(Y, buffer, size * sizeof(half), NRAM2GDRAM);}以abs算子为例,介绍 CUDA 算子转 BANG C 算子开发过程的一些基本思路。
CUDA Kernel code:
// Grid-stride absolute-value kernel: out[i] = |data[i]| for i in [0, count).
//   count: number of elements
//   data:  input array (device pointer)
//   out:   output array (device pointer)
__global__ void kernel_abs(const int count, const float* data, float* out) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count;
       index += blockDim.x * gridDim.x) {
    // fabsf is the single-precision device function. Plain abs() can
    // resolve to the C integer overload and silently truncate the float
    // argument — a classic CUDA float-kernel bug.
    out[index] = fabsf(data[index]);
  }
}
BANG C Kernel code:
// BANG C |x| device function (__mlu_func__ = inline, MLU-side only):
// loads `count` half elements into NRAM and computes the absolute value
// with vector primitives.
// NRAM layout: [64 zeros][count_up input][count_up mask scratch].
// NOTE(review): no check that 64 + 2*count_up <= NRAM_HALF_SIZE — confirm
// the caller bounds `count` accordingly.
__mlu_func__ void kernel_abs(int count, half* data, half* out) {
__nram__ half nram_buffer[NRAM_HALF_SIZE];
half* zeros = &nram_buffer[0];
// A 64-element block of zeros used as the comparison operand below.
__bang_write_zero(zeros, 64);
half* input = zeros + 64;
// Round the element count up to a multiple of 64 for the vector ops.
int count_up = ALIGIN_UP(count, 64);
half* mask = input + count_up;
__memcpy(input, data, count * sizeof(half), GDRAM2NRAM);
// Method 1 (selected by the constant condition; the `else` branch below
// documents an alternative approach and is intentionally dead code):
if (1) {
// Build a 0/1 mask of input elements that are greater than zero
__bang_cycle_gt(mask, input, zeros, count_up, 64);
// Turn the 1s in the mask into 2s
__bang_mul_const(mask, mask, (half) 2, count_up);
// Subtract 1, so mask 0 becomes -1 and 2 becomes 1 (mask is now the sign)
__bang_sub_const(mask, mask, (half) 1, count_up);
// Multiply the input by its sign mask to obtain the final |x|
__bang_mul(input, input, mask, count_up);
}
// Method 2:
else{
// Negate the input into `mask` (i.e. mask = -x)
__bang_mul_const(mask, input, (half) -1, count_up);
// Elementwise max of x and -x is |x|; result lands in `input`
__bang_maxequal(input, mask, input, count_up);
}
__memcpy(out, input, count * sizeof(half), NRAM2GDRAM);}以clip算子为例,介绍 CUDA 算子转 BANG C 算子开发过程的一些基本思路。
CUDA Kernel code:
// Grid-stride clip kernel: b[i] = min(max(a[i], minValue), maxValue).
//   a:        input array (device pointer)
//   total:    number of elements
//   minValue: lower clamp bound
//   maxValue: upper clamp bound
//   b:        output array (device pointer)
__global__ void _cuda_clip(const float* a, int total, float minValue, float maxValue, float* b) {
  int tid = threadIdx.x + blockDim.x * blockIdx.x;
  int skip = blockDim.x * gridDim.x;  // total threads in the grid
  for (int i = tid; i < total; i += skip) {
    // fminf/fmaxf are the single-precision device math functions; they
    // avoid the int/double overload-resolution pitfalls of plain min/max.
    b[i] = fminf(fmaxf(a[i], minValue), maxValue);
  }
}
BANG C Kernel code:
// A function marked __mlu_func__ is a BANG C inline function and can only
// be called from the MLU (device) side.
//
// clipForward: clamps num_align elements of nram_in into [min, max],
// leaving the final result in nram_out. nram_out doubles as scratch: it is
// broadcast-filled with the bound so the elementwise max/min primitives
// implement the clamp.
// (Extraction note: in the scraped source the signature below was fused
// into the comment above; only the line break was restored.)
__mlu_func__ void clipForward(float *nram_in,
float *nram_out,
const int32_t num_align,
const float min,
const float max) {
// Fill nram_out with the clip lower bound, then use __bang_maxequal to
// raise every nram_in element below `min` up to `min`.
__nramset((float *)nram_out, num_align, (float)min);
__bang_maxequal((float *)nram_in, (float *)nram_in, (float *)nram_out, num_align);
// Fill nram_out with the clip upper bound, then use __bang_minequal to
// lower every element above `max` down to `max`; the final result is
// left in nram_out.
__nramset((float *)nram_out, num_align, (float)max);
__bang_minequal((float *)nram_out, (float *)nram_in, (float *)nram_out, num_align);}
// Entry kernel: stages `size` half elements through NRAM, clips them via
// clipForward, and writes the clipped values back to GDRAM.
// NOTE(review): kernelClip traffics in half data but clipForward uses the
// float pointer types and float primitives — the (half *) casts below are
// type-inconsistent with clipForward's float* parameters; confirm against
// the original working source.
// NOTE(review): `char* nram_in = (half*)buffer;` assigns a half* to a
// char* and would not compile under strict C++ — likely an extraction or
// transcription error in this tutorial snippet.
__mlu_entry__ void kernelClip(half *input,
half *output,
const uint32_t size,
const float min,
const float max) {
__nram__ char buffer[MAX_NRAM_SIZE];
char* nram_in = (half*)buffer;
// Second half of the NRAM byte buffer serves as the scratch/output region.
char* nram_out = nram_in + MAX_NRAM_SIZE / 2;
// Element count rounded up to a multiple of 64 for the vector primitives.
int size_up = ALIGIN_UP(size, 64);
__memcpy((half *)nram_in, input, size * sizeof(half), GDRAM2NRAM);
clipForward((half *)nram_in, (half *)nram_out, size_up, min, max);
// Only the first `size` clipped results are copied back out.
__memcpy(output, (half *)nram_out, size * sizeof(half), NRAM2GDRAM);}热门帖子
精华帖子