.mlu代码还请提供下
.mlu代码还请提供下
#include "mlu.h"
/* Used to size the on-chip input buffers in resize_mlu (multiplied by ROUND
 * there); presumably the largest supported input dimension -- TODO confirm. */
#define INPUT_HEIGHT 1024
/* Used to size the per-core NRAM output buffer in resize_mlu. */
#define OUT_HEIGHT 256
/* Number of input rows each core loads and processes per loop iteration. */
#define ROUND 32
__mlu_func__ void resize_mlu(int* in_array, int* out_array, uint32_t height, uint32_t width, uint32_t out_height, uint32_t out_width) {
int quotient = height / out_height;
__nram__ int outputNRAM[OUT_HEIGHT * ROUND / 4];
__nram__ int inputNRAM[INPUT_HEIGHT * ROUND];
__mlu_shared__ int inputSRAM[INPUT_HEIGHT * ROUND * 4];
int size_int = sizeof(int);
int all_round = width / (ROUND * taskDim);
for (int i = 0; i < all_round; i++) {
__memcpy(inputSRAM, in_array + ROUND * (i * taskDim + clusterId * 4) * width, width * ROUND * 4 *size_int, GDRAM2SRAM);
__sync_cluster();
__memcpy(inputNRAM, inputSRAM + ROUND * coreId * width , width * ROUND * size_int, SRAM2NRAM);
int i_height = ROUND/ quotient;
for(int j = 0; j < i_height; j++) {
for(int k =0; k < out_width; k ++) {
outputNRAM[j * i_height + k] = inputNRAM[(quotient * (j + 1) - 1)* height + quotient * (k + 1) - 1];
}
}
__memcpy(out_array + ROUND / quotient * (i * taskDim + taskId) * out_width, outputNRAM, OUT_HEIGHT * ROUND / 4 * size_int, NRAM2GDRAM);
__sync_cluster();
}
}
/*
 * Kernel entry point: forwards all arguments unchanged to resize_mlu.
 *
 *   src         source image buffer (passed as in_array).
 *   dst         destination image buffer (passed as out_array).
 *   src_height  source height, src_width source width.
 *   dst_height  destination height, dst_width destination width.
 */
__mlu_entry__ void resize(int *src, int *dst, uint32_t src_height, uint32_t src_width, uint32_t dst_height, uint32_t dst_width){
resize_mlu(src, dst, src_height, src_width, dst_height, dst_width);
}
请登录后评论