/************************************************************************* * Copyright (C) [2020] by Cambricon, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *************************************************************************/ #include #include #include #include #include #include #include #include #define M 4 #define N 128 #define P 128 #define ESP 0.01h float AF[M][P]; float BF[P][N]; half A[M][P]; half B[P][N]; float C[M][N]; half cpu_result[M][N]; half mlu_result[M][N]; bool isEqual(half a, half b) { if (fabs(a - b) < ESP) return true; return false; } __mlu_global__ void MatrixMul(half *dA, half *dB, half *dC, int m, int n, int p) { __nram__ half nA[M * P]; __nram__ half nB[P * N]; __nram__ half nTmpB[N * P]; __nram__ half nC[M * N]; __nram__ half nTmpC[P]; __memcpy(nA, dA, m * p * sizeof(half), GDRAM2NRAM); __memcpy(nB, dB, p * n * sizeof(half), GDRAM2NRAM); __bang_transpose(nTmpB, nB, p, n); __nramset(nC, m * n, 0.0h); for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { __bang_mul(nTmpC, nA + i * p, nTmpB + j * p, p); __bang_reduce_sum(nTmpC, nTmpC, p); // sum with 64 element stride, as __bang_reduce_sum() alignment is 128bit for (int k = 0; k < p / 64; k++) { nC[i * n + j] += nTmpC[k * 64]; } } } __memcpy(dC, nC, m * n * sizeof(half), NRAM2GDRAM); } int main() { cnrtDev_t dev; cnrtQueue_t queue; cnrtDim3_t dim = {1, 1, 1}; cnrtFunctionType_t ktype = CNRT_FUNC_TYPE_BLOCK; cnrtInit(0); cnrtGetDeviceHandle(&dev, 0); cnrtSetCurrentDevice(dev); cnrtCreateQueue(&queue); int sizeA = M * P; int sizeB = P * N; int sizeC = M * N; half *pA, *pB, *pC; srand((unsigned)time(NULL)); struct timeval start; struct timeval end; float total_time=0.0f; float time_usec=0.0f; CNRT_CHECK(cnrtMalloc((void **)&pA, sizeA * sizeof(half))); CNRT_CHECK(cnrtMalloc((void **)&pB, sizeB * sizeof(half))); CNRT_CHECK(cnrtMalloc((void **)&pC, sizeC * sizeof(half))); for(int i = 0; i<100000;i++){ for (int i = 0; i < M; i++) { for (int j = 0; j < P; j++) { AF[i][j] = (float)((rand() % 1000 - rand() % 1000) / 10000.0); A[i][j] = (half)AF[i][j]; } } for (int i = 0; i < P; i++) { for (int j = 0; j < N; j++) { BF[i][j] = (float)((rand() % 1000 - rand() % 1000) / 10000.0); B[i][j] = (half)BF[i][j]; } } gettimeofday(&start, NULL); CNRT_CHECK(cnrtMemcpy(pA, A, sizeA * sizeof(half), CNRT_MEM_TRANS_DIR_HOST2DEV)); CNRT_CHECK(cnrtMemcpy(pB, B, sizeB * sizeof(half), CNRT_MEM_TRANS_DIR_HOST2DEV)); MatrixMul<<>>(pA, pB, pC, M, N, P); CNRT_CHECK(cnrtSyncQueue(queue)); CNRT_CHECK(cnrtMemcpy(mlu_result, pC, sizeC * sizeof(half), CNRT_MEM_TRANS_DIR_DEV2HOST)); gettimeofday(&end, NULL); time_usec = ((float)((long)end.tv_usec - (long)start.tv_usec))/1000.0f; total_time+=time_usec; } printf("MLU Total Time: %f ms\n", total_time); CNRT_CHECK(cnrtFree(pA)); CNRT_CHECK(cnrtFree(pB)); CNRT_CHECK(cnrtFree(pC)); CNRT_CHECK(cnrtDestroyQueue(queue)); cnrtDestroy(); return 0; }