/*************************************************************************
 * Copyright (C) [2020] by Cambricon, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/

#include <bang.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>

/* Problem dimensions: A is M x P, B is P x N, and the product is M x N. */
#define M 4
#define N 128
#define P 128
/* Absolute tolerance used by isEqual(); the `h` suffix presumably marks a
 * half-precision literal in BANG C -- TODO confirm against the compiler docs. */
#define ESP 0.01h

/* Host-side matrices. AF/BF hold the float values generated in main();
 * A/B hold the same values converted to half and are what gets copied to
 * the device. */
float AF[M][P];
float BF[P][N];
half A[M][P];
half B[P][N];
/* NOTE(review): C and cpu_result are not referenced anywhere in the visible
 * source; presumably left over from a CPU reference check. */
float C[M][N];
half cpu_result[M][N];
/* Receives the kernel output copied back from the device in main(). */
half mlu_result[M][N];

/* Returns true when a and b differ by less than the ESP tolerance. */
bool isEqual(half a, half b) {
  return fabs(a - b) < ESP;
}
/*
 * Device kernel computing dC = dA * dB in half precision.
 *
 * dA: m x p input matrix, row-major, read as m * p halves.
 * dB: p x n input matrix, row-major, read as p * n halves.
 * dC: m x n output matrix, written as m * n halves.
 *
 * The NRAM scratch buffers below are sized from the compile-time constants
 * M, N and P, while the copy lengths and loop bounds use the runtime
 * m, n and p. Callers must therefore pass m <= M, n <= N and p <= P; nothing
 * in the kernel checks this.
 *
 * NOTE(review): the final accumulation loop runs p / 64 times (integer
 * division), so it appears to assume p is a multiple of 64 -- confirm.
 */
__mlu_global__ void MatrixMul(half *dA, half *dB, half *dC, int m, int n, int p) {


  __nram__ half nA[M * P];
  __nram__ half nB[P * N];
  __nram__ half nTmpB[N * P];
  __nram__ half nC[M * N];
  __nram__ half nTmpC[P];

  // Stage both inputs into NRAM.
  __memcpy(nA, dA, m * p * sizeof(half), GDRAM2NRAM);
  __memcpy(nB, dB, p * n * sizeof(half), GDRAM2NRAM);
  // Transpose B so that the data for output column j is read contiguously
  // from nTmpB + j * p in the loop below (presumably p x n -> n x p; confirm
  // the argument order against the __bang_transpose documentation).
  __bang_transpose(nTmpB, nB, p, n);
  // Zero the accumulator; the loop below adds into nC with +=.
  __nramset(nC, m * n, 0.0h);

  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      // Element-wise product of row i of A with the p halves at nTmpB + j * p.
      __bang_mul(nTmpC, nA + i * p, nTmpB + j * p, p);

      // In-place reduction; the partial sums are then read back below.
      __bang_reduce_sum(nTmpC, nTmpC, p);
      // Add up one value per 64-element stride. 64 halves is 128 bytes if
      // half is 2 bytes, so this presumably matches __bang_reduce_sum()
      // leaving one partial sum at the start of each 128-byte segment
      // (not 128 bits) -- TODO confirm against the BANG C documentation.
      for (int k = 0; k < p / 64; k++) {
        nC[i * n + j] += nTmpC[k * 64];
      }
    }
  }

  // Write the finished m x n result back to device global memory.
  __memcpy(dC, nC, m * n * sizeof(half), NRAM2GDRAM);


}

/*
 * Host driver: repeatedly fills A and B with random values, runs the
 * MatrixMul kernel on the device, and reports the accumulated wall-clock
 * time of the host->device copy, kernel launch, queue sync and
 * device->host copy.
 *
 * Returns 0 on completion. Every CNRT call that returns a status is routed
 * through CNRT_CHECK.
 */
int main() {
  cnrtDev_t dev;
  cnrtQueue_t queue;
  cnrtDim3_t dim = {1, 1, 1};
  cnrtFunctionType_t ktype = CNRT_FUNC_TYPE_BLOCK;

  /* Check the setup calls like every other CNRT call in this function, so
   * a failed init is reported instead of surfacing later as an unrelated
   * error. */
  CNRT_CHECK(cnrtInit(0));
  CNRT_CHECK(cnrtGetDeviceHandle(&dev, 0));
  CNRT_CHECK(cnrtSetCurrentDevice(dev));
  CNRT_CHECK(cnrtCreateQueue(&queue));

  int sizeA = M * P;
  int sizeB = P * N;
  int sizeC = M * N;
  half *pA, *pB, *pC;

  srand((unsigned)time(NULL));
  struct timeval start;
  struct timeval end;
  /* Accumulate in double: summing 100000 small samples in float loses
   * precision as the total grows. */
  double total_time = 0.0;
  const int iterations = 100000;

  CNRT_CHECK(cnrtMalloc((void **)&pA, sizeA * sizeof(half)));
  CNRT_CHECK(cnrtMalloc((void **)&pB, sizeB * sizeof(half)));
  CNRT_CHECK(cnrtMalloc((void **)&pC, sizeC * sizeof(half)));

  for (int iter = 0; iter < iterations; iter++) {
    /* Fresh random inputs each iteration; generation is outside the timed
     * region. */
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < P; j++) {
        AF[i][j] = (float)((rand() % 1000 - rand() % 1000) / 10000.0);
        A[i][j] = (half)AF[i][j];
      }
    }

    for (int i = 0; i < P; i++) {
      for (int j = 0; j < N; j++) {
        BF[i][j] = (float)((rand() % 1000 - rand() % 1000) / 10000.0);
        B[i][j] = (half)BF[i][j];
      }
    }

    gettimeofday(&start, NULL);

    CNRT_CHECK(cnrtMemcpy(pA, A, sizeA * sizeof(half), CNRT_MEM_TRANS_DIR_HOST2DEV));
    CNRT_CHECK(cnrtMemcpy(pB, B, sizeB * sizeof(half), CNRT_MEM_TRANS_DIR_HOST2DEV));

    MatrixMul<<<dim, ktype, queue>>>(pA, pB, pC, M, N, P);

    CNRT_CHECK(cnrtSyncQueue(queue));
    CNRT_CHECK(cnrtMemcpy(mlu_result, pC, sizeC * sizeof(half), CNRT_MEM_TRANS_DIR_DEV2HOST));

    gettimeofday(&end, NULL);
    /* Use both tv_sec and tv_usec: the microsecond field alone wraps every
     * second, which made samples that crossed a second boundary negative. */
    double elapsed_ms = (double)(end.tv_sec - start.tv_sec) * 1000.0 +
                        (double)(end.tv_usec - start.tv_usec) / 1000.0;
    total_time += elapsed_ms;
  }

  printf("MLU Total Time: %f ms\n", total_time);

  CNRT_CHECK(cnrtFree(pA));
  CNRT_CHECK(cnrtFree(pB));
  CNRT_CHECK(cnrtFree(pC));
  CNRT_CHECK(cnrtDestroyQueue(queue));

  cnrtDestroy();
  return 0;
}