#include #include #include #include #include #include #include #include "cnrt.h" #define PATH_MAX_LENGTH 1024 std::string GetExePath() { char path[PATH_MAX_LENGTH]; int cnt = readlink("/proc/self/exe", path, PATH_MAX_LENGTH); if (cnt < 0 || cnt >= PATH_MAX_LENGTH) { return ""; } if (path[cnt - 1] == '/') { path[cnt - 1] = '\0'; } else { path[cnt] = '\0'; } std::string result(path); return std::string(path).substr(0, result.find_last_of('/') + 1); } constexpr const char *gmodel_path_220 = "../../tests/data/resnet18_220.cambricon"; constexpr const char *gmodel_path_270 = "../../tests/data/yolov3_offline.cambricon"; constexpr const char *image_path_1 = "../../tests/data/000000000785.jpg"; constexpr const char *image_path_2 = "../../tests/data/500x500.jpg"; constexpr const char *fname = "../../tests/data/yolov3_4b4c_simple.cambricon"; void loadmodel(cnrtModel_t &model, const char *fname) { std::cout << "load model" << std::endl; cnrtLoadModel(&model, fname); } int main(int argc, char *argv[]) { std::string function_name = "subnet0"; try { cnrtInit(0); cnrtDeviceInfo_t device_info; cnrtGetDeviceInfo(&device_info, 0); cnrtCoreVersion_t core_version = device_info.core_version; printf("mlp offline test\n"); if (core_version != CNRT_MLU270) { std::cout << "the device is not MLU270" << std::endl; } std::string exe_path = GetExePath(); std::string model_path = exe_path + fname; printf("fname: %s\n", fname); // load model cnrtModel_t model; cnrtLoadModel(&model, model_path.c_str()); // loadmodel(model, fname); cnrtDev_t dev; cnrtGetDeviceHandle(&dev, 0); cnrtSetCurrentDevice(dev); // get model total memory int64_t totalMem; cnrtGetModelMemUsed(model, &totalMem); printf("total memory used: %ld Bytes\n", totalMem); // get model parallelism int model_parallelism; cnrtQueryModelParallelism(model, &model_parallelism); printf("model parallelism: %d.\n", model_parallelism); int func_num = 0; cnrtGetFunctionNumber(model, &func_num); for (int i = 0; i < func_num; i++) { char *symbol; int symbol_len; cnrtGetFunctionSymbol(model, i, &symbol, &symbol_len); std::cout << "symbol name is:" << symbol << std::endl; } // load extract function cnrtFunction_t function; cnrtCreateFunction(&function); cnrtExtractFunction(&function, model, "subnet0"); bool is_cache_model = true; int cache_num = 0; cnrtQueryCacheMode(function, &is_cache_model, &cache_num); int batch_sizes[cache_num]; cnrtQueryBatches(function, batch_sizes); int inputNum, outputNum; int64_t *inputSizeS, *outputSizeS; cnrtGetInputDataSize(&inputSizeS, &inputNum, function); cnrtGetOutputDataSize(&outputSizeS, &outputNum, function); // cnrtParamDescArray_t input_param_descs, output_param_descs; // cnrtCreateParamDescArray(&input_param_descs, inputNum); // cnrtCreateParamDescArray(&output_param_descs, outputNum); // cnrtInferFunctionOutputShape(function, inputNum, input_param_descs, outputNum, output_param_descs); cnrtDataType_t *inputDataTypes; cnrtGetInputDataType(&inputDataTypes, &inputNum, function); printf("the input DataType is %d\n", inputDataTypes[0]); int *dimvalues; int dimsize; cnrtGetInputDataShape(&dimvalues, &dimsize, 0, function); for (int i = 0; i < dimsize; i++) { printf("the dimindex is: %d, the dimvalue is: %d\n", i, dimvalues[i]); } printf("\n"); // prepare data on cpu void **inputCpuPtrS = (void **)malloc(inputNum * sizeof(void *)); void **outputCpuPtrS = (void **)malloc(outputNum * sizeof(void *)); // allocate I/O data memory on MLU void **inputMluPtrS = (void **)malloc(inputNum * sizeof(void *)); void **outputMluPtrS = (void **)malloc(outputNum * sizeof(void *)); std::string image_path = exe_path + image_path_1; cv::Mat img = cv::imread(image_path, cv::IMREAD_COLOR); std::cout << "img depth" << img.depth() << " channel " << img.channels() << std::endl; cv::resize(img, img, cv::Size(416, 416)); cv::cvtColor(img, img, cv::COLOR_BGR2RGBA); // img.convertTo(img, CV_32FC4); std::cout << "img depth" << img.depth() << " channel " << img.channels() << "type: " << img.type() << std::endl; printf("pointer start is %p,end is %p/n", img.datastart, img.dataend); size_t input_img_size = img.size().width * img.size().height * img.channels(); printf("[LOG] Copied image buf size = %lu\n", input_img_size); printf("[LOG] cpuinputs buf size = %lu cpuoutput buf size is %lu\n", inputSizeS[0], outputSizeS[0]); // std::string image_path2 = exe_path + image_path_2; // cv::Mat img2 = cv::imread(image_path2, cv::IMREAD_COLOR); // std::cout << "img depth" << img2.depth() << " channel " << img2.channels() << std::endl; // cv::resize(img2, img2, cv::Size(416, 416)); // cv::cvtColor(img2, img2, cv::COLOR_BGR2RGBA); std::cout << "input memcpy" << std::endl; for (int i = 0; i < inputNum; i++) { // converts data format when using new interface model inputCpuPtrS[i] = malloc(inputSizeS[i]); memcpy(inputCpuPtrS[0], (void *)img.data, input_img_size); // memcpy((void *)((char *)inputCpuPtrS[0] + input_img_size), (void *)img2.data, input_img_size); cnrtMalloc(&(inputMluPtrS[i]), inputSizeS[i]); cnrtMemcpy(inputMluPtrS[i], inputCpuPtrS[i], inputSizeS[i], CNRT_MEM_TRANS_DIR_HOST2DEV); } // prepare output buffer for (int i = 0; i < outputNum; i++) { outputCpuPtrS[i] = malloc(outputSizeS[i]); // malloc mlu memory cnrtMalloc(&(outputMluPtrS[i]), outputSizeS[i]); cnrtMemset(outputMluPtrS[i], 0, outputSizeS[i]); } // prepare parameters for input/output buffers void **param = (void **)malloc(sizeof(void *) * (inputNum + outputNum)); for (int i = 0; i < inputNum; ++i) { param[i] = inputMluPtrS[i]; } for (int i = 0; i < outputNum; ++i) { param[inputNum + i] = outputMluPtrS[i]; } std::cout << "call cnrtCreateRuntimeContext" << std::endl; // setup runtime ctx cnrtRuntimeContext_t ctx; cnrtCreateRuntimeContext(&ctx, function, NULL); std::cout << "call cnrtInitRuntimeContext" << std::endl; // bind device cnrtSetRuntimeContextDeviceId(ctx, 0); cnrtInitRuntimeContext(ctx, NULL); std::cout << "call cnrtRuntimeContextCreateQueue" << std::endl; // compute offline cnrtQueue_t queue; cnrtRuntimeContextCreateQueue(ctx, &queue); std::cout << "call cnrtInvokeRuntimeContext_V3" << std::endl; // invoke // cnrtInvokeRuntimeContext(ctx, param, queue, NULL); // int batch_size = 1; // *batch_sizes = 1; int batch_array[inputNum] = {1}; // u32_t affinity = 0x01; // cnrtInvokeParam_t invoke_param; // invoke_param.cluster_affinity.affinity = &affinity; // invoke_param.invoke_param_type = CNRT_INVOKE_PARAM_TYPE_0; cnrtRet_t ret = cnrtInvokeRuntimeContext_V3(ctx, batch_array, param, queue, NULL); if (ret != CNRT_RET_SUCCESS) { std::cout << "call cnrtInvokeRuntimeContext_V3 error, the return code is:" << ret << std::endl; } // cnrtInvokeRuntimeContext(ctx, param, queue, NULL); std::cout << "call cnrtSyncQueue" << std::endl; // sync ret = cnrtSyncQueue(queue); if (ret != CNRT_RET_SUCCESS) { std::cout << "call cnrtSyncQueue error, the return code is:" << ret << std::endl; } std::cout << "copy mlu result to cpu" << std::endl; // copy mlu result to cpu for (int i = 0; i < outputNum; i++) { cnrtMemcpy(outputCpuPtrS[i], outputMluPtrS[i], outputSizeS[i], CNRT_MEM_TRANS_DIR_DEV2HOST); } { uint16_t *output_bytes = reinterpret_cast(outputCpuPtrS)[0]; puts("\033[32m"); std::cout << "infer end" << std::endl; for (int i = 0; i < 1000; i++) { float tmp = 0.0; cnrtConvertHalfToFloat(&tmp, output_bytes[i]); printf("%.3f ", tmp); } puts("\033[0m"); } // free memory space for (int i = 0; i < inputNum; i++) { free(inputCpuPtrS[i]); cnrtFree(inputMluPtrS[i]); } for (int i = 0; i < outputNum; i++) { free(outputCpuPtrS[i]); cnrtFree(outputMluPtrS[i]); } free(inputCpuPtrS); free(outputCpuPtrS); free(param); cnrtDestroyQueue(queue); cnrtDestroyRuntimeContext(ctx); cnrtDestroyFunction(function); cnrtUnloadModel(model); cnrtDestroy(); } catch (std::exception &err) { std::cout << err.what() << std::endl; } return 0; }