切换版块
×
基础软件平台
PyTorch开发
TensorFlow开发
编解码及SDK开发
BANG语言与计算库
开发工具链
MagicMind开发
软件栈百科
云平台集成
硬件产品专区
大模型专区
寒武纪® AIDC® MLU370®系列加速卡
MLU200系列AI加速产品
经验方案交流区
经验案例与实践分享
开发者服务
开发者活动
公告与版务
高校支持
《智能计算系统》
签到
版块
社区
文档
SDK下载
370系列
200系列
开发平台
官网首页
注册
登录
全部版块
基础软件平台
硬件产品专区
经验方案交流区
开发者服务
高校支持
发布新帖
登录/注册
LV.1
zwb
115
积分
2
赞
14
帖子
75
回复
2
收藏
TA的动态
TA的帖子
TA的回复
yolov5离线推理时的问题
我的回复:因为只有mlu220边缘设备,所以无法看到融合模式结果,只能生成离线模型,对于同一张图在batchsize=4的情况下,输出的boxnumber分别为56,27,0,0,为什么会出现检测框数目不一样的情况呢,后处理部分代码如下void writeVisualizeBBox( const vector<cv::Mat>& images, const vector<vector<vector<float>>> detections, const vector<string>& labelToDisplayName, const vector<string>& imageNames, int input_dim) { // Retrieve detections. for (int i = 0; i < images.size(); ++i) { if (imageNames[i] == "null") continue; cv::Mat image; image = images[i]; vector<vector<float>> result = detections[i]; std::string name = imageNames[i]; int positionMap = imageNames[i].rfind("/"); if (positionMap > 0 && positionMap < imageNames[i].size()) { name = name.substr(positionMap + 1); } positionMap = name.find("."); if (positionMap > 0 && positionMap < name.size()) { name = name.substr(0, positionMap); } string filename = "result/"+name + ".txt"; std::ofstream fileMap(filename); float scaling_factors = std::min( static_cast<float>(input_dim) / static_cast<float>(images[i].cols), static_cast<float>(input_dim) / static_cast<float>(images[i].rows)); for (int j = 0; j < result.size(); j++) { result[j][0] = result[j][0] - static_cast<float>(input_dim - scaling_factors * image.cols) / 2.0; result[j][2] = result[j][2] - static_cast<float>(input_dim - scaling_factors * image.cols) / 2.0; result[j][1] = result[j][1] - static_cast<float>(input_dim - scaling_factors * image.rows) / 2.0; result[j][3] = result[j][3] - static_cast<float>(input_dim - scaling_factors * image.rows) / 2.0; for (int k = 0; k < 4; k++) { result[j][k] = result[j][k] / scaling_factors; // cout << result[j][k] << " "; } // cout << endl; } for (int j = 0; j < result.size(); j++) { result[j][0] = result[j][0] < 0 ? 0 : result[j][0]; result[j][2] = result[j][2] < 0 ? 0 : result[j][2]; result[j][1] = result[j][1] < 0 ? 0 : result[j][1]; result[j][3] = result[j][3] < 0 ? 0 : result[j][3]; result[j][0] = result[j][0] > image.cols ? image.cols : result[j][0]; result[j][2] = result[j][2] > image.cols ? 
image.cols : result[j][2]; result[j][1] = result[j][1] > image.rows ? image.rows : result[j][1]; result[j][3] = result[j][3] > image.rows ? image.rows : result[j][3]; } for (int j = 0; j < result.size(); j++) { int x0 = static_cast<int>(result[j][0]); int y0 = static_cast<int>(result[j][1]); int x1 = static_cast<int>(result[j][2]); int y1 = static_cast<int>(result[j][3]); //cout << "(" <<x0 << "," << y0 << ")" << "(" << y0 << "," << y1 << ")" << endl; cv::Point p1(x0, y0); cv::Point p2(x1, y1); cv::rectangle(image, p1, p2, cv::Scalar(0, 0, 255), 2); stringstream ss; ss << round(result[j][4] * 1000) / 1000.0; std::string str = labelToDisplayName[static_cast<int>(result[j][5])] + ":" + ss.str(); //std::string str = ss.str(); cv::Point p5(x0, y0 - 2); // cv::rectanle(image, p1, p2, ) cv::putText(image, str, p5, cv::FONT_HERSHEY_SIMPLEX, 0.6, cv::Scalar(255, 0, 0), 2); //std::cout << "it is " << result[j][5] << ":" << result[j][4] << std::endl; fileMap << labelToDisplayName[static_cast<int>(result[j][5])] << " " << ss.str() << " " << static_cast<float>(result[j][0]) << " " << static_cast<float>(result[j][1]) << " " << static_cast<float>(result[j][2]) << " " << static_cast<float>(result[j][3]) << " " << image.cols << " " << image.rows << std::endl; } fileMap.close(); stringstream ss; string outFile; ss << "result/yolov5_" << name << ".jpg"; ss >> outFile; cv::imwrite(outFile.c_str(), image); } } void readLabels(string filename, vector<string>& labels) { std::ifstream file(filename); if (file.fail()) std::cerr << "failed to open labels file!"; std::string line; while (getline(file, line)) { labels.push_back(line); } file.close(); } using std::vector; using std::string; /*vector<vector<vector<float>>> getResults(float* outputData, int dimNumm, int *dimValues) { vector<vector<vector<float>>> detections; // BangOp implementation float max_limit = 1; float min_limit = 0; float input_size = 640; int batchSize = dimValues[0]; int count = dimValues[3]; for (int i = 0; i < 
batchSize; i++) { int num_boxes = static_cast<int>(outputData[i * count]); vector<vector<float>> batch_box; for (int k = 0; k < num_boxes; k++) { int index = i * count + 64 + k * 7; vector<float> single_box; float bl = std::max( min_limit, std::min(max_limit, outputData[index + 3]/input_size)); // x1 float br = std::max( min_limit, std::min(max_limit, outputData[index + 5]/input_size)); // x2 float bt = std::max( min_limit, std::min(max_limit, outputData[index + 4]/input_size)); // y1 float bb = std::max( min_limit, std::min(max_limit, outputData[index + 6]/input_size)); // y2 single_box.push_back(bl); single_box.push_back(bt); single_box.push_back(br); single_box.push_back(bb); single_box.push_back(outputData[index + 2]); single_box.push_back(outputData[index + 1]); for(auto s:single_box) cout << s << " "; cout << endl; if ((br - bl) > 0 && (bb - bt) > 0) { batch_box.push_back(single_box); } } detections.push_back(batch_box); } return detections; }*/ vector<vector<vector<float>>> getResults(float* outputData, int dimNumm, int *dimValues) { vector<vector<vector<float>>> detections; int batchSize = dimValues[0]; int count = dimValues[3]; for (int i = 0; i < batchSize; i++) { int num_boxes = static_cast<int>(outputData[i * count]); vector<vector<float>> batch_box; for (int k = 0; k < num_boxes; k++) { int index = i * count + 64 + k * 7; vector<float> single_box; float bl = outputData[index + 3]; float br = outputData[index + 5]; float bt = outputData[index + 4]; float bb = outputData[index + 6]; single_box.push_back(bl); single_box.push_back(bt); single_box.push_back(br); single_box.push_back(bb); single_box.push_back(outputData[index + 2]); single_box.push_back(outputData[index + 1]); // for(auto s:single_box) // cout << s << " "; // cout << endl; if ((br - bl) > 0 && (bb - bt) > 0) { batch_box.push_back(single_box); } } detections.push_back(batch_box); } return detections; }
1
pytorch yolov3移植
我的回复:#1 lanhao 回复:参见软件栈中的yolov3样例 cambricon_pytorch/pytorch/examples/offline/c++/yolov3 [展开] 您好,请问官方有转换yolov3离线模型的相关样例吗?
0
pytorch yolov3移植
我的回复:def mlu_forward(imgfile, use_mlu=True): ct.set_core_version("MLU220") #转换 MLU220 的离线模型,这个离线模型只能在MLU220的核心运行 # ct.set_core_version("MLU270") #转换 MLU270 的模型,可以在仿真环境下运行(有MLU270显卡) ct.set_device(-1) ct.set_core_number(4) # set core number #设置离线实际运行,NPU用几个核心来运行这个模型,一般选4,性能上比较好 if use_mlu: device = ct.mlu_device() else: device = torch.device('cpu') Img = cv2.imread(imgfile) #读入图片 img = Image.fromarray(Img).convert('RGB') #转换格式 img = cv2.resize(np.array(img),(64,128)) #缩放在模型输入大小 input_w = img.shape[0] input_h = img.shape[1] batch_size = 1 fusion = 1 with torch.no_grad(): model = Net(reid = True) #构建网络 model.eval().float() if use_mlu: model = mlu_quantize.quantize_dynamic_mlu(model) model.load_state_dict(torch.load(save_path),strict=False) model.to(device) ''' if not use_mlu: sized = ((img / 255) - np.array(mean)) / np.array(std) sized = np.transpose(sized, (2, 0, 1)) ''' sized = ((np.array(img) / 255) - np.array(mean)) / np.array(std) sized = sized.transpose((2, 0, 1)) sized = sized.astype(np.float32) input_img = torch.from_numpy(np.stack([sized]*batch_size)) input_img = input_img.to(device) logging.info(input_img.shape) # fusion mode if use_mlu and fusion: #进行算法融合并转换成离线模型 ct.save_as_cambricon('deepsort') example_tensor = torch.randn((4,3,128,64), dtype=torch.float).to(device) #C, D, H, W model = torch.jit.trace(model, example_tensor, check_trace=False) out = model(example_tensor) ct.save_as_cambricon('') # generate offline model print('fusion success')
0
pytorch yolov3移植
我的回复:#3 lanhao 回复:pytorch没有mlu270真正运行是怎么生成离线模型的呢?如果生成了离线模型,判断模型精度的方式和在线没什么区别。 [展开] def mlu_forward(imgfile, use_mlu=True): ct.set_core_version("MLU220") #转换 MLU220 的离线模型,这个离线模型只能在MLU220的核心运行 # ct.set_core_version("MLU270") #转换 MLU270 的模型,可以在仿真环境下运行(有MLU270显卡) ct.set_device(-1) ct.set_core_number(4) # set core number #设置离线实际运行,NPU用几个核心来运行这个模型,一般选4,性能上比较好 if use_mlu: device = ct.mlu_device() else: device = torch.device('cpu') Img = cv2.imread(imgfile) #读入图片 img = Image.fromarray(Img).convert('RGB') #转换格式 img = cv2.resize(np.array(img),(64,128)) #缩放在模型输入大小 input_w = img.shape[0] input_h = img.shape[1] batch_size = 1 fusion = 1 with torch.no_grad(): model = Net(reid = True) #构建网络 model.eval().float() if use_mlu: model = mlu_quantize.quantize_dynamic_mlu(model) model.load_state_dict(torch.load(save_path),strict=False) model.to(device) ''' if not use_mlu: sized = ((img / 255) - np.array(mean)) / np.array(std) sized = np.transpose(sized, (2, 0, 1)) ''' sized = ((np.array(img) / 255) - np.array(mean)) / np.array(std) sized = sized.transpose((2, 0, 1)) sized = sized.astype(np.float32) input_img = torch.from_numpy(np.stack([sized]*batch_size)) input_img = input_img.to(device) logging.info(input_img.shape) # fusion mode if use_mlu and fusion: #进行算法融合并转换成离线模型 ct.save_as_cambricon('deepsort') example_tensor = torch.randn((4,3,128,64), dtype=torch.float).to(device) #C, D, H, W model = torch.jit.trace(model, example_tensor, check_trace=False) out = model(example_tensor) ct.save_as_cambricon('') # generate offline model print('fusion success') 采用这种方法在主机上生成的离线模型,然后放到mlu220的盒子设备上推理,因为在主机上没有mlu270设备,所以看不到结果,只有一个cambricon。这样生成的离线模型是正确的吗?
0
pytorch yolov3移植
我的回复:#1 lanhao 回复:生成.cambricon后缀的离线模型后,在mlu220编写离线代码进行推理即可。保存220离线模型时在线融合推理结果不对是正常现象,能够保证在线融合的结果没问题就行了。 [展开] 您好,那如果没有mlu270等设备在线推理,只有mlu220的设备,如何判断生成的离线模型是没有问题的呢?
0
上一页
1
2
3
下一页
Github
开发平台
文档中心
新手必读
官方微信
版权所有 © 2025 寒武纪 Cambricon.com 备案/许可证号:
京ICP备17003415号-1
关闭