已解决上述问题,但在 MLU270 上生成的离线模型在 MLU220 芯片盒子上推理失败:
CNRT: 4.7.12 03ea1d9
2022-04-19 17:31:13.735909: [cnrtWarning] [22803] [Card : 0] The device you run is not the same as the platform in kernel!
2022-04-19 17:31:13.736012: [cnrtWarning] [22803] [Card : 0] The device you run is MLU220, but the platform in kernel is MLU270!
2022-04-19 17:31:13.736065: [cnrtError] [22803] [Card : 0] Model is compiled for MLU270, now device is MLU220
2022-04-19 17:31:13.736116: [cnrtError] [22803] [Card : 0] [../src/algorithem_test.cpp:…] CNRT error, code=632020(Unsupported operation.) "cnrtInitRuntimeContext(ctx, NULL)"
2022-04-19 17:31:13.736172: [cnrtError] [22803] [Card : 0] Runtime context is not initialized yet.
2022-04-19 17:31:13.736220: [cnrtError] [22803] [Card : 0] [../src/algorithem_test.cpp:299] CNRT error, code=632024(Uninitialized.) "cnrtRuntimeContextCreateQueue(ctx, &queue)"
input shape:
1 640 480 3
output shape:
1 1 1 7232
ccccc:960
2022-04-19 17:31:13.877099: [cnrtError] [22803] [Card : 0] Runtime context is not initialized yet.
2022-04-19 17:31:13.877214: [cnrtError] [22803] [Card : 0] [../src/algorithem_test.cpp:449] CNRT error, code=632024(Uninitialized.) "cnrtRuntimeContextCreateNotifier(ctx, &notifi
2022-04-19 17:31:13.877275: [cnrtError] [22803] [Card : 0] Runtime context is not initialized yet.
2022-04-19 17:31:13.877323: [cnrtError] [22803] [Card : 0] [../src/algorithem_test.cpp:450] CNRT error, code=632024(Uninitialized.) "cnrtRuntimeContextCreateNotifier(ctx, &notifi
2022-04-19 17:31:13.877386: [cnrtError] [22803] [Card : 0] [../src/algorithem_test.cpp:451] CNRT error, code=632034(Failure on notifier operation.) "cnrtPlaceNotifier(notifier_st
2022-04-19 17:31:13.877445: [cnrtError] [22803] [Card : 0] Runtime context is not initialized yet.
2022-04-19 17:31:13.877492: [cnrtError] [22803] [Card : 0] [../src/algorithem_test.cpp:458] CNRT error, code=632024(Uninitialized.) "cnrtInvokeRuntimeContext_V2(ctx, nullptr, parm)"
2022-04-19 17:31:13.877561: [cnrtError] [22803] [Card : 0] [../src/algorithem_test.cpp:459] CNRT error, code=632034(Failure on notifier operation.) "cnrtPlaceNotifier(notifier_en
2022-04-19 17:31:13.877635: [cnrtError] [22803] [Card : 0] MLU unfinished. cnrtStream fail.
2022-04-19 17:31:13.877682: [cnrtError] [22803] [Card : 0] the mlu device is unknown or doesn't exist.
代码:
from pathlib import Path
import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
import argparse
import time
from models.experimental import attempt_load
from utils.datasets import LoadStreams, LoadImages
from utils.general import check_img_size, non_max_suppression, apply_classifier, scale_coords, xyxy2xywh, \
strip_optimizer, set_logging, increment_path
from utils.plots import plot_one_box
from utils.torch_utils import select_device, load_classifier, time_synchronized
import os
import torch_mlu
import torch_mlu.core.mlu_quantize as mlu_quantize
import torch_mlu.core.mlu_model as ct
import numpy as np
# Module-level defaults. Neither name is read by the functions visible in this
# file: detect() passes a literal to ct.set_core_number and get_boxes() has its
# own batch_size default.
batchsize, corenum = 1, 1
def get_boxes(prediction, batch_size=1, img_size=640):
    """Decode the flat detection buffer returned by the MLU model into boxes.

    The buffer is read as follows: element 0 holds the number of boxes, box
    records start at element 64, and every record is 7 consecutive values.
    Inside a record, value 0 is the batch index and values 3..6 are the two
    box corners (the caller draws them as ``(v3, v4)`` - ``(v5, v6)``).
    NOTE(review): values 2 and 1 are presumably the confidence score and the
    class id -- confirm against the detection-output layer documentation.

    Args:
        prediction: Tensor with the raw detection output. Any shape is
            accepted; it is flattened before decoding.
        batch_size: Number of images in the batch. Records whose batch index
            is outside ``[0, batch_size)`` are skipped.
        img_size: Unused; kept so existing callers keep working.

    Returns:
        A list with ``batch_size`` float64 arrays of shape ``(k, 6)``; each
        row is ``[v3, v4, v5, v6, v2, v1]`` of one kept record. Records with
        non-positive width or height are dropped.
    """
    header_len = 64  # offset of the first box record in the flat buffer
    record_len = 7   # values per box record

    # One bulk transfer to Python floats instead of one .item() call per value.
    flat = torch.reshape(prediction, (-1,)).tolist()
    num_boxes_final = flat[0] if flat else 0
    print('num_boxes_final: ',num_boxes_final)

    # Never read past the end of the buffer, even if the reported count is
    # larger than the number of complete records actually present.
    capacity = max(0, (len(flat) - header_len) // record_len)
    all_list = [[] for _ in range(batch_size)]
    for i in range(min(int(num_boxes_final), capacity)):
        start = header_len + i * record_len
        batch_raw, v1, v2, x1, y1, x2, y2 = flat[start:start + record_len]
        batch_idx = int(batch_raw)
        if not 0 <= batch_idx < batch_size:
            continue
        if x2 - x1 > 0 and y2 - y1 > 0:
            all_list[batch_idx].extend((x1, y1, x2, y2, v2, v1))

    return [np.array(rows).reshape(-1, 6) for rows in all_list]
def detect(save_img=False):
    """Run YOLOv5 inference on ``opt.source`` in the mode chosen by ``opt.cfg``.

    Reads its configuration from the module-level ``opt`` namespace.

    Modes:
        ``'cpu'``: run the loaded model directly on every image.
        ``'qua'``: wrap the loaded model with ``quantize_dynamic_mlu`` (int8,
            ``gen_quant=True``), run it on every image and save its state
            dict to ``yolov5sv4_head_int8_20220419.pt``.
        ``'mlu'``: build the model from ``models/yolov5s.yaml``, load the
            int8 state dict, move it to the MLU device and, when ``opt.jit``
            is truthy, enable offline-model saving and trace the network.
            For every image the output is decoded with ``get_boxes``, the box
            corners are written to ``yolov5s_mlu_output.txt`` and an
            annotated ``mlu_out_<name>.jpg`` is saved.

    Args:
        save_img: Unused; kept for interface compatibility.
    """
    source, weights, save_txt, imgsz = opt.source, opt.weights, opt.save_txt, opt.img_size

    # Directories
    save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

    # Initialize
    device = select_device(opt.device)
    half = device.type != 'cpu'  # half precision is only enabled off-CPU

    # Load model
    model = attempt_load(weights, map_location=device)  # load FP32 model
    imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size
    if half:
        model.half()  # to FP16

    # Second-stage classifier (disabled)
    classify = False
    if classify:
        modelc = load_classifier(name='resnet101', n=2)  # initialize
        # load_state_dict() does not return the module, so it cannot be
        # chained with .to()/.eval().
        modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model'])
        modelc.to(device).eval()

    # Only still images / directories are handled by this script.
    dataset = LoadImages(source, img_size=imgsz)

    # Both names are published as module globals, as before.
    global quantized_model
    global quantized_net
    if opt.cfg == 'qua':
        qconfig = {'iteration':2,'firstconv':False}
        quantized_model = mlu_quantize.quantize_dynamic_mlu(model, qconfig, dtype='int8', gen_quant=True)
    elif opt.cfg == 'mlu':
        from models.yolo import Model
        model = Model("models/yolov5s.yaml").to(torch.device("cpu"))
        model.float().fuse().eval()
        quantized_net = torch_mlu.core.mlu_quantize.quantize_dynamic_mlu(model)
        state_dict = torch.load("yolov5sv4_head_int8_20220419.pt")
        quantized_net.load_state_dict(state_dict, strict=False)
        quantized_net.eval()
        quantized_net.to(ct.mlu_device())
        print("load success!", ct.mlu_device())
        if opt.jit:
            # NOTE(review): the offline model is compiled for whichever core
            # version torch_mlu is configured for at this point; loading it on
            # a different chip fails in cnrtInitRuntimeContext (see the
            # MLU270-vs-MLU220 log at the top of this file).
            ct.save_as_cambricon("yolov5_v4_head")
            torch.set_grad_enabled(False)
            ct.set_core_number(4)
            trace_input = torch.randn(1, 3, 640, 640, dtype=torch.float)
            trace_input = trace_input.to(ct.mlu_device())
            quantized_net = torch.jit.trace(quantized_net, trace_input, check_trace = False)

    # Warm-up pass, skipped on CPU.
    img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
    _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once

    for path, img, im0s, vid_cap in dataset:
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference
        if opt.cfg == 'cpu':
            pred = model(img, augment=opt.augment)[0]
            print('run cpu')
        elif opt.cfg == 'qua':
            pred = quantized_model(img)[0]
            # Re-saved after every image, so the file always reflects the
            # state after the most recent forward pass.
            torch.save(quantized_model.state_dict(), 'yolov5sv4_head_int8_20220419.pt')
            print('run qua')
            print(pred)
        elif opt.cfg == 'mlu':
            # A single host-to-device transfer is enough.
            img = img.type(torch.HalfTensor).to(ct.mlu_device())
            pred = quantized_net(img)[0]
            pred = pred.data.cpu().type(torch.FloatTensor)
            print("pred_shape", pred.shape)
            box_result = get_boxes(pred)
            print("im0s.shape:", im0s.shape)
            print(box_result)
            res = box_result[0].tolist()
            with open("yolov5s_mlu_output.txt", "w+") as f:
                for pt in sorted(res, key=lambda x: (x[0], x[1])):
                    f.write("{}\n{}\n{}\n{}\n".format(pt[0], pt[1], pt[2], pt[3]))
                    cv2.rectangle(im0s, (int(pt[0]), int(pt[1])), (int(pt[2]), int(pt[3])), (255, 0, 0), 2)
            cv2.imwrite("mlu_out_{}.jpg".format(os.path.basename(path).split('.')[0]), im0s)
            print('run mlu')
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a command-line boolean.

        argparse's ``type=bool`` treats every non-empty string (including
        ``"False"``) as True, so the text is interpreted explicitly instead.
        """
        return str(value).strip().lower() in ('1', 'true', 't', 'yes', 'y', 'on')

    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, help='model.pt path(s)')
    parser.add_argument('--source', type=str, default="data/images/bus.jpg", help='source')  # file/folder, 0 for webcam
    parser.add_argument('--cfg', default='cpu',
                        help="run mode: 'cpu' (float model), 'qua' (generate int8 quantized weights) "
                             "or 'mlu' (run the quantized model on the MLU)")
    # Accepts `--jit`, `--jit True` and `--jit False`; the last one is now
    # really False.
    parser.add_argument('--jit', nargs='?', const=True, type=_str2bool,
                        help='trace the MLU model and save it as an offline model')
    # An offline model only runs on the chip it was compiled for, so the
    # target has to be selectable when compiling on a different card.
    parser.add_argument('--core-version', default=None,
                        help='MLU core version to compile for, e.g. MLU220 or MLU270 '
                             '(default: leave the torch_mlu setting unchanged)')
    parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='IOU threshold for NMS')
    parser.add_argument('--device', default='cpu', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', action='store_true',default=True, help='display results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --class 0, or --class 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--update', action='store_true', help='update all models')
    parser.add_argument('--project', default='runs/detect', help='save results to project/name')
    parser.add_argument('--name', default='exp', help='save results to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    opt = parser.parse_args()
    print(opt)

    if opt.core_version:
        # Must happen before the model is traced/compiled inside detect().
        ct.set_core_version(opt.core_version)

    with torch.no_grad():
        if opt.update:  # update all models (to fix SourceChangeWarning)
            for opt.weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt']:
                detect()
                strip_optimizer(opt.weights)
        else:
            detect()
请登录后评论