diff --git a/tools/demo.py b/tools/demo.py
index b16598d..2e7cb76 100644
--- a/tools/demo.py
+++ b/tools/demo.py
@@ -10,14 +10,28 @@ from loguru import logger
 import cv2
 
 import torch
-
+import numpy as np
 from yolox.data.data_augment import ValTransform
 from yolox.data.datasets import COCO_CLASSES
 from yolox.exp import get_exp
 from yolox.utils import fuse_model, get_model_info, postprocess, vis
 
+
+import torch_mlu
+import torch_mlu.core.mlu_model as ct
+import torch_mlu.core.mlu_quantize as mlu_quantize
+
 IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"]
 
 
+def quantize(model, imgs, quant_type, quant_model_name):
+    # Generate a quantized copy of the model and save its state dict.
+    mean = [0, 0, 0]
+    std = [1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0]
+    qconfig = {'iteration': 1, 'mean': mean, 'std': std, 'firstconv': True}
+    quantized_model = mlu_quantize.quantize_dynamic_mlu(
+        model, qconfig, dtype=quant_type, gen_quant=True)
+    for img in imgs:
+        pred = quantized_model(img)
+    torch.save(quantized_model.state_dict(), quant_model_name)
+
 
 def make_parser():
     parser = argparse.ArgumentParser("YOLOX Demo!")
@@ -27,6 +41,9 @@ def make_parser():
     parser.add_argument("-expn", "--experiment-name", type=str, default=None)
     parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+    parser.add_argument("-q", "--quant", type=str, default=None, help="model quant type")
+    parser.add_argument("-cn", "--core_num", type=str, default=None, help="mlu core number")
+    parser.add_argument("-md", "--mlu_device", type=str, default=None, help="mlu device: 270 or 220")
 
     parser.add_argument(
         "--path", default="./assets/dog.jpg", help="path to images or video"
     )
@@ -155,15 +172,74 @@ class Predictor(object):
         with torch.no_grad():
             t0 = time.time()
-            outputs = self.model(img)
-            if self.decoder is not None:
-                outputs = self.decoder(outputs, dtype=outputs.type())
-            outputs = postprocess(
-                outputs, self.num_classes, self.confthre,
+            # CPU reference inference
+            cpu_outputs = self.model(img)
+            self.model.head.decode_outputs(cpu_outputs, dtype=cpu_outputs.type())
+            # if self.decoder is not None:
+            #     cpu_outputs = self.decoder(cpu_outputs, dtype=cpu_outputs.type())
+            cpu_outputs = postprocess(
+                cpu_outputs, self.num_classes, self.confthre,
+                self.nmsthre, class_agnostic=True
+            )
+            logger.info("CPU Infer time: {:.4f}s".format(time.time() - t0))
+            print("cpu outputs: ", cpu_outputs[0])
+
+            # MLU layer-by-layer inference
+            quant_path = args.name + "_" + args.quant + ".pth"
+            # quantize the model and dump the quantized weights
+            quantize(self.model, [img], args.quant, quant_path)
+            state_dict = torch.load(quant_path)
+            ct.set_core_number(int(args.core_num))
+            ct.set_core_version(args.mlu_device.upper())
+            quantized_net = torch_mlu.core.mlu_quantize.quantize_dynamic_mlu(self.model)
+            quantized_net.load_state_dict(state_dict, strict=False)
+            quantized_net.eval()
+            quantized_net.to(ct.mlu_device())
+            t1 = time.time()
+            pred = quantized_net(img.half().to(ct.mlu_device()))
+            mlu_outputs = pred.cpu().float()
+            self.model.head.decode_outputs(mlu_outputs, dtype=mlu_outputs.type())
+            # if self.decoder is not None:
+            #     mlu_outputs = self.decoder(mlu_outputs, dtype=mlu_outputs.type())
+            mlu_outputs = postprocess(
+                mlu_outputs, self.num_classes, self.confthre,
                 self.nmsthre, class_agnostic=True
             )
-            logger.info("Infer time: {:.4f}s".format(time.time() - t0))
-        return outputs, img_info
+            logger.info("MLU by-layer Infer time: {:.4f}s".format(time.time() - t1))
+            print("mlu output: ", mlu_outputs[0])
+            # sanity check: compare MLU result against the CPU reference
+            np.allclose(mlu_outputs[0].detach().numpy(), cpu_outputs[0].detach().numpy())
+
+            # save cambricon offline model
+            if True:
+                ct.save_as_cambricon("./offline_models/" + args.name + "_" + args.quant + "_core" + args.core_num + "_" + args.mlu_device)
+                torch.set_grad_enabled(False)
+                trace_input = torch.randn(1, 3, 640, 640, dtype=torch.float)
+                trace_input = trace_input.half()
+                trace_input = trace_input.to(ct.mlu_device())
+                quantized_net_jit = torch.jit.trace(quantized_net, trace_input, check_trace=False)
+                pred = quantized_net_jit(trace_input)
+                ct.save_as_cambricon("")
+
+            ct.set_core_version("MLU270")
+            sum_time = 0.0
+            # infer using the fused (offline) model
+            for i in range(16):
+                inference_start = time.time()
+                pred = quantized_net_jit(img.half().to(ct.mlu_device()))
+                # print("mlu inference time: ", i, " ", time.time() - inference_start)
+                mlu_outputs = pred.cpu().float()
+                # print(mlu_outputs.shape)
+                self.model.head.decode_outputs(mlu_outputs, dtype=mlu_outputs.type())
+                # if self.decoder is not None:
+                #     mlu_outputs = self.decoder(mlu_outputs, dtype=mlu_outputs.type())
+                mlu_outputs = postprocess(
+                    mlu_outputs, self.num_classes, self.confthre,
+                    self.nmsthre, class_agnostic=True
+                )
+                once_time = time.time() - inference_start
+                np.allclose(mlu_outputs[0].detach().numpy(), cpu_outputs[0].detach().numpy())
+                sum_time = sum_time + once_time
+                # print("mlu e2e time:", i, " ", once_time)
+            logger.info("MLU FUSION AVG Infer time: {:.4f}s".format(sum_time / 16))
+
+        return mlu_outputs, img_info
 
     def visual(self, output, img_info, cls_conf=0.35):
         ratio = img_info["ratio"]
@@ -282,7 +358,23 @@ def main(exp, args):
         logger.info("loading checkpoint")
         ckpt = torch.load(ckpt_file, map_location="cpu")
         # load the model state dict
-        model.load_state_dict(ckpt["model"])
+        # model.load_state_dict(ckpt["model"])
+        # Modify the checkpoint: add weight data for the newly added convolution.
+        weight = [[1., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [1., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 0.], [1., 0., 0., 0.],
+                  [0., 0., 1., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 1., 0.],
+                  [0., 1., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 1., 0., 0.],
+                  [0., 0., 0., 1.], [0., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 1.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 1.]]
+        ckpt["backbone.backbone.stem.space_to_depth_conv.weight"] = torch.from_numpy(
+            np.array(weight).reshape(12, 3, 2, 2).astype(np.float32))
+        model.load_state_dict(ckpt)
+        logger.info("loaded checkpoint done.")
 
     if args.fuse:
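
Note on the last hunk: the hand-written 12x3x2x2 tensor is the weight for the new backbone.backbone.stem.space_to_depth_conv layer. Each output channel copies exactly one input channel at one position of a 2x2 patch, so the stride-2 convolution reproduces the Focus-style space-to-depth slicing of the YOLOX stem (presumably because the slice-based Focus op is awkward for the MLU toolchain). As an illustration only, not part of the patch and with a hypothetical helper name, the same tensor can be generated programmatically instead of being written out by hand:

    # Sketch: build the 12x3x2x2 space-to-depth conv weight programmatically.
    # The 2x2 positions follow the order used in the hand-written table above:
    # top-left, bottom-left, top-right, bottom-right.
    import numpy as np

    def make_space_to_depth_weight(in_channels=3):
        positions = [(0, 0), (1, 0), (0, 1), (1, 1)]  # (row, col) inside each 2x2 patch
        w = np.zeros((4 * in_channels, in_channels, 2, 2), dtype=np.float32)
        for p, (r, c) in enumerate(positions):
            for ch in range(in_channels):
                # output channel p * in_channels + ch picks input channel ch at (r, c)
                w[p * in_channels + ch, ch, r, c] = 1.0
        return w  # shape (12, 3, 2, 2), equal to np.array(weight).reshape(12, 3, 2, 2)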