diff --git a/tools/demo.py b/tools/demo.py
index b16598d..2e7cb76 100644
--- a/tools/demo.py
+++ b/tools/demo.py
@@ -10,14 +10,28 @@ from loguru import logger
 import cv2
 
 import torch
-
+import numpy as np
 from yolox.data.data_augment import ValTransform
 from yolox.data.datasets import COCO_CLASSES
 from yolox.exp import get_exp
 from yolox.utils import fuse_model, get_model_info, postprocess, vis
 
+
+import torch_mlu
+import torch_mlu.core.mlu_model as ct
+import torch_mlu.core.mlu_quantize as mlu_quantize
+
 IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"]
 
 
+def quantize(model, imgs, quant_type, quant_model_name):
+    # Generate a quantized copy of the model and save its state dict.
+    mean = [0, 0, 0]
+    std = [1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0]
+    qconfig = {'iteration': 1, 'mean': mean, 'std': std, 'firstconv': True}
+    quantized_model = mlu_quantize.quantize_dynamic_mlu(
+        model, qconfig, dtype=quant_type, gen_quant=True)
+    for img in imgs:
+        pred = quantized_model(img)
+    torch.save(quantized_model.state_dict(), quant_model_name)
+
 
 def make_parser():
     parser = argparse.ArgumentParser("YOLOX Demo!")
@@ -27,6 +41,9 @@ def make_parser():
     parser.add_argument("-expn", "--experiment-name", type=str, default=None)
     parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+    parser.add_argument("-q", "--quant", type=str, default=None, help="model quant type")
+    parser.add_argument("-cn", "--core_num", type=str, default=None, help="mlu core number")
+    parser.add_argument("-md", "--mlu_device", type=str, default=None, help="mlu device: 270 or 220")
 
     parser.add_argument(
         "--path", default="./assets/dog.jpg", help="path to images or video"
     )
@@ -155,15 +172,74 @@ class Predictor(object):
         with torch.no_grad():
             t0 = time.time()
-            outputs = self.model(img)
-            if self.decoder is not None:
-                outputs = self.decoder(outputs, dtype=outputs.type())
-            outputs = postprocess(
-                outputs, self.num_classes, self.confthre,
+            # CPU reference inference
+            cpu_outputs = self.model(img)
+            self.model.head.decode_outputs(cpu_outputs, dtype=cpu_outputs.type())
+            # if self.decoder is not None:
+            #     cpu_outputs = self.decoder(cpu_outputs, dtype=cpu_outputs.type())
+            cpu_outputs = postprocess(
+                cpu_outputs, self.num_classes, self.confthre,
+                self.nmsthre, class_agnostic=True
+            )
+            logger.info("CPU Infer time: {:.4f}s".format(time.time() - t0))
+            print("cpu outputs: ", cpu_outputs[0])
+
+            # MLU layer-by-layer inference
+            quant_path = args.name + "_" + args.quant + ".pth"
+            # quantize the model and dump the quantized weights
+            quantize(self.model, [img], args.quant, quant_path)
+            state_dict = torch.load(quant_path)
+            ct.set_core_number(int(args.core_num))
+            ct.set_core_version(args.mlu_device.upper())
+            quantized_net = torch_mlu.core.mlu_quantize.quantize_dynamic_mlu(self.model)
+            quantized_net.load_state_dict(state_dict, strict=False)
+            quantized_net.eval()
+            quantized_net.to(ct.mlu_device())
+            t1 = time.time()
+            pred = quantized_net(img.half().to(ct.mlu_device()))
+            mlu_outputs = pred.cpu().float()
+            self.model.head.decode_outputs(mlu_outputs, dtype=mlu_outputs.type())
+            # if self.decoder is not None:
+            #     mlu_outputs = self.decoder(mlu_outputs, dtype=mlu_outputs.type())
+            mlu_outputs = postprocess(
+                mlu_outputs, self.num_classes, self.confthre,
                 self.nmsthre, class_agnostic=True
             )
-            logger.info("Infer time: {:.4f}s".format(time.time() - t0))
-        return outputs, img_info
+            logger.info("MLU by-layer Infer time: {:.4f}s".format(time.time() - t1))
+            print("mlu output: ", mlu_outputs[0])
+            # sanity check: compare MLU result against the CPU reference
+            np.allclose(mlu_outputs[0].detach().numpy(), cpu_outputs[0].detach().numpy())
+
+            # save cambricon offline model
+            if True:
+                ct.save_as_cambricon("./offline_models/" + args.name + "_" + args.quant + "_core" + args.core_num + "_" + args.mlu_device)
+                torch.set_grad_enabled(False)
+                trace_input = torch.randn(1, 3, 640, 640, dtype=torch.float)
+                trace_input = trace_input.half()
+                trace_input = trace_input.to(ct.mlu_device())
+                quantized_net_jit = torch.jit.trace(quantized_net, trace_input, check_trace=False)
+                pred = quantized_net_jit(trace_input)
+                ct.save_as_cambricon("")
+
+            ct.set_core_version("MLU270")
+            sum_time = 0.0
+            # infer using the fused (offline) model
+            for i in range(16):
+                inference_start = time.time()
+                pred = quantized_net_jit(img.half().to(ct.mlu_device()))
+                # print("mlu inference time: ", i, " ", time.time() - inference_start)
+                mlu_outputs = pred.cpu().float()
+                # print(mlu_outputs.shape)
+                self.model.head.decode_outputs(mlu_outputs, dtype=mlu_outputs.type())
+                # if self.decoder is not None:
+                #     mlu_outputs = self.decoder(mlu_outputs, dtype=mlu_outputs.type())
+                mlu_outputs = postprocess(
+                    mlu_outputs, self.num_classes, self.confthre,
+                    self.nmsthre, class_agnostic=True
+                )
+                once_time = time.time() - inference_start
+                np.allclose(mlu_outputs[0].detach().numpy(), cpu_outputs[0].detach().numpy())
+                sum_time = sum_time + once_time
+                # print("mlu e2e time:", i, " ", once_time)
+            logger.info("MLU FUSION AVG Infer time: {:.4f}s".format(sum_time / 16))
+
+        return mlu_outputs, img_info
 
     def visual(self, output, img_info, cls_conf=0.35):
         ratio = img_info["ratio"]
@@ -282,7 +358,23 @@ def main(exp, args):
         logger.info("loading checkpoint")
         ckpt = torch.load(ckpt_file, map_location="cpu")
         # load the model state dict
-        model.load_state_dict(ckpt["model"])
+        # model.load_state_dict(ckpt["model"])
+        # Modify the checkpoint: add weight data for the newly added convolution.
+        weight = [[1., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [1., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 0.], [1., 0., 0., 0.],
+                  [0., 0., 1., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 1., 0.],
+                  [0., 1., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 1., 0., 0.],
+                  [0., 0., 0., 1.], [0., 0., 0., 0.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 1.], [0., 0., 0., 0.],
+                  [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 1.]]
+        ckpt["backbone.backbone.stem.space_to_depth_conv.weight"] = torch.from_numpy(
+            np.array(weight).reshape(12, 3, 2, 2).astype(np.float32))
+        model.load_state_dict(ckpt)
+        logger.info("loaded checkpoint done.")
 
     if args.fuse:
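
Note on the last hunk: the hand-written 12x3x2x2 tensor is the weight for the new backbone.backbone.stem.space_to_depth_conv layer. Each output channel copies exactly one input channel at one position of a 2x2 patch, so the stride-2 convolution reproduces the Focus-style space-to-depth slicing of the YOLOX stem (presumably because the slice-based Focus op is awkward for the MLU toolchain). As an illustration only, not part of the patch and with a hypothetical helper name, the same tensor can be generated programmatically instead of being written out by hand:

    # Sketch: build the 12x3x2x2 space-to-depth conv weight programmatically.
    # The 2x2 positions follow the order used in the hand-written table above:
    # top-left, bottom-left, top-right, bottom-right.
    import numpy as np

    def make_space_to_depth_weight(in_channels=3):
        positions = [(0, 0), (1, 0), (0, 1), (1, 1)]  # (row, col) inside each 2x2 patch
        w = np.zeros((4 * in_channels, in_channels, 2, 2), dtype=np.float32)
        for p, (r, c) in enumerate(positions):
            for ch in range(in_channels):
                # output channel p * in_channels + ch picks input channel ch at (r, c)
                w[p * in_channels + ch, ch, r, c] = 1.0
        return w  # shape (12, 3, 2, 2), equal to np.array(weight).reshape(12, 3, 2, 2)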