Unverified commit d4d84e60, authored by: Houjiang Chen, committed by: GitHub

xrt support TensorRT int8 (#2637)

* Add tensorrt int8 calibrator

* Generate calibration correctly.

* Refine xrt int8 and readme

* Update readme

* Add xrt int8 unittest

* merge develop

* leaky relu test

* function->global_function

* fix LookupOrCreate

* OF_CHECK->CHECK_OR_RETURN
Co-authored-by: guo-ran <360112263@qq.com>
Parent 9a3a541e
......@@ -45,11 +45,12 @@ message MemoryAllocationAlgorithmConf {
message XrtConfig {
message XlaConfig {
// TODO
// TODO(hjchen2)
}
message TensorRTConfig {
optional bool use_fp16 = 1 [default = false];
optional bool use_int8 = 2 [default = false];
optional string int8_calibration = 3;
}
optional bool use_xla_jit = 1 [default = false];
optional bool use_tensorrt = 2 [default = false];
......
from __future__ import absolute_import
import oneflow.oneflow_internal as oneflow_internal
from oneflow.python.oneflow_export import oneflow_export
@oneflow_export("tensorrt.write_int8_calibration")
def write_int8_calibration(path):
oneflow_internal.WriteInt8Calibration(path)
@oneflow_export("tensorrt.cache_int8_calibration")
def cache_int8_calibration():
oneflow_internal.CacheInt8Calibration()
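These two exports back the `flow.tensorrt.*` calls used in the README further down. A minimal usage sketch, assuming a TensorRT int8 job has already consumed a few calibration batches (the directory path is illustrative):

```python
import oneflow as flow

# Finalize the in-memory calibration table once the calibration batches are done.
flow.tensorrt.cache_int8_calibration()
# Persist one calibration table file per compiled subgraph into an existing directory.
flow.tensorrt.write_int8_calibration("./int8_calibration")
```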
......@@ -414,6 +414,12 @@ def set_tensorrt_use_int8(func_desc, value=True):
func_desc.job_config_proto.xrt_config.tensorrt_config.use_int8 = value
@oneflow_function_config("tensorrt.int8_calibration")
def set_tensorrt_int8_calibration(func_desc, value):
assert func_desc.job_config_proto.xrt_config.tensorrt_config.use_int8
func_desc.job_config_proto.xrt_config.tensorrt_config.int8_calibration = value
@oneflow_function_config("default_distribute_strategy")
def set_default_distribute_strategy(func_desc, value):
assert isinstance(value, distribute_ctx.DistributeStrategy)
......
......@@ -211,3 +211,11 @@ void OfBlob_CurMutTensorCopyShapeFrom(uint64_t of_blob_ptr, long* array, int siz
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
return of_blob->CurMutTensorCopyShapeFrom(array, size);
}
void CacheInt8Calibration(std::string* error_str) {
oneflow::CacheInt8Calibration().GetDataAndSerializedErrorProto(error_str);
}
void WriteInt8Calibration(const std::string& path, std::string* error_str) {
oneflow::WriteInt8Calibration(path).GetDataAndSerializedErrorProto(error_str);
}
......@@ -24,6 +24,10 @@
#include "oneflow/core/framework/op_registration.h"
#include "oneflow/core/persistence/tee_persistent_log_stream.h"
#ifdef WITH_TENSORRT
#include "oneflow/xrt/api.h"
#endif // WITH_TENSORRT
namespace oneflow {
Maybe<void> RegisterWatcherOnlyOnce(ForeignWatcher* watcher) {
......@@ -177,6 +181,25 @@ Maybe<std::string> GetSerializedMachineId2DeviceIdListOFRecord(
return PbMessage2TxtString(*JUST(ParseMachineAndDeviceIdList(parallel_conf)));
}
Maybe<void> CacheInt8Calibration() {
#ifdef WITH_TENSORRT
xrt::tensorrt::CacheInt8Calibration();
#else
CHECK_OR_RETURN(0) << "Please recompile with TensorRT.";
#endif // WITH_TENSORRT
return Maybe<void>::Ok();
}
Maybe<void> WriteInt8Calibration(const std::string& path) {
#ifdef WITH_TENSORRT
xrt::tensorrt::CacheInt8Calibration();
xrt::tensorrt::WriteInt8Calibration(path);
#else
CHECK_OR_RETURN(0) << "Please recompile with TensorRT.";
#endif // WITH_TENSORRT
return Maybe<void>::Ok();
}
Maybe<std::string> CheckAndCompleteUserOpConf(const std::string& op_conf_str) {
OperatorConf op_conf;
CHECK_OR_RETURN(TxtString2PbMessage(op_conf_str, &op_conf)) << "operator conf parse failed";
......
import unittest
import numpy as np
import oneflow as flow
config = flow.function_config()
def make_job(input_shape, alpha, dtype=flow.float32):
config.use_xla_jit(False)
config.use_tensorrt(False)
@flow.global_function(config)
def leaky_relu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
return flow.nn.leaky_relu(x, alpha=alpha)
return leaky_relu_job
def make_trt_job(input_shape, alpha, dtype=flow.float32):
config.use_xla_jit(False)
config.use_tensorrt(True)
@flow.global_function(config)
def trt_leaky_relu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
return flow.nn.leaky_relu(x, alpha=alpha)
return trt_leaky_relu_job
class TestLeakyRelu(unittest.TestCase):
def _test_body(self, x, alpha, dtype=np.float32):
f1 = make_job(x.shape, alpha, dtype=flow.float32)
f2 = make_trt_job(x.shape, alpha, dtype=flow.float32)
a = f1(x).get()
b = f2(x).get()
print("oneflow: ", a)
print("oneflow with tensorrt: ", b)
self.assertTrue(np.allclose(a.ndarray(), b.ndarray(), rtol=1e-03, atol=1e-05))
flow.clear_default_session()
def _test_ones_body(self, shape, alpha=0.1, dtype=np.float32):
x = np.ones(shape, dtype=dtype)
self._test_body(x, alpha, dtype=dtype)
def _test_random_body(self, shape, alpha=0.1, dtype=np.float32):
# np.random.random generates floats in the range [0, 1).
x = 100 * (np.random.random(shape).astype(dtype) - 0.5)
self._test_body(x, alpha, dtype=dtype)
def test_ones_input(self):
self._test_ones_body((1), alpha=0.1)
self._test_ones_body((1, 10), alpha=0.1)
self._test_ones_body((2, 10, 2), alpha=0.1)
self._test_ones_body((2, 5, 2, 2), alpha=0.1)
self._test_ones_body((1), alpha=0.33)
self._test_ones_body((1, 10), alpha=0.33)
self._test_ones_body((2, 10, 2), alpha=0.33)
self._test_ones_body((2, 5, 2, 2), alpha=0.33)
def test_random_input(self):
self._test_random_body((1), alpha=0.1)
self._test_random_body((1, 10), alpha=0.1)
self._test_random_body((2, 10, 2), alpha=0.1)
self._test_random_body((2, 5, 2, 2), alpha=0.1)
self._test_random_body((1), alpha=0.33)
self._test_random_body((1, 10), alpha=0.33)
self._test_random_body((2, 10, 2), alpha=0.33)
self._test_random_body((2, 5, 2, 2), alpha=0.33)
if __name__ == "__main__":
unittest.main()
import unittest
import numpy as np
import oneflow as flow
config = flow.function_config()
def make_trt_job(
x_shape,
w_shape,
kernel_size=None,
strides=None,
padding="valid",
data_format="NCHW",
dilation_rate=None,
dtype=flow.float32,
):
config.use_xla_jit(False)
config.use_tensorrt(True)
config.tensorrt.use_int8()
@flow.global_function(config)
def trt_conv2d_job(
x=flow.FixedTensorDef(x_shape, dtype=dtype),
weight=flow.FixedTensorDef(w_shape, dtype=dtype),
):
return flow.nn.conv2d(x, weight, strides, padding, data_format, dilation_rate)
return trt_conv2d_job
class TestConv2d(unittest.TestCase):
def make_filter_shape(self, shape, filters, kernel_size, data_format):
if data_format == "NCHW":
return [filters, shape[1], kernel_size, kernel_size]
else:
return [filters, kernel_size, kernel_size, shape[3]]
def _test_body(
self,
x,
filters,
kernel_size,
strides,
padding,
data_format,
dilation_rate,
dtype=np.float32,
):
f2 = make_trt_job(
x.shape,
filters.shape,
kernel_size,
strides,
padding,
data_format,
dilation_rate,
dtype=flow.float32,
)
for i in range(1):
b = f2(x, filters).get()
print("with tensorrt float32: ", b)
flow.tensorrt.cache_int8_calibration()
for i in range(1):
b = f2(x, filters).get()
print("with tensorrt int8: ", b)
flow.clear_default_session()
def _test_ones_body(
self,
shape,
filters,
kernel_size,
strides,
padding,
data_format,
dilation_rate,
dtype=np.float32,
):
assert len(shape) == 4
x = np.ones(shape, dtype=dtype)
w_shape = self.make_filter_shape(shape, filters, kernel_size, data_format)
weight = np.random.random(w_shape).astype(dtype)
self._test_body(
x,
weight,
kernel_size=kernel_size,
strides=strides,
padding=padding,
data_format=data_format,
dilation_rate=dilation_rate,
)
def _test_random_body(
self,
shape,
filters,
kernel_size,
strides,
padding,
data_format,
dilation_rate,
dtype=np.float32,
):
assert len(shape) == 4
x = np.random.random(shape).astype(dtype)
w_shape = self.make_filter_shape(shape, filters, kernel_size, data_format)
weight = np.random.random(w_shape).astype(dtype)
self._test_body(
x,
weight,
kernel_size=kernel_size,
strides=strides,
padding=padding,
data_format=data_format,
dilation_rate=dilation_rate,
)
def test_random_kernel_1x1(self):
self._test_random_body(
shape=[3, 3, 5, 5],
filters=1,
kernel_size=1,
strides=1,
padding="VALID",
data_format="NCHW",
dilation_rate=1,
)
if __name__ == "__main__":
unittest.main()
......@@ -119,8 +119,64 @@ XRT in OneFlow is disabled by default and can be enabled through the frontend Python interface and
# TensorRT float16
config.tensorrt.use_fp16()
# TensorRT int8 (not supported yet)
# TensorRT int8 (loading the calibration table offline)
config.tensorrt.use_int8()
# Set int8 calibration table path
int8_calibration_path = "./int8_calibration"
config.tensorrt.int8_calibration(int8_calibration_path)
```
#### Using Int8 quantized computation
XRT supports two ways to enable Int8 quantized computation: loading a calibration table offline, or generating one online. The offline approach requires a calibration table in TensorRT format to be generated in advance, and that table can usually be reused. The online approach performs normal-precision computation and calibration-table generation in the same script; once the table has been generated, computation automatically switches to Int8 precision in the next iteration.
- Generating the Int8 calibration table
First, prepare a calibration dataset for generating the calibration table, usually a subset of the training or validation set. Then configure the network as usual and enable TensorRT Int8. For example:
```python
import oneflow as flow
config = flow.function_config()
config.use_tensorrt()
config.tensorrt.use_int8()
@flow.function(config)
def Job(input):
# define your network
pass
```
When Int8 is enabled but no calibration table is specified, XRT automatically enters calibration-table generation mode. All data fed afterwards is computed at normal precision (fp32 or fp16), and the results are used to generate the corresponding Int8 calibration table. Finally, the generated tables are saved to the specified directory, where each subgraph produces its own calibration table file.
```python
# Generate the Int8 calibration table from 10 batches of data
for _ in range(10):
    input = next_calibration_batch()  # load the calibration dataset
    Job(input).get()
# Save the calibration table
flow.tensorrt.write_int8_calibration("./int8_calibration")  # the int8_calibration directory must be created manually
```
Once the Int8 calibration table has been generated, you can start TensorRT Int8 quantized computation using the offline calibration-loading approach described above.
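As a minimal sketch of that offline-loading setup (reusing the Job definition and the illustrative next_batch loader from these examples):

```python
import oneflow as flow

config = flow.function_config()
config.use_tensorrt()
config.tensorrt.use_int8()
# Load the previously written calibration table (path is illustrative).
config.tensorrt.int8_calibration("./int8_calibration")

@flow.function(config)
def Job(input):
    # define your network
    pass

for _ in range(100):
    input = next_batch()  # placeholder data loader
    Job(input).get()
```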
- Generating the calibration table online and running int8 computation
The online approach has two steps: first, generate the calibration table from the calibration dataset; then build and run the Int8 graph directly with the generated table. Again using the Job above as an example:
```python
# Generate the Int8 calibration table from 10 batches of data
for _ in range(10):
    input = next_calibration_batch()  # load the calibration dataset
    Job(input).get()
# Cache the calibration table
flow.tensorrt.cache_int8_calibration()
# Once the calibration table has been cached, XRT automatically switches to int8 computation
for _ in range(100):
    input = next_batch()  # load data
    Job(input).get()
```
### Benchmark
......
#include "oneflow/xrt/api.h"
#include "absl/strings/str_cat.h"
#include "glog/logging.h"
#include "oneflow/core/operator/operator.h" // GenLogicalBlobName, GenLogicalBlobId
#include "oneflow/xrt/build_graph.h"
#include "oneflow/xrt/utility/env.h"
#include <fstream>
#include <mutex>
#ifdef WITH_TENSORRT
#include "oneflow/xrt/tensorrt/trt_int8_calibrator.h"
#endif // WITH_TENSORRT
DEFINE_int32(clustering_minimum_nodes, EnvToInt(FLAGS_clustering_minimum_nodes, 1),
"Minium nodes of a cluster after clustering.");
DEFINE_int32(clustering_maximum_nodes, EnvToInt(FLAGS_clustering_maximum_nodes, 1000),
......@@ -24,6 +32,11 @@ DEFINE_bool(tensorrt_fp16, EnvToBool(FLAGS_tensorrt_fp16, false),
DEFINE_bool(tensorrt_int8, EnvToBool(FLAGS_tensorrt_int8, false),
"Enable int8 precision for TENSORRT engine.");
DEFINE_string(int8_calibration, EnvToString(FLAGS_int8_calibration, ""),
"TensorRT int8 calibration table directory. "
"Default is empty, and this means the calibration table will be "
"implictly generated if tensorrt_int8 flag is true.");
namespace oneflow {
namespace xrt {
......@@ -73,6 +86,7 @@ static std::unordered_map<std::string, std::string> user_op_type_name2string_map
{"layer_norm_grad", "LayerNormGrad"},
{"scalar_add", "ScalarAdd"},
{"scalar_mul", "ScalarMul"},
{"leaky_relu", "LeakyRelu"},
};
std::string ExtractOpTypeAsString(const OperatorConf &conf) {
......@@ -159,7 +173,12 @@ void InitXrtConfigurations(const XrtConfig &config) {
if (config.has_tensorrt_config()) {
const XrtConfig::TensorRTConfig &trt_config = config.tensorrt_config();
if (trt_config.has_use_fp16()) { FLAGS_tensorrt_fp16 = trt_config.use_fp16(); }
if (trt_config.has_use_int8()) { FLAGS_tensorrt_int8 = trt_config.use_int8(); }
if (trt_config.has_use_int8()) {
FLAGS_tensorrt_int8 = trt_config.use_int8();
if (trt_config.has_int8_calibration()) {
FLAGS_int8_calibration = trt_config.int8_calibration();
}
}
}
}
......@@ -193,5 +212,41 @@ void RunCompilationTimeXrtPasses(const OpGraph &op_graph, Job *job, bool train_p
RunXrtPass("RebuildCompiledJob", graph.get(), options, job);
}
#ifdef WITH_TENSORRT
namespace tensorrt {
void CacheInt8Calibration() {
const auto &calib_resources = TRTInt8CalibratorResource::All();
for (const auto &res : calib_resources) {
std::lock_guard<std::mutex> lock(res.second->mutex_);
if (!res.second->calibrator_->isDone()) {
res.second->calibrator_->waitAndSetDone();
res.second->thread_->join();
}
res.second->calibrator_->ReleaseDevBuffers();
}
}
void WriteInt8Calibration(const std::string &path) {
const auto &calib_resources = TRTInt8CalibratorResource::All();
for (const auto &res : calib_resources) {
CHECK(res.second->calibrator_->isDone()) // NOLINT
<< "Calibration table maybe has not been generated "
<< "since the calibrator has not been done.";
const std::string &calibration_table_data =
res.second->calibrator_->getCalibrationTableAsString();
CHECK(calibration_table_data.size()) << "Calibration table data is empty.";
std::string calib_store_path = // NOLINT
absl::StrCat(path, "/", res.first /*calibrator name*/);
std::ofstream ofile(calib_store_path, std::ios::out);
CHECK(ofile.good()) << "Could not open calibration file: " << calib_store_path;
ofile << calibration_table_data;
ofile.close();
}
}
} // namespace tensorrt
#endif // WITH_TENSORRT
} // namespace xrt
} // namespace oneflow
......@@ -66,6 +66,13 @@ inline void RunXrtPass(const std::string &pass, XrtGraph *graph, const XrtPassOp
void RunCompilationTimeXrtPasses(const OpGraph &op_graph, Job *job, bool train_phase);
#ifdef WITH_TENSORRT
namespace tensorrt {
void CacheInt8Calibration();
void WriteInt8Calibration(const std::string &path);
} // namespace tensorrt
#endif // WITH_TENSORRT
} // namespace xrt
} // namespace oneflow
......
......@@ -35,6 +35,8 @@ struct ExecutableRunOptions {
// Enable TensorRT int8
bool tensorrt_int8 = false;
std::string tensorrt_int8_calibration = "";
// Feed the return parameters to reuse their storage while running
// the executable.
std::vector<Parameter> return_params;
......@@ -42,21 +44,29 @@ struct ExecutableRunOptions {
class Executable {
public:
Executable(const XrtEngine &engine) : engine_(engine) {}
Executable(const std::string &name, const XrtEngine &engine) // NOLINT
: name_(name), engine_(engine) {}
virtual ~Executable() = default;
const XrtEngine &engine() const { return engine_; }
virtual bool Run(const std::vector<Parameter> &inputs, const ExecutableRunOptions &run_options,
const std::string &name() const { return name_; }
virtual bool Run(const std::vector<Parameter> &inputs, // NOLINT
const ExecutableRunOptions &run_options, // NOLINT
bool block_until_done = true) = 0;
bool RunAsync(const std::vector<Parameter> inputs, const ExecutableRunOptions &run_options) {
bool RunAsync(const std::vector<Parameter> inputs, // NOLINT
const ExecutableRunOptions &run_options) {
return Run(inputs, run_options, false);
}
const std::vector<Parameter> &Results() const { return results_; }
protected:
// Executable name.
std::string name_;
// Executable engine, XLA or TensorRT.
XrtEngine engine_;
std::vector<Parameter> results_;
};
......
......@@ -15,6 +15,7 @@ DEFINE_int32(max_batch_size, EnvToInt(FLAGS_max_batch_size, 1),
DECLARE_bool(tensorrt_fp16);
DECLARE_bool(tensorrt_int8);
DECLARE_string(int8_calibration);
namespace oneflow {
namespace xrt {
......@@ -165,6 +166,8 @@ void XrtLaunchKernel<device_type>::ForwardDataContent(
CHECK_EQ(device_type, DeviceType::kGPU);
run_options.max_batch_size = FLAGS_max_batch_size;
run_options.tensorrt_fp16 = FLAGS_tensorrt_fp16;
run_options.tensorrt_int8 = FLAGS_tensorrt_int8;
run_options.tensorrt_int8_calibration = FLAGS_int8_calibration;
}
bool status = executable->Run(entry_params, run_options, block_until_done);
CHECK(status) << "Executable is running failed.";
......
......@@ -32,6 +32,25 @@ int GetDeviceId(const XrtDevice &device) {
return 0; // Compiler warning free
}
void SetDeviceId(const XrtDevice &device, const int device_id) {
switch (device) {
case XrtDevice::CPU_X86: return;
case XrtDevice::GPU_CUDA: {
#ifdef WITH_CUDA
CHECK_EQ(cudaSuccess, cudaSetDevice(device_id));
return;
#endif
}
case XrtDevice::GPU_CL:
// TODO(hjchen2)
case XrtDevice::CPU_ARM:
// TODO(hjchen2)
case XrtDevice::GPU_ARM:
// TODO(hjchen2)
return;
}
}
} // namespace platform
} // namespace xrt
......
......@@ -10,6 +10,8 @@ namespace platform {
int GetDeviceId(const XrtDevice &device);
void SetDeviceId(const XrtDevice &device, const int device_id);
} // namespace platform
} // namespace xrt
......
......@@ -28,6 +28,22 @@ REGISTER_TRT_OP_KERNEL(Sigmoid, ActivationOp<nvinfer1::ActivationType::kSIGMOID>
.EnableTrainPhase()
.Finalize();
template<>
class ActivationOp<nvinfer1::ActivationType::kLEAKY_RELU> : public TrtOpKernel {
public:
void Compile(TrtOpContext *ctx) override {
nvinfer1::ITensor *in = ctx->SoleInput();
auto *layer = ctx->builder()->addActivation(*in, nvinfer1::ActivationType::kLEAKY_RELU);
layer->setAlpha(ctx->Attr<float>("alpha"));
layer->setName(ctx->op_name().c_str());
ctx->SetSoleOutput(layer->getOutput(0));
}
};
REGISTER_TRT_OP_KERNEL(LeakyRelu, ActivationOp<nvinfer1::ActivationType::kLEAKY_RELU>)
.EnableTrainPhase()
.Finalize();
} // namespace tensorrt
} // namespace xrt
} // namespace oneflow
......@@ -65,7 +65,7 @@ class TrtBuilder {
std::string builder_name_;
// The next new handle number.
int64_t next_handle_ = 0;
int64_t next_handle_ = -1;
nv::unique_ptr<nvinfer1::IBuilder> builder_;
nv::unique_ptr<nvinfer1::INetworkDefinition> network_;
......@@ -78,7 +78,7 @@ class TrtBuilder {
util::Map<std::string, std::shared_ptr<std::vector<uint8_t>>> host_weights_;
public:
explicit TrtBuilder(const std::string &name) : builder_name_(name), next_handle_(0) {
explicit TrtBuilder(const std::string &name) : builder_name_(name), next_handle_(-1) {
static nv::Logger logger;
builder_.reset(nvinfer1::createInferBuilder(logger));
nvinfer1::NetworkDefinitionCreationFlags flags =
......@@ -87,6 +87,8 @@ class TrtBuilder {
network_.reset(builder_->createNetworkV2(flags));
}
const std::string &name() const { return builder_name_; }
nvinfer1::ITensor *GetTensor(int64_t handle);
nvinfer1::Weights &GetWeight(int64_t handle);
......@@ -138,7 +140,7 @@ class TrtBuilder {
CHECK_GT(params_.count(handle), 0) << "Parameter is not found for handle " << handle;
}
int64_t IncreaseHandle() { return next_handle_++; }
int64_t IncreaseHandle() { return ++next_handle_; }
};
} // namespace tensorrt
......
#include "cuda_runtime.h"
#include "oneflow/xrt/tensorrt/trt_executable.h"
#include "oneflow/xrt/tensorrt/trt_int8_calibrator.h"
#include "oneflow/xrt/platform.h"
#include <iostream>
#include <sstream>
#include "cuda_runtime.h"
#include "absl/strings/str_cat.h"
namespace oneflow {
namespace xrt {
namespace tensorrt {
bool TrtExecutable::CreateExecutableEngine(const ExecutableRunOptions &run_options,
const int batch_size) {
if (!builder_ || !network_) { return false; }
auto build_config = nv::unique_ptr<nvinfer1::IBuilderConfig>(builder_->createBuilderConfig());
int64_t max_workspace_size = 1U << 24; // 16MiB
if (run_options.device_memory_limit > 0) { max_workspace_size = run_options.device_memory_limit; }
nvinfer1::ICudaEngine *TrtExecutable::CreateExecutableEngine(
const ExecutableRunOptions &run_options, const int batch_size /*= 1*/,
TRTInt8Calibrator *calibrator /*= nullptr*/) {
CHECK(builder_ && network_) << "Builder and network should be setup before.";
auto build_config = // NOLINT
nv::unique_ptr<nvinfer1::IBuilderConfig>(builder_->createBuilderConfig());
int64_t max_workspace_size = 1U << 24; // 16MiB
if (run_options.device_memory_limit > 0) { // NOLINT
max_workspace_size = run_options.device_memory_limit;
}
build_config->setMaxWorkspaceSize(max_workspace_size);
nvinfer1::BuilderFlags flags = 0U;
if (run_options.tensorrt_fp16) {
if (builder_->platformHasFastFp16()) {
flags |= (1U << int(nvinfer1::BuilderFlag::kFP16));
// Setting the kFP16 flag alone does not guarantee half precision,
// but kSTRICT_TYPES can be set to force it.
// flags |= (1U << int(nvinfer1::BuilderFlag::kSTRICT_TYPES));
} else {
LOG(INFO) << "TensorRT couldn't use fp16 precision since the GPU "
"hardware does not support.";
}
}
// flags |= (1U << int(nvinfer1::BuilderFlag::kINT8));
if (run_options.tensorrt_int8) {
if (builder_->platformHasFastInt8()) {
if (calibrator) {
flags |= (1U << int(nvinfer1::BuilderFlag::kINT8));
if (builder_->platformHasFastFp16()) { // NOLINT
flags |= (1U << int(nvinfer1::BuilderFlag::kFP16));
}
build_config->setInt8Calibrator(calibrator);
}
} else {
LOG(INFO) << "TensorRT couldn't use int8 precision since the GPU "
"hardware does not support.";
}
}
// Setting the kFP16 or kINT8 flag alone does not guarantee low precision,
// but kSTRICT_TYPES can be set to enforce half or int8 precision.
// flags |= (1U << int(nvinfer1::BuilderFlag::kSTRICT_TYPES));
// flags |= (1U << int(nvinfer1::BuilderFlag::kREFIT));
build_config->setFlags(flags);
// build_config->setInt8Calibrator();
int32_t max_batch_size = std::max(run_options.max_batch_size, batch_size);
builder_->setMaxBatchSize(max_batch_size);
// builder_->setGpuAllocator();
engine_.reset(builder_->buildEngineWithConfig(*network_, *build_config));
return true;
return builder_->buildEngineWithConfig(*network_, *build_config);
}
bool TrtExecutable::ExecuteEngine(int batch_size, void **buffers, void *stream,
bool block_until_done) {
if (!execution_context_) { execution_context_.reset(engine_->createExecutionContext()); }
if (!execution_context_) { // NOLINT
execution_context_.reset(engine_->createExecutionContext());
}
cudaStream_t cu_stream = reinterpret_cast<cudaStream_t>(stream);
bool status =
// execution_context_->enqueue(batch_size, buffers, cu_stream, nullptr);
execution_context_->enqueueV2(buffers, cu_stream, nullptr);
if (block_until_done) { CHECK_EQ(cudaSuccess, cudaStreamSynchronize(cu_stream)); }
if (block_until_done) { // NOLINT
CHECK_EQ(cudaSuccess, cudaStreamSynchronize(cu_stream));
}
return status;
}
std::string TrtExecutable::LoadCalibrationTable( // NOLINT
const std::string &calibration_path) {
std::string calib_restore_path(absl::StrCat(calibration_path, "/", this->name()));
std::ifstream infile(calib_restore_path, std::ios::in);
CHECK(infile.good()) << "Could not open calibration file: " // NOLINT
<< calib_restore_path;
std::stringstream buffer;
buffer << infile.rdbuf();
return buffer.str();
}
bool TrtExecutable::Run(const std::vector<Parameter> &inputs,
const ExecutableRunOptions &run_options, bool block_until_done) {
// TODO(hjchen2)
const ExecutableRunOptions &run_options, // NOLINT
bool block_until_done) {
// TODO(hjchen2): Refactor
if (run_options.tensorrt_int8 && !calibrator_ && // NOLINT
run_options.tensorrt_int8_calibration.size()) {
std::string calibration_data = // NOLINT
LoadCalibrationTable(run_options.tensorrt_int8_calibration);
CHECK(calibration_data.size()) << "Calibration data is empty.";
calibrator_.reset(new TRTInt8Calibrator(calibration_data));
}
if (!execution_context_ && !engine_) {
CHECK(CreateExecutableEngine(run_options)) << "Cannot create TensorRT executable engine.";
engine_.reset(CreateExecutableEngine(run_options, 1 /*batch size*/, // NOLINT
calibrator_.get()));
CHECK(engine_) << "Cannot create TensorRT executable engine.";
}
// All return params are results of the executable.
// All return params are the results of the executable.
this->results_ = run_options.return_params;
// TODO(hjchen2): Cache the parameters raw address.
util::Map<std::string, const Parameter *> all_params;
for (const Parameter &input : inputs) { all_params.emplace(input.name(), &input); }
for (const Parameter &output : this->results_) { all_params.emplace(output.name(), &output); }
for (const Parameter &input : inputs) { // NOLINT
all_params.emplace(input.name(), &input); // NOLINT
}
for (const Parameter &output : this->results_) { // NOLINT
all_params.emplace(output.name(), &output); // NOLINT
}
const int num_bindings = engine_->getNbBindings();
std::vector<const Parameter *> binding_params(num_bindings);
......@@ -75,13 +128,47 @@ bool TrtExecutable::Run(const std::vector<Parameter> &inputs,
// TODO(hjchen2): Check that the batch size is the same for all binding parameters.
const int batch_size = binding_params[0]->shape().At(0);
if (batch_size > engine_->getMaxBatchSize()) {
LOG(WARNING) << "Rebuild engine since the maximum batch size " << engine_->getMaxBatchSize()
LOG(WARNING) << "Rebuild engine since the maximum batch size " // NOLINT
<< engine_->getMaxBatchSize() // NOLINT
<< " is less than the input batch size " << batch_size;
CHECK(CreateExecutableEngine(run_options, batch_size))
<< "Failed to create engine with batch size " << batch_size;
engine_.reset(CreateExecutableEngine(run_options, batch_size, // NOLINT
calibrator_.get()));
CHECK(engine_) << "Failed to create engine with batch size " << batch_size;
execution_context_.reset(engine_->createExecutionContext());
}
return ExecuteEngine(batch_size, buffers.data(), run_options.stream, block_until_done);
if (run_options.tensorrt_int8 && !calibrator_) {
auto *res = TRTInt8CalibratorResource::LookupOrCreate(this->name());
{
std::lock_guard<std::mutex> lock(res->mutex_);
if (!res->calibrator_) {
res->calibrator_.reset(new TRTInt8Calibrator());
int ordinal = platform::GetDeviceId(XrtDevice::GPU_CUDA);
res->thread_.reset(new std::thread([this, ordinal, batch_size, res, // NOLINT
run_options]() {
platform::SetDeviceId(XrtDevice::GPU_CUDA, ordinal);
// TODO(hjchen2): TensorRT may crash if the calibrator batch size > 1
res->calibrator_->setBatchSize(1 /*batch_size*/);
res->engine_.reset( // NOLINT
this->CreateExecutableEngine(run_options, batch_size, // NOLINT
res->calibrator_.get()));
}));
}
}
if (res->calibrator_->isDone()) {
CHECK_EQ(cudaSuccess, cudaStreamSynchronize( // NOLINT
reinterpret_cast<cudaStream_t>(run_options.stream))); // NOLINT
calibrator_ = res->calibrator_;
// engine_ = std::move(res->engine_);
execution_context_.reset(res->engine_->createExecutionContext());
} else {
res->calibrator_->setBatch(binding_params);
}
}
return ExecuteEngine(batch_size, buffers.data(), run_options.stream, // NOLINT
block_until_done);
}
} // namespace tensorrt
......
......@@ -7,6 +7,7 @@
#include "oneflow/xrt/executable.h"
#include "oneflow/xrt/parameter.h"
#include "oneflow/xrt/tensorrt/trt_unique_ptr.h"
#include "oneflow/xrt/tensorrt/trt_int8_calibrator.h"
#include "oneflow/xrt/utility/stl.h"
namespace oneflow {
......@@ -17,15 +18,17 @@ namespace tensorrt {
class TrtExecutable : public Executable {
public:
explicit TrtExecutable(
nv::unique_ptr<nvinfer1::ICudaEngine> &&engine,
const std::string &name, nv::unique_ptr<nvinfer1::ICudaEngine> &&engine,
const util::Map<std::string, std::shared_ptr<std::vector<uint8_t>>> &host_weights)
: Executable(XrtEngine::TENSORRT), engine_(std::move(engine)), host_weights_(host_weights) {}
: Executable(name, XrtEngine::TENSORRT),
engine_(std::move(engine)), // NOLINT
host_weights_(host_weights) {}
explicit TrtExecutable(
nv::unique_ptr<nvinfer1::IBuilder> &&builder,
const std::string &name, nv::unique_ptr<nvinfer1::IBuilder> &&builder,
nv::unique_ptr<nvinfer1::INetworkDefinition> &&network,
const util::Map<std::string, std::shared_ptr<std::vector<uint8_t>>> &host_weights)
: Executable(XrtEngine::TENSORRT),
: Executable(name, XrtEngine::TENSORRT),
builder_(std::move(builder)),
network_(std::move(network)),
host_weights_(host_weights) {}
......@@ -36,16 +39,22 @@ class TrtExecutable : public Executable {
bool block_until_done = true) override;
private:
bool CreateExecutableEngine(const ExecutableRunOptions &run_options, const int batch_size = -1);
nvinfer1::ICudaEngine *CreateExecutableEngine(const ExecutableRunOptions &run_options,
const int batch_size = 1,
TRTInt8Calibrator *calibrator = nullptr);
bool ExecuteEngine(const int batch_size, void **buffers, void *stream, bool block_until_done);
std::string LoadCalibrationTable(const std::string &calibration_path);
private:
nv::unique_ptr<nvinfer1::ICudaEngine> engine_;
nv::unique_ptr<nvinfer1::IBuilder> builder_;
nv::unique_ptr<nvinfer1::INetworkDefinition> network_;
nv::unique_ptr<nvinfer1::IExecutionContext> execution_context_;
std::shared_ptr<TRTInt8Calibrator> calibrator_;
util::Map<std::string, std::shared_ptr<std::vector<uint8_t>>> host_weights_;
};
......
......@@ -81,9 +81,9 @@ std::shared_ptr<Executable> TrtGraphCompiler::Compile(
builder_->MarkOutput(value.handle());
}
// return std::make_shared<TrtExecutable>(builder_->BuildCudaEngine());
return std::make_shared<TrtExecutable>(builder_->ReleaseBuilder(), builder_->ReleaseNetwork(),
builder_->host_weights());
// return std::make_shared<TrtExecutable>(builder_->name(), builder_->BuildCudaEngine());
return std::make_shared<TrtExecutable>(builder_->name(), builder_->ReleaseBuilder(),
builder_->ReleaseNetwork(), builder_->host_weights());
}
REGISTER_GRAPH_COMPILER(XrtEngine::TENSORRT, TrtGraphCompiler);
......
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include "cuda_runtime.h"
#include "oneflow/xrt/tensorrt/trt_int8_calibrator.h"
namespace oneflow {
namespace xrt {
namespace tensorrt {
void TRTInt8Calibrator::setBatchSize(const int batch_size) { // NOLINT
batch_size_ = batch_size;
}
// Set the batch size before constructing the thread that executes the engine.
int TRTInt8Calibrator::getBatchSize() const { // NOLINT
return batch_size_;
}
TRTInt8Calibrator::TRTInt8Calibrator() // NOLINT
: done_(false), calib_running_(true), batch_is_set_(false) {}
TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data)
: batch_size_(0),
done_(true),
calib_running_(false),
batch_is_set_(false),
calibration_table_(calib_data) {}
TRTInt8Calibrator::~TRTInt8Calibrator() { ReleaseDevBuffers(); }
void TRTInt8Calibrator::waitAndSetDone() {
std::unique_lock<std::mutex> lk(cond_mtx_);
while ((calib_running_ || batch_is_set_) && !done_) cond_.wait(lk);
if (!done_) {
done_ = true;
cond_.notify_all();
}
}
void* TRTInt8Calibrator::createDevBuffer(const size_t buffer_size) {
LOG(INFO) << "Alloc memory buffer which size is " << buffer_size;
void* dev_buffer = nullptr;
CHECK_EQ(cudaSuccess, cudaMalloc(&dev_buffer, buffer_size)) // NOLINT
<< "Failed to alloc " << buffer_size << " bytes for calibrator.";
CHECK(dev_buffer) << "Failed to alloc " << buffer_size // NOLINT
<< " bytes for calibrator.";
return dev_buffer;
}
void TRTInt8Calibrator::ReleaseDevBuffers() {
std::unique_lock<std::mutex> lk(cond_mtx_);
CHECK(done_) << "Calibrator could not release the device buffers "
<< "since it had not been done.";
for (auto it : dev_buffers_) { CHECK_EQ(cudaSuccess, cudaFree(it.second.first)); }
dev_buffers_.clear();
}
// There might be more than one input to the TensorRT subgraph,
// so we use a map to store the input information.
bool TRTInt8Calibrator::setBatch(const std::vector<const Parameter*>& params) {
std::unique_lock<std::mutex> lk(cond_mtx_);
// There is a producer and a consumer. The producer sets the batch data and
// the consumer gets the batch data. The size of the data pool is one,
// so the producer has to wait for the consumer to finish processing before
// it can set new data.
while ((calib_running_ || batch_is_set_) && (!done_)) cond_.wait(lk);
// done_ is set to true by waitAndSetDone() once all calibration data
// has been processed.
if (done_) return false;
// Sets the batch.
for (const auto& it : params) {
auto dataptr = dev_buffers_.find(it->name());
if (dataptr == dev_buffers_.end()) {
void* buffer = createDevBuffer(it->byte_size());
dataptr = dev_buffers_
.emplace(it->name(), // NOLINT
std::make_pair(buffer, it->byte_size()))
.first;
// dataptr = dev_buffers_.emplace(it->name(), std::make_pair(it->data(),
// it->byte_size())).first;
}
CHECK(dataptr != dev_buffers_.end()) // NOLINT
<< "Buffer '" << it->name() << "' does not exist.";
const auto& d = dataptr->second;
CHECK_EQ(cudaSuccess, // NOLINT
cudaMemcpy(d.first, it->data(), d.second, // NOLINT
cudaMemcpyDeviceToDevice)) // NOLINT
<< "Fail to cudaMemcpy for " << it->name();
}
batch_is_set_ = true;
cond_.notify_all();
return true;
}
bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, // NOLINT
int num_bindings) {
std::unique_lock<std::mutex> lk(cond_mtx_);
// Notify that the last round of calibration has finished.
calib_running_ = false;
cond_.notify_all();
// As long as there is data in the pool, the consumer can get it.
while (!batch_is_set_ && !done_) cond_.wait(lk);
if (done_) return false;
// Gets the batch
for (int i = 0; i < num_bindings; i++) {
auto it = dev_buffers_.find(names[i]);
if (it == dev_buffers_.end()) {
LOG(FATAL) << "Calibration engine asked for unknown tensor name '" // NOLINT
<< names[i] << "' at position " << i;
}
bindings[i] = it->second.first;
}
batch_is_set_ = false;
calib_running_ = true;
return true;
}
void TRTInt8Calibrator::setDone() {
std::unique_lock<std::mutex> lk(cond_mtx_);
done_ = true;
cond_.notify_all();
}
bool TRTInt8Calibrator::isDone() const {
std::unique_lock<std::mutex> lk(cond_mtx_);
return done_;
}
const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) {
if (calibration_table_.empty()) return nullptr;
length = calibration_table_.size();
return calibration_table_.data();
}
void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, // NOLINT
std::size_t length) {
calibration_table_ = std::string((const char*)ptr, length);
}
static std::unordered_map<std::string, // NOLINT
TRTInt8CalibratorResource*>
resources;
/*static*/ TRTInt8CalibratorResource* // NOLINT
TRTInt8CalibratorResource::LookupOrCreate(const std::string& name) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
auto it = resources.find(name);
if (it == resources.end()) {
it = resources.emplace(name, new TRTInt8CalibratorResource).first;
}
return it->second;
}
/*static*/ const std::unordered_map<std::string, // NOLINT
TRTInt8CalibratorResource*>&
TRTInt8CalibratorResource::All() {
return resources;
}
} // namespace tensorrt
} // namespace xrt
} // namespace oneflow
#ifndef ONEFLOW_XRT_TENSORRT_TRT_INT8_CALIBRATOR_H_
#define ONEFLOW_XRT_TENSORRT_TRT_INT8_CALIBRATOR_H_
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include "NvInfer.h"
#include "oneflow/xrt/parameter.h"
#include "oneflow/xrt/tensorrt/trt_unique_ptr.h"
namespace oneflow {
namespace xrt {
namespace tensorrt {
// Referred from TensorFlow.
class TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 {
// class TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
public:
// Construct a calibrator for future calibration.
TRTInt8Calibrator();
// Construct a finalized calibrator where we don't need to run calibration any
// more, as the calibration data is provided.
TRTInt8Calibrator(const std::string& calibration_data);
~TRTInt8Calibrator();
int getBatchSize() const override;
bool getBatch(void* bindings[], const char* names[], // NOLINT
int num_bindings) override;
void setBatchSize(const int batch_size);
// Feed calibration data to the calibrator, and return true if the data is
// accepted. Return false if the calibrator has been terminated.
bool setBatch(const std::vector<const Parameter*>& params);
// Wait until the last batch is consumed by the calibrator and set done.
void waitAndSetDone();
// Notify that calibration is done and future batches provided by setBatch()
// will be ignored.
void setDone();
bool isDone() const;
void* createDevBuffer(const size_t buffer_size);
void ReleaseDevBuffers();
// If the returned cache is not null, calibration is skipped.
const void* readCalibrationCache(std::size_t& length) override;
void writeCalibrationCache(const void* ptr, std::size_t length) override;
const std::string& getCalibrationTableAsString() { // NOLINT
return calibration_table_;
}
private:
int batch_size_;
// mutex for condition_variable
mutable std::mutex cond_mtx_;
// condition variable to implement producer-consumer queue for calibration
std::condition_variable cond_;
// Is calibration finished?
bool done_;
// Map to keep tensorrt input buffers and sizes keyed with buffer names
std::unordered_map<std::string, std::pair<void*, size_t>> dev_buffers_;
bool calib_running_;
bool batch_is_set_;
std::string calibration_table_;
};
struct TRTInt8CalibratorResource {
public:
static TRTInt8CalibratorResource* LookupOrCreate(const std::string& name);
static const std::unordered_map<std::string, // NOLINT
TRTInt8CalibratorResource*>&
All();
// Individual mutex
mutable std::mutex mutex_;
std::shared_ptr<TRTInt8Calibrator> calibrator_;
std::shared_ptr<std::thread> thread_;
nv::unique_ptr<nvinfer1::ICudaEngine> engine_;
};
} // namespace tensorrt
} // namespace xrt
} // namespace oneflow
#endif // ONEFLOW_XRT_TENSORRT_TRT_INT8_CALIBRATOR_H_
......@@ -21,7 +21,7 @@ void Logger::log(ILogger::Severity severity, const char* msg) {
break;
}
case ILogger::Severity::kERROR: {
LOG(ERROR) << name_ << ": " << msg;
LOG(FATAL) << name_ << ": " << msg;
break;
}
case ILogger::Severity::kINTERNAL_ERROR: {
......
......@@ -10,9 +10,10 @@ namespace mola {
class XlaExecutable : public Executable {
public:
XlaExecutable(const XrtDevice &device, const std::vector<xla::Shape> &input_shapes,
XlaExecutable(const std::string &name, const XrtDevice &device,
const std::vector<xla::Shape> &input_shapes,
const xla::Shape &output_shape, std::unique_ptr<xla::LocalExecutable> &&executable)
: Executable(XrtEngine::XLA),
: Executable(name, XrtEngine::XLA),
device_(device),
input_shapes_(input_shapes),
output_shape_(output_shape),
......
......@@ -124,7 +124,7 @@ std::shared_ptr<Executable> XlaGraphCompiler::BuildExecutable(
build_options.set_result_layout(xla_output_shape);
MOLA_CHECK_AND_ASSIGN(auto executable,
client->Compile(computation, argument_layouts, build_options));
return std::make_shared<XlaExecutable>(this->device_, xla_input_shapes, xla_output_shape,
return std::make_shared<XlaExecutable>(builder_->name(), this->device_, xla_input_shapes, xla_output_shape,
std::move(executable));
}
......