Unverified commit d4d84e60, authored by: Houjiang Chen, committed by: GitHub

xrt support TensorRT int8 (#2637)

* Add tensorrt int8 calibrator

* Generate calibration correctly.

* Refine xrt int8 and readme

* Update readme

* Add xrt int8 unittest

* merge develop

* leaky relu test

* function->global_function

* fix LookupOrCreate

* OF_CHECK->CHECK_OR_RETURN
Co-authored-by: guo-ran <360112263@qq.com>
Parent 9a3a541e
......@@ -45,11 +45,12 @@ message MemoryAllocationAlgorithmConf {
message XrtConfig {
message XlaConfig {
// TODO
// TODO(hjchen2)
}
message TensorRTConfig {
optional bool use_fp16 = 1 [default = false];
optional bool use_int8 = 2 [default = false];
optional string int8_calibration = 3;
}
optional bool use_xla_jit = 1 [default = false];
optional bool use_tensorrt = 2 [default = false];
......
from __future__ import absolute_import
import oneflow.oneflow_internal as oneflow_internal
from oneflow.python.oneflow_export import oneflow_export
@oneflow_export("tensorrt.write_int8_calibration")
def write_int8_calibration(path):
oneflow_internal.WriteInt8Calibration(path)
@oneflow_export("tensorrt.cache_int8_calibration")
def cache_int8_calibration():
oneflow_internal.CacheInt8Calibration()
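These two exports back the `flow.tensorrt.*` calls used in the README further down. A minimal usage sketch, assuming a TensorRT int8 job has already consumed a few calibration batches (the directory path is illustrative):

```python
import oneflow as flow

# Finalize the in-memory calibration table once the calibration batches are done.
flow.tensorrt.cache_int8_calibration()
# Persist one calibration table file per compiled subgraph into an existing directory.
flow.tensorrt.write_int8_calibration("./int8_calibration")
```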
......@@ -414,6 +414,12 @@ def set_tensorrt_use_int8(func_desc, value=True):
func_desc.job_config_proto.xrt_config.tensorrt_config.use_int8 = value
@oneflow_function_config("tensorrt.int8_calibration")
def set_tensorrt_int8_calibration(func_desc, value):
assert func_desc.job_config_proto.xrt_config.tensorrt_config.use_int8
func_desc.job_config_proto.xrt_config.tensorrt_config.int8_calibration = value
@oneflow_function_config("default_distribute_strategy")
def set_default_distribute_strategy(func_desc, value):
assert isinstance(value, distribute_ctx.DistributeStrategy)
......
......@@ -211,3 +211,11 @@ void OfBlob_CurMutTensorCopyShapeFrom(uint64_t of_blob_ptr, long* array, int siz
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
return of_blob->CurMutTensorCopyShapeFrom(array, size);
}
void CacheInt8Calibration(std::string* error_str) {
oneflow::CacheInt8Calibration().GetDataAndSerializedErrorProto(error_str);
}
void WriteInt8Calibration(const std::string& path, std::string* error_str) {
oneflow::WriteInt8Calibration(path).GetDataAndSerializedErrorProto(error_str);
}
......@@ -24,6 +24,10 @@
#include "oneflow/core/framework/op_registration.h"
#include "oneflow/core/persistence/tee_persistent_log_stream.h"
#ifdef WITH_TENSORRT
#include "oneflow/xrt/api.h"
#endif // WITH_TENSORRT
namespace oneflow {
Maybe<void> RegisterWatcherOnlyOnce(ForeignWatcher* watcher) {
......@@ -177,6 +181,25 @@ Maybe<std::string> GetSerializedMachineId2DeviceIdListOFRecord(
return PbMessage2TxtString(*JUST(ParseMachineAndDeviceIdList(parallel_conf)));
}
Maybe<void> CacheInt8Calibration() {
#ifdef WITH_TENSORRT
xrt::tensorrt::CacheInt8Calibration();
#else
CHECK_OR_RETURN(0) << "Please recompile with TensorRT.";
#endif // WITH_TENSORRT
return Maybe<void>::Ok();
}
Maybe<void> WriteInt8Calibration(const std::string& path) {
#ifdef WITH_TENSORRT
xrt::tensorrt::CacheInt8Calibration();
xrt::tensorrt::WriteInt8Calibration(path);
#else
CHECK_OR_RETURN(0) << "Please recompile with TensorRT.";
#endif // WITH_TENSORRT
return Maybe<void>::Ok();
}
Maybe<std::string> CheckAndCompleteUserOpConf(const std::string& op_conf_str) {
OperatorConf op_conf;
CHECK_OR_RETURN(TxtString2PbMessage(op_conf_str, &op_conf)) << "operator conf parse failed";
......
import unittest
import numpy as np
import oneflow as flow
config = flow.function_config()
def make_job(input_shape, alpha, dtype=flow.float32):
config.use_xla_jit(False)
config.use_tensorrt(False)
@flow.global_function(config)
def leaky_relu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
return flow.nn.leaky_relu(x, alpha=alpha)
return leaky_relu_job
def make_trt_job(input_shape, alpha, dtype=flow.float32):
config.use_xla_jit(False)
config.use_tensorrt(True)
@flow.global_function(config)
def trt_leaky_relu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
return flow.nn.leaky_relu(x, alpha=alpha)
return trt_leaky_relu_job
class TestLeakyRelu(unittest.TestCase):
def _test_body(self, x, alpha, dtype=np.float32):
f1 = make_job(x.shape, alpha, dtype=flow.float32)
f2 = make_trt_job(x.shape, alpha, dtype=flow.float32)
a = f1(x).get()
b = f2(x).get()
print("oneflow: ", a)
print("oneflow with tensorrt: ", b)
self.assertTrue(np.allclose(a.ndarray(), b.ndarray(), rtol=1e-03, atol=1e-05))
flow.clear_default_session()
def _test_ones_body(self, shape, alpha=0.1, dtype=np.float32):
x = np.ones(shape, dtype=dtype)
self._test_body(x, alpha, dtype=dtype)
def _test_random_body(self, shape, alpha=0.1, dtype=np.float32):
# np.random.random generates floats in the range [0, 1).
x = 100 * (np.random.random(shape).astype(dtype) - 0.5)
self._test_body(x, alpha, dtype=dtype)
def test_ones_input(self):
self._test_ones_body((1), alpha=0.1)
self._test_ones_body((1, 10), alpha=0.1)
self._test_ones_body((2, 10, 2), alpha=0.1)
self._test_ones_body((2, 5, 2, 2), alpha=0.1)
self._test_ones_body((1), alpha=0.33)
self._test_ones_body((1, 10), alpha=0.33)
self._test_ones_body((2, 10, 2), alpha=0.33)
self._test_ones_body((2, 5, 2, 2), alpha=0.33)
def test_random_input(self):
self._test_random_body((1), alpha=0.1)
self._test_random_body((1, 10), alpha=0.1)
self._test_random_body((2, 10, 2), alpha=0.1)
self._test_random_body((2, 5, 2, 2), alpha=0.1)
self._test_random_body((1), alpha=0.33)
self._test_random_body((1, 10), alpha=0.33)
self._test_random_body((2, 10, 2), alpha=0.33)
self._test_random_body((2, 5, 2, 2), alpha=0.33)
if __name__ == "__main__":
unittest.main()
import unittest
import numpy as np
import oneflow as flow
config = flow.function_config()
def make_trt_job(
x_shape,
w_shape,
kernel_size=None,
strides=None,
padding="valid",
data_format="NCHW",
dilation_rate=None,
dtype=flow.float32,
):
config.use_xla_jit(False)
config.use_tensorrt(True)
config.tensorrt.use_int8()
@flow.global_function(config)
def trt_conv2d_job(
x=flow.FixedTensorDef(x_shape, dtype=dtype),
weight=flow.FixedTensorDef(w_shape, dtype=dtype),
):
return flow.nn.conv2d(x, weight, strides, padding, data_format, dilation_rate)
return trt_conv2d_job
class TestConv2d(unittest.TestCase):
def make_filter_shape(self, shape, filters, kernel_size, data_format):
if data_format == "NCHW":
return [filters, shape[1], kernel_size, kernel_size]
else:
return [filters, kernel_size, kernel_size, shape[3]]
def _test_body(
self,
x,
filters,
kernel_size,
strides,
padding,
data_format,
dilation_rate,
dtype=np.float32,
):
f2 = make_trt_job(
x.shape,
filters.shape,
kernel_size,
strides,
padding,
data_format,
dilation_rate,
dtype=flow.float32,
)
for i in range(1):
b = f2(x, filters).get()
print("with tensorrt float32: ", b)
flow.tensorrt.cache_int8_calibration()
for i in range(1):
b = f2(x, filters).get()
print("with tensorrt int8: ", b)
flow.clear_default_session()
def _test_ones_body(
self,
shape,
filters,
kernel_size,
strides,
padding,
data_format,
dilation_rate,
dtype=np.float32,
):
assert len(shape) == 4
x = np.ones(shape, dtype=dtype)
w_shape = self.make_filter_shape(shape, filters, kernel_size, data_format)
weight = np.random.random(w_shape).astype(dtype)
self._test_body(
x,
weight,
kernel_size=kernel_size,
strides=strides,
padding=padding,
data_format=data_format,
dilation_rate=dilation_rate,
)
def _test_random_body(
self,
shape,
filters,
kernel_size,
strides,
padding,
data_format,
dilation_rate,
dtype=np.float32,
):
assert len(shape) == 4
x = np.random.random(shape).astype(dtype)
w_shape = self.make_filter_shape(shape, filters, kernel_size, data_format)
weight = np.random.random(w_shape).astype(dtype)
self._test_body(
x,
weight,
kernel_size=kernel_size,
strides=strides,
padding=padding,
data_format=data_format,
dilation_rate=dilation_rate,
)
def test_random_kernel_1x1(self):
self._test_random_body(
shape=[3, 3, 5, 5],
filters=1,
kernel_size=1,
strides=1,
padding="VALID",
data_format="NCHW",
dilation_rate=1,
)
if __name__ == "__main__":
unittest.main()
......@@ -119,8 +119,64 @@ XRT in OneFlow is disabled by default and can be enabled through the frontend Python interface and
# TensorRT float16
config.tensorrt.use_fp16()
# TensorRT int8 (not supported yet)
# TensorRT int8 (loading the calibration table offline)
config.tensorrt.use_int8()
# Set int8 calibration table path
int8_calibration_path = "./int8_calibration"
config.tensorrt.int8_calibration(int8_calibration_path)
```
#### Using Int8 quantized computation
XRT supports two ways to enable Int8 quantized computation: loading a calibration table offline, or generating one online. The offline approach requires a calibration table in TensorRT format to be generated in advance, and that table can usually be reused. The online approach performs normal-precision computation and calibration-table generation in the same script; once the table has been generated, computation automatically switches to Int8 precision in the next iteration.
- Generating the Int8 calibration table
First, prepare a calibration dataset for generating the calibration table, usually a subset of the training or validation set. Then configure the network as usual and enable TensorRT Int8. For example:
```python
import oneflow as flow
config = flow.function_config()
config.use_tensorrt()
config.tensorrt.use_int8()
@flow.function(config)
def Job(input):
# define your network
pass
```
When Int8 is enabled but no calibration table is specified, XRT automatically enters calibration-table generation mode. All data fed afterwards is computed at normal precision (fp32 or fp16), and the results are used to generate the corresponding Int8 calibration table. Finally, the generated tables are saved to the specified directory, where each subgraph produces its own calibration table file.
```python
# Generate the Int8 calibration table from 10 batches of data
for _ in range(10):
    input = next_calibration_batch()  # load the calibration dataset
    Job(input).get()
# Save the calibration table
flow.tensorrt.write_int8_calibration("./int8_calibration")  # the int8_calibration directory must be created manually
```
Once the Int8 calibration table has been generated, you can start TensorRT Int8 quantized computation using the offline calibration-loading approach described above.
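As a minimal sketch of that offline-loading setup (reusing the Job definition and the illustrative next_batch loader from these examples):

```python
import oneflow as flow

config = flow.function_config()
config.use_tensorrt()
config.tensorrt.use_int8()
# Load the previously written calibration table (path is illustrative).
config.tensorrt.int8_calibration("./int8_calibration")

@flow.function(config)
def Job(input):
    # define your network
    pass

for _ in range(100):
    input = next_batch()  # placeholder data loader
    Job(input).get()
```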
- Generating the calibration table online and running int8 computation
The online approach has two steps: first, generate the calibration table from the calibration dataset; then build and run the Int8 graph directly with the generated table. Again using the Job above as an example:
```python
# Generate the Int8 calibration table from 10 batches of data
for _ in range(10):
    input = next_calibration_batch()  # load the calibration dataset
    Job(input).get()
# Cache the calibration table
flow.tensorrt.cache_int8_calibration()
# Once the calibration table has been cached, XRT automatically switches to int8 computation
for _ in range(100):
    input = next_batch()  # load data
    Job(input).get()
```
### Benchmark
......
#include "oneflow/xrt/api.h"
#include "absl/strings/str_cat.h"
#include "glog/logging.h"
#include "oneflow/core/operator/operator.h" // GenLogicalBlobName, GenLogicalBlobId
#include "oneflow/xrt/build_graph.h"
#include "oneflow/xrt/utility/env.h"
#include <fstream>
#include <mutex>
#ifdef WITH_TENSORRT
#include "oneflow/xrt/tensorrt/trt_int8_calibrator.h"
#endif // WITH_TENSORRT
DEFINE_int32(clustering_minimum_nodes, EnvToInt(FLAGS_clustering_minimum_nodes, 1),
"Minium nodes of a cluster after clustering.");
DEFINE_int32(clustering_maximum_nodes, EnvToInt(FLAGS_clustering_maximum_nodes, 1000),
......@@ -24,6 +32,11 @@ DEFINE_bool(tensorrt_fp16, EnvToBool(FLAGS_tensorrt_fp16, false),
DEFINE_bool(tensorrt_int8, EnvToBool(FLAGS_tensorrt_int8, false),
"Enable int8 precision for TENSORRT engine.");
DEFINE_string(int8_calibration, EnvToString(FLAGS_int8_calibration, ""),
"TensorRT int8 calibration table directory. "
"Default is empty, and this means the calibration table will be "
"implictly generated if tensorrt_int8 flag is true.");
namespace oneflow {
namespace xrt {
......@@ -73,6 +86,7 @@ static std::unordered_map<std::string, std::string> user_op_type_name2string_map
{"layer_norm_grad", "LayerNormGrad"},
{"scalar_add", "ScalarAdd"},
{"scalar_mul", "ScalarMul"},
{"leaky_relu", "LeakyRelu"},
};
std::string ExtractOpTypeAsString(const OperatorConf &conf) {
......@@ -159,7 +173,12 @@ void InitXrtConfigurations(const XrtConfig &config) {
if (config.has_tensorrt_config()) {
const XrtConfig::TensorRTConfig &trt_config = config.tensorrt_config();
if (trt_config.has_use_fp16()) { FLAGS_tensorrt_fp16 = trt_config.use_fp16(); }
if (trt_config.has_use_int8()) { FLAGS_tensorrt_int8 = trt_config.use_int8(); }
if (trt_config.has_use_int8()) {
FLAGS_tensorrt_int8 = trt_config.use_int8();
if (trt_config.has_int8_calibration()) {
FLAGS_int8_calibration = trt_config.int8_calibration();
}
}
}
}
......@@ -193,5 +212,41 @@ void RunCompilationTimeXrtPasses(const OpGraph &op_graph, Job *job, bool train_p
RunXrtPass("RebuildCompiledJob", graph.get(), options, job);
}
#ifdef WITH_TENSORRT
namespace tensorrt {
void CacheInt8Calibration() {
const auto &calib_resources = TRTInt8CalibratorResource::All();
for (const auto &res : calib_resources) {
std::lock_guard<std::mutex> lock(res.second->mutex_);
if (!res.second->calibrator_->isDone()) {
res.second->calibrator_->waitAndSetDone();
res.second->thread_->join();
}
res.second->calibrator_->ReleaseDevBuffers();
}
}
void WriteInt8Calibration(const std::string &path) {
const auto &calib_resources = TRTInt8CalibratorResource::All();
for (const auto &res : calib_resources) {
CHECK(res.second->calibrator_->isDone()) // NOLINT
<< "Calibration table maybe has not been generated "
<< "since the calibrator has not been done.";
const std::string &calibration_table_data =
res.second->calibrator_->getCalibrationTableAsString();
CHECK(calibration_table_data.size()) << "Calibration table data is empty.";
std::string calib_store_path = // NOLINT
absl::StrCat(path, "/", res.first /*calibrator name*/);
std::ofstream ofile(calib_store_path, std::ios::out);
CHECK(ofile.good()) << "Could not open calibration file: " << calib_store_path;
ofile << calibration_table_data;
ofile.close();
}
}
} // namespace tensorrt
#endif // WITH_TENSORRT
} // namespace xrt
} // namespace oneflow
......@@ -66,6 +66,13 @@ inline void RunXrtPass(const std::string &pass, XrtGraph *graph, const XrtPassOp
void RunCompilationTimeXrtPasses(const OpGraph &op_graph, Job *job, bool train_phase);
#ifdef WITH_TENSORRT
namespace tensorrt {
void CacheInt8Calibration();
void WriteInt8Calibration(const std::string &path);
} // namespace tensorrt
#endif // WITH_TENSORRT
} // namespace xrt
} // namespace oneflow
......
......@@ -35,6 +35,8 @@ struct ExecutableRunOptions {
// Enable TensorRT int8
bool tensorrt_int8 = false;
std::string tensorrt_int8_calibration = "";
// Feed the return parameters to reuse their storage while running
// the executable.
std::vector<Parameter> return_params;
......@@ -42,21 +44,29 @@ struct ExecutableRunOptions {
class Executable {
public:
Executable(const XrtEngine &engine) : engine_(engine) {}
Executable(const std::string &name, const XrtEngine &engine) // NOLINT
: name_(name), engine_(engine) {}
virtual ~Executable() = default;
const XrtEngine &engine() const { return engine_; }
virtual bool Run(const std::vector<Parameter> &inputs, const ExecutableRunOptions &run_options,
const std::string &name() const { return name_; }
virtual bool Run(const std::vector<Parameter> &inputs, // NOLINT
const ExecutableRunOptions &run_options, // NOLINT
bool block_until_done = true) = 0;
bool RunAsync(const std::vector<Parameter> inputs, const ExecutableRunOptions &run_options) {
bool RunAsync(const std::vector<Parameter> inputs, // NOLINT
const ExecutableRunOptions &run_options) {
return Run(inputs, run_options, false);
}
const std::vector<Parameter> &Results() const { return results_; }
protected:
// Executable name.
std::string name_;
// Executable engine, XLA or TensorRT.
XrtEngine engine_;
std::vector<Parameter> results_;
};
......
......@@ -15,6 +15,7 @@ DEFINE_int32(max_batch_size, EnvToInt(FLAGS_max_batch_size, 1),
DECLARE_bool(tensorrt_fp16);
DECLARE_bool(tensorrt_int8);
DECLARE_string(int8_calibration);
namespace oneflow {
namespace xrt {
......@@ -165,6 +166,8 @@ void XrtLaunchKernel<device_type>::ForwardDataContent(
CHECK_EQ(device_type, DeviceType::kGPU);
run_options.max_batch_size = FLAGS_max_batch_size;
run_options.tensorrt_fp16 = FLAGS_tensorrt_fp16;
run_options.tensorrt_int8 = FLAGS_tensorrt_int8;
run_options.tensorrt_int8_calibration = FLAGS_int8_calibration;
}
bool status = executable->Run(entry_params, run_options, block_until_done);
CHECK(status) << "Executable is running failed.";
......
......@@ -32,6 +32,25 @@ int GetDeviceId(const XrtDevice &device) {
return 0; // Compiler warning free
}
void SetDeviceId(const XrtDevice &device, const int device_id) {
switch (device) {
case XrtDevice::CPU_X86: return;
case XrtDevice::GPU_CUDA: {
#ifdef WITH_CUDA
CHECK_EQ(cudaSuccess, cudaSetDevice(device_id));
return;
#endif
}
case XrtDevice::GPU_CL:
// TODO(hjchen2)
case XrtDevice::CPU_ARM:
// TODO(hjchen2)
case XrtDevice::GPU_ARM:
// TODO(hjchen2)
return;
}
}
} // namespace platform
} // namespace xrt
......
......@@ -10,6 +10,8 @@ namespace platform {
int GetDeviceId(const XrtDevice &device);
void SetDeviceId(const XrtDevice &device, const int device_id);
} // namespace platform
} // namespace xrt
......
......@@ -28,6 +28,22 @@ REGISTER_TRT_OP_KERNEL(Sigmoid, ActivationOp<nvinfer1::ActivationType::kSIGMOID>
.EnableTrainPhase()
.Finalize();
template<>
class ActivationOp<nvinfer1::ActivationType::kLEAKY_RELU> : public TrtOpKernel {
public:
void Compile(TrtOpContext *ctx) override {
nvinfer1::ITensor *in = ctx->SoleInput();
auto *layer = ctx->builder()->addActivation(*in, nvinfer1::ActivationType::kLEAKY_RELU);
layer->setAlpha(ctx->Attr<float>("alpha"));
layer->setName(ctx->op_name().c_str());
ctx->SetSoleOutput(layer->getOutput(0));
}
};
REGISTER_TRT_OP_KERNEL(LeakyRelu, ActivationOp<nvinfer1::ActivationType::kLEAKY_RELU>)
.EnableTrainPhase()
.Finalize();
} // namespace tensorrt
} // namespace xrt
} // namespace oneflow
......@@ -65,7 +65,7 @@ class TrtBuilder {
std::string builder_name_;
// The next new handle number.
int64_t next_handle_ = 0;
int64_t next_handle_ = -1;
nv::unique_ptr<nvinfer1::IBuilder> builder_;
nv::unique_ptr<nvinfer1::INetworkDefinition> network_;
......@@ -78,7 +78,7 @@ class TrtBuilder {
util::Map<std::string, std::shared_ptr<std::vector<uint8_t>>> host_weights_;
public:
explicit TrtBuilder(const std::string &name) : builder_name_(name), next_handle_(0) {
explicit TrtBuilder(const std::string &name) : builder_name_(name), next_handle_(-1) {
static nv::Logger logger;
builder_.reset(nvinfer1::createInferBuilder(logger));
nvinfer1::NetworkDefinitionCreationFlags flags =
......@@ -87,6 +87,8 @@ class TrtBuilder {
network_.reset(builder_->createNetworkV2(flags));
}
const std::string &name() const { return builder_name_; }
nvinfer1::ITensor *GetTensor(int64_t handle);
nvinfer1::Weights &GetWeight(int64_t handle);
......@@ -138,7 +140,7 @@ class TrtBuilder {
CHECK_GT(params_.count(handle), 0) << "Parameter is not found for handle " << handle;
}
int64_t IncreaseHandle() { return next_handle_++; }
int64_t IncreaseHandle() { return ++next_handle_; }
};
} // namespace tensorrt
......
#include "cuda_runtime.h"
#include "oneflow/xrt/tensorrt/trt_executable.h"
#include "oneflow/xrt/tensorrt/trt_int8_calibrator.h"
#include "oneflow/xrt/platform.h"
#include <iostream>
#include <sstream>
#include "cuda_runtime.h"
#include "absl/strings/str_cat.h"
namespace oneflow {
namespace xrt {
namespace tensorrt {
bool TrtExecutable::CreateExecutableEngine(const ExecutableRunOptions &run_options,
const int batch_size) {
if (!builder_ || !network_) { return false; }
auto build_config = nv::unique_ptr<nvinfer1::IBuilderConfig>(builder_->createBuilderConfig());
int64_t max_workspace_size = 1U << 24; // 16MiB
if (run_options.device_memory_limit > 0) { max_workspace_size = run_options.device_memory_limit; }
nvinfer1::ICudaEngine *TrtExecutable::CreateExecutableEngine(
const ExecutableRunOptions &run_options, const int batch_size /*= 1*/,
TRTInt8Calibrator *calibrator /*= nullptr*/) {
CHECK(builder_ && network_) << "Builder and network should be setup before.";
auto build_config = // NOLINT
nv::unique_ptr<nvinfer1::IBuilderConfig>(builder_->createBuilderConfig());
int64_t max_workspace_size = 1U << 24; // 16MiB
if (run_options.device_memory_limit > 0) { // NOLINT
max_workspace_size = run_options.device_memory_limit;
}
build_config->setMaxWorkspaceSize(max_workspace_size);
nvinfer1::BuilderFlags flags = 0U;
if (run_options.tensorrt_fp16) {
if (builder_->platformHasFastFp16()) {
flags |= (1U << int(nvinfer1::BuilderFlag::kFP16));
// Setting the kFP16 flag alone does not guarantee half precision,
// but kSTRICT_TYPES can be set to force it.
// flags |= (1U << int(nvinfer1::BuilderFlag::kSTRICT_TYPES));
} else {
LOG(INFO) << "TensorRT couldn't use fp16 precision since the GPU "
"hardware does not support.";
}
}
// flags |= (1U << int(nvinfer1::BuilderFlag::kINT8));
if (run_options.tensorrt_int8) {
if (builder_->platformHasFastInt8()) {
if (calibrator) {
flags |= (1U << int(nvinfer1::BuilderFlag::kINT8));
if (builder_->platformHasFastFp16()) { // NOLINT
flags |= (1U << int(nvinfer1::BuilderFlag::kFP16));
}
build_config->setInt8Calibrator(calibrator);
}
} else {
LOG(INFO) << "TensorRT couldn't use int8 precision since the GPU "
"hardware does not support.";
}
}
// Setting the kFP16 or kINT8 flag alone does not guarantee low precision,
// but kSTRICT_TYPES can be set to enforce half or int8 precision.
// flags |= (1U << int(nvinfer1::BuilderFlag::kSTRICT_TYPES));
// flags |= (1U << int(nvinfer1::BuilderFlag::kREFIT));
build_config->setFlags(flags);
// build_config->setInt8Calibrator();
int32_t max_batch_size = std::max(run_options.max_batch_size, batch_size);
builder_->setMaxBatchSize(max_batch_size);
// builder_->setGpuAllocator();
engine_.reset(builder_->buildEngineWithConfig(*network_, *build_config));
return true;
return builder_->buildEngineWithConfig(*network_, *build_config);
}
bool TrtExecutable::ExecuteEngine(int batch_size, void **buffers, void *stream,
bool block_until_done) {
if (!execution_context_) { execution_context_.reset(engine_->createExecutionContext()); }
if (!execution_context_) { // NOLINT
execution_context_.reset(engine_->createExecutionContext());
}
cudaStream_t cu_stream = reinterpret_cast<cudaStream_t>(stream);
bool status =
// execution_context_->enqueue(batch_size, buffers, cu_stream, nullptr);
execution_context_->enqueueV2(buffers, cu_stream, nullptr);
if (block_until_done) { CHECK_EQ(cudaSuccess, cudaStreamSynchronize(cu_stream)); }
if (block_until_done) { // NOLINT
CHECK_EQ(cudaSuccess, cudaStreamSynchronize(cu_stream));
}
return status;
}
std::string TrtExecutable::LoadCalibrationTable( // NOLINT
const std::string &calibration_path) {
std::string calib_restore_path(absl::StrCat(calibration_path, "/", this->name()));
std::ifstream infile(calib_restore_path, std::ios::in);
CHECK(infile.good()) << "Could not open calibration file: " // NOLINT
<< calib_restore_path;
std::stringstream buffer;
buffer << infile.rdbuf();
return buffer.str();
}
bool TrtExecutable::Run(const std::vector<Parameter> &inputs,
const ExecutableRunOptions &run_options, bool block_until_done) {
// TODO(hjchen2)
const ExecutableRunOptions &run_options, // NOLINT
bool block_until_done) {
// TODO(hjchen2): Refactor
if (run_options.tensorrt_int8 && !calibrator_ && // NOLINT
run_options.tensorrt_int8_calibration.size()) {
std::string calibration_data = // NOLINT
LoadCalibrationTable(run_options.tensorrt_int8_calibration);
CHECK(calibration_data.size()) << "Calibration data is empty.";
calibrator_.reset(new TRTInt8Calibrator(calibration_data));
}
if (!execution_context_ && !engine_) {
CHECK(CreateExecutableEngine(run_options)) << "Cannot create TensorRT executable engine.";
engine_.reset(CreateExecutableEngine(run_options, 1 /*batch size*/, // NOLINT
calibrator_.get()));
CHECK(engine_) << "Cannot create TensorRT executable engine.";
}
// All return params are results of the executable.
// All return params are the results of the executable.
this->results_ = run_options.return_params;
// TODO(hjchen2): Cache the parameters raw address.
util::Map<std::string, const Parameter *> all_params;
for (const Parameter &input : inputs) { all_params.emplace(input.name(), &input); }
for (const Parameter &output : this->results_) { all_params.emplace(output.name(), &output); }
for (const Parameter &input : inputs) { // NOLINT
all_params.emplace(input.name(), &input); // NOLINT
}
for (const Parameter &output : this->results_) { // NOLINT
all_params.emplace(output.name(), &output); // NOLINT
}
const int num_bindings = engine_->getNbBindings();
std::vector<const Parameter *> binding_params(num_bindings);
......@@ -75,13 +128,47 @@ bool TrtExecutable::Run(const std::vector<Parameter> &inputs,
// TODO(hjchen2): Check that the batch size is the same for all binding parameters.
const int batch_size = binding_params[0]->shape().At(0);
if (batch_size > engine_->getMaxBatchSize()) {
LOG(WARNING) << "Rebuild engine since the maximum batch size " << engine_->getMaxBatchSize()
LOG(WARNING) << "Rebuild engine since the maximum batch size " // NOLINT
<< engine_->getMaxBatchSize() // NOLINT
<< " is less than the input batch size " << batch_size;
CHECK(CreateExecutableEngine(run_options, batch_size))
<< "Failed to create engine with batch size " << batch_size;
engine_.reset(CreateExecutableEngine(run_options, batch_size, // NOLINT
calibrator_.get()));
CHECK(engine_) << "Failed to create engine with batch size " << batch_size;
execution_context_.reset(engine_->createExecutionContext());
}
return ExecuteEngine(batch_size, buffers.data(), run_options.stream, block_until_done);
if (run_options.tensorrt_int8 && !calibrator_) {
auto *res = TRTInt8CalibratorResource::LookupOrCreate(this->name());
{
std::lock_guard<std::mutex> lock(res->mutex_);
if (!res->calibrator_) {
res->calibrator_.reset(new TRTInt8Calibrator());
int ordinal = platform::GetDeviceId(XrtDevice::GPU_CUDA);
res->thread_.reset(new std::thread([this, ordinal, batch_size, res, // NOLINT
run_options]() {
platform::SetDeviceId(XrtDevice::GPU_CUDA, ordinal);
// TODO(hjchen2): TensorRT may crash if the calibrator batch size > 1
res->calibrator_->setBatchSize(1 /*batch_size*/);
res->engine_.reset( // NOLINT
this->CreateExecutableEngine(run_options, batch_size, // NOLINT
res->calibrator_.get()));
}));
}
}
if (res->calibrator_->isDone()) {
CHECK_EQ(cudaSuccess, cudaStreamSynchronize( // NOLINT
reinterpret_cast<cudaStream_t>(run_options.stream))); // NOLINT
calibrator_ = res->calibrator_;
// engine_ = std::move(res->engine_);
execution_context_.reset(res->engine_->createExecutionContext());
} else {
res->calibrator_->setBatch(binding_params);
}
}
return ExecuteEngine(batch_size, buffers.data(), run_options.stream, // NOLINT
block_until_done);
}
} // namespace tensorrt
......
......@@ -7,6 +7,7 @@
#include "oneflow/xrt/executable.h"
#include "oneflow/xrt/parameter.h"
#include "oneflow/xrt/tensorrt/trt_unique_ptr.h"
#include "oneflow/xrt/tensorrt/trt_int8_calibrator.h"
#include "oneflow/xrt/utility/stl.h"
namespace oneflow {
......@@ -17,15 +18,17 @@ namespace tensorrt {
class TrtExecutable : public Executable {
public:
explicit TrtExecutable(
nv::unique_ptr<nvinfer1::ICudaEngine> &&engine,
const std::string &name, nv::unique_ptr<nvinfer1::ICudaEngine> &&engine,
const util::Map<std::string, std::shared_ptr<std::vector<uint8_t>>> &host_weights)
: Executable(XrtEngine::TENSORRT), engine_(std::move(engine)), host_weights_(host_weights) {}
: Executable(name, XrtEngine::TENSORRT),
engine_(std::move(engine)), // NOLINT
host_weights_(host_weights) {}
explicit TrtExecutable(
nv::unique_ptr<nvinfer1::IBuilder> &&builder,
const std::string &name, nv::unique_ptr<nvinfer1::IBuilder> &&builder,
nv::unique_ptr<nvinfer1::INetworkDefinition> &&network,
const util::Map<std::string, std::shared_ptr<std::vector<uint8_t>>> &host_weights)
: Executable(XrtEngine::TENSORRT),
: Executable(name, XrtEngine::TENSORRT),
builder_(std::move(builder)),
network_(std::move(network)),
host_weights_(host_weights) {}
......@@ -36,16 +39,22 @@ class TrtExecutable : public Executable {
bool block_until_done = true) override;
private:
bool CreateExecutableEngine(const ExecutableRunOptions &run_options, const int batch_size = -1);
nvinfer1::ICudaEngine *CreateExecutableEngine(const ExecutableRunOptions &run_options,
const int batch_size = 1,
TRTInt8Calibrator *calibrator = nullptr);
bool ExecuteEngine(const int batch_size, void **buffers, void *stream, bool block_until_done);
std::string LoadCalibrationTable(const std::string &calibration_path);
private:
nv::unique_ptr<nvinfer1::ICudaEngine> engine_;
nv::unique_ptr<nvinfer1::IBuilder> builder_;
nv::unique_ptr<nvinfer1::INetworkDefinition> network_;
nv::unique_ptr<nvinfer1::IExecutionContext> execution_context_;
std::shared_ptr<TRTInt8Calibrator> calibrator_;
util::Map<std::string, std::shared_ptr<std::vector<uint8_t>>> host_weights_;
};
......
......@@ -81,9 +81,9 @@ std::shared_ptr<Executable> TrtGraphCompiler::Compile(
builder_->MarkOutput(value.handle());
}
// return std::make_shared<TrtExecutable>(builder_->BuildCudaEngine());
return std::make_shared<TrtExecutable>(builder_->ReleaseBuilder(), builder_->ReleaseNetwork(),
builder_->host_weights());
// return std::make_shared<TrtExecutable>(builder_->name(), builder_->BuildCudaEngine());
return std::make_shared<TrtExecutable>(builder_->name(), builder_->ReleaseBuilder(),
builder_->ReleaseNetwork(), builder_->host_weights());
}
REGISTER_GRAPH_COMPILER(XrtEngine::TENSORRT, TrtGraphCompiler);
......
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include "cuda_runtime.h"
#include "oneflow/xrt/tensorrt/trt_int8_calibrator.h"
namespace oneflow {
namespace xrt {
namespace tensorrt {
void TRTInt8Calibrator::setBatchSize(const int batch_size) { // NOLINT
batch_size_ = batch_size;
}
// Set the batch size before constructing the thread that executes the engine.
int TRTInt8Calibrator::getBatchSize() const { // NOLINT
return batch_size_;
}
TRTInt8Calibrator::TRTInt8Calibrator() // NOLINT
: done_(false), calib_running_(true), batch_is_set_(false) {}
TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data)
: batch_size_(0),
done_(true),
calib_running_(false),
batch_is_set_(false),
calibration_table_(calib_data) {}
TRTInt8Calibrator::~TRTInt8Calibrator() { ReleaseDevBuffers(); }
void TRTInt8Calibrator::waitAndSetDone() {
std::unique_lock<std::mutex> lk(cond_mtx_);
while ((calib_running_ || batch_is_set_) && !done_) cond_.wait(lk);
if (!done_) {
done_ = true;
cond_.notify_all();
}
}
void* TRTInt8Calibrator::createDevBuffer(const size_t buffer_size) {
LOG(INFO) << "Alloc memory buffer which size is " << buffer_size;
void* dev_buffer = nullptr;
CHECK_EQ(cudaSuccess, cudaMalloc(&dev_buffer, buffer_size)) // NOLINT
<< "Failed to alloc " << buffer_size << " bytes for calibrator.";
CHECK(dev_buffer) << "Failed to alloc " << buffer_size // NOLINT
<< " bytes for calibrator.";
return dev_buffer;
}
void TRTInt8Calibrator::ReleaseDevBuffers() {
std::unique_lock<std::mutex> lk(cond_mtx_);
CHECK(done_) << "Calibrator could not release the device buffers "
<< "since it had not been done.";
for (auto it : dev_buffers_) { CHECK_EQ(cudaSuccess, cudaFree(it.second.first)); }
dev_buffers_.clear();
}
// There might be more than one input to the TensorRT subgraph,
// so we use a map to store the input information.
bool TRTInt8Calibrator::setBatch(const std::vector<const Parameter*>& params) {
std::unique_lock<std::mutex> lk(cond_mtx_);
// There is a producer and a consumer. The producer sets the batch data and
// the consumer gets the batch data. The size of the data pool is one,
// so the producer has to wait for the consumer to finish processing before
// it can set new data.
while ((calib_running_ || batch_is_set_) && (!done_)) cond_.wait(lk);
// done_ is set to true by waitAndSetDone() once all calibration data
// has been processed.
if (done_) return false;
// Sets the batch.
for (const auto& it : params) {
auto dataptr = dev_buffers_.find(it->name());
if (dataptr == dev_buffers_.end()) {
void* buffer = createDevBuffer(it->byte_size());
dataptr = dev_buffers_
.emplace(it->name(), // NOLINT
std::make_pair(buffer, it->byte_size()))
.first;
// dataptr = dev_buffers_.emplace(it->name(), std::make_pair(it->data(),
// it->byte_size())).first;
}
CHECK(dataptr != dev_buffers_.end()) // NOLINT
<< "Buffer '" << it->name() << "' does not exist.";
const auto& d = dataptr->second;
CHECK_EQ(cudaSuccess, // NOLINT
cudaMemcpy(d.first, it->data(), d.second, // NOLINT
cudaMemcpyDeviceToDevice)) // NOLINT
<< "Fail to cudaMemcpy for " << it->name();
}
batch_is_set_ = true;
cond_.notify_all();
return true;
}
bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, // NOLINT
int num_bindings) {
std::unique_lock<std::mutex> lk(cond_mtx_);
// Notify that the last round of calibration has finished.
calib_running_ = false;
cond_.notify_all();
// As long as there is data in the pool, the consumer can get it.
while (!batch_is_set_ && !done_) cond_.wait(lk);
if (done_) return false;
// Gets the batch
for (int i = 0; i < num_bindings; i++) {
auto it = dev_buffers_.find(names[i]);
if (it == dev_buffers_.end()) {
LOG(FATAL) << "Calibration engine asked for unknown tensor name '" // NOLINT
<< names[i] << "' at position " << i;
}
bindings[i] = it->second.first;
}
batch_is_set_ = false;
calib_running_ = true;
return true;
}
void TRTInt8Calibrator::setDone() {
std::unique_lock<std::mutex> lk(cond_mtx_);
done_ = true;
cond_.notify_all();
}
bool TRTInt8Calibrator::isDone() const {
std::unique_lock<std::mutex> lk(cond_mtx_);
return done_;
}
const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) {
if (calibration_table_.empty()) return nullptr;
length = calibration_table_.size();
return calibration_table_.data();
}
void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, // NOLINT
std::size_t length) {
calibration_table_ = std::string((const char*)ptr, length);
}
static std::unordered_map<std::string, // NOLINT
TRTInt8CalibratorResource*>
resources;
/*static*/ TRTInt8CalibratorResource* // NOLINT
TRTInt8CalibratorResource::LookupOrCreate(const std::string& name) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
auto it = resources.find(name);
if (it == resources.end()) {
it = resources.emplace(name, new TRTInt8CalibratorResource).first;
}
return it->second;
}
/*static*/ const std::unordered_map<std::string, // NOLINT
TRTInt8CalibratorResource*>&
TRTInt8CalibratorResource::All() {
return resources;
}
} // namespace tensorrt
} // namespace xrt
} // namespace oneflow
#ifndef ONEFLOW_XRT_TENSORRT_TRT_INT8_CALIBRATOR_H_
#define ONEFLOW_XRT_TENSORRT_TRT_INT8_CALIBRATOR_H_
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include "NvInfer.h"
#include "oneflow/xrt/parameter.h"
#include "oneflow/xrt/tensorrt/trt_unique_ptr.h"
namespace oneflow {
namespace xrt {
namespace tensorrt {
// Referred from TensorFlow.
class TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 {
// class TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
public:
// Construct a calibrator for future calibration.
TRTInt8Calibrator();
// Construct a finalized calibrator where we don't need to run calibration any
// more, as the calibration data is provided.
TRTInt8Calibrator(const std::string& calibration_data);
~TRTInt8Calibrator();
int getBatchSize() const override;
bool getBatch(void* bindings[], const char* names[], // NOLINT
int num_bindings) override;
void setBatchSize(const int batch_size);
// Feed calibration data to the calibrator, and return true if the data is
// accepted. Return false if the calibrator has been terminated.
bool setBatch(const std::vector<const Parameter*>& params);
// Wait until the last batch is consumed by the calibrator and set done.
void waitAndSetDone();
// Notify that calibration is done and future batches provided by setBatch()
// will be ignored.
void setDone();
bool isDone() const;
void* createDevBuffer(const size_t buffer_size);
void ReleaseDevBuffers();
// If the returned cache is not null, calibration is skipped.
const void* readCalibrationCache(std::size_t& length) override;
void writeCalibrationCache(const void* ptr, std::size_t length) override;
const std::string& getCalibrationTableAsString() { // NOLINT
return calibration_table_;
}
private:
int batch_size_;
// mutex for condition_variable
mutable std::mutex cond_mtx_;
// condition variable to implement producer-consumer queue for calibration
std::condition_variable cond_;
// Is calibration finished?
bool done_;
// Map to keep tensorrt input buffers and sizes keyed with buffer names
std::unordered_map<std::string, std::pair<void*, size_t>> dev_buffers_;
bool calib_running_;
bool batch_is_set_;
std::string calibration_table_;
};
struct TRTInt8CalibratorResource {
public:
static TRTInt8CalibratorResource* LookupOrCreate(const std::string& name);
static const std::unordered_map<std::string, // NOLINT
TRTInt8CalibratorResource*>&
All();
// Individual mutex
mutable std::mutex mutex_;
std::shared_ptr<TRTInt8Calibrator> calibrator_;
std::shared_ptr<std::thread> thread_;
nv::unique_ptr<nvinfer1::ICudaEngine> engine_;
};
} // namespace tensorrt
} // namespace xrt
} // namespace oneflow
#endif // ONEFLOW_XRT_TENSORRT_TRT_INT8_CALIBRATOR_H_
......@@ -21,7 +21,7 @@ void Logger::log(ILogger::Severity severity, const char* msg) {
break;
}
case ILogger::Severity::kERROR: {
LOG(ERROR) << name_ << ": " << msg;
LOG(FATAL) << name_ << ": " << msg;
break;
}
case ILogger::Severity::kINTERNAL_ERROR: {
......
......@@ -10,9 +10,10 @@ namespace mola {
class XlaExecutable : public Executable {
public:
XlaExecutable(const XrtDevice &device, const std::vector<xla::Shape> &input_shapes,
XlaExecutable(const std::string &name, const XrtDevice &device,
const std::vector<xla::Shape> &input_shapes,
const xla::Shape &output_shape, std::unique_ptr<xla::LocalExecutable> &&executable)
: Executable(XrtEngine::XLA),
: Executable(name, XrtEngine::XLA),
device_(device),
input_shapes_(input_shapes),
output_shape_(output_shape),
......
......@@ -124,7 +124,7 @@ std::shared_ptr<Executable> XlaGraphCompiler::BuildExecutable(
build_options.set_result_layout(xla_output_shape);
MOLA_CHECK_AND_ASSIGN(auto executable,
client->Compile(computation, argument_layouts, build_options));
return std::make_shared<XlaExecutable>(this->device_, xla_input_shapes, xla_output_shape,
return std::make_shared<XlaExecutable>(builder_->name(), this->device_, xla_input_shapes, xla_output_shape,
std::move(executable));
}
......