Commit bcbfbbd7 authored by: Megvii Engine Team

fix(mgb): fix TensorRT runtime opr profiling

GitOrigin-RevId: 3545aa53b2ee215e64d22c89e94171fadb6b11b0
Parent 702ed9ee
@@ -50,17 +50,6 @@ void TensorRTProfiler::print_layer_times() {
     printf("Total time: %4.3fms\n", total_time);
 }
-std::shared_ptr<json::Value> TensorRTProfiler::to_json() {
-    using namespace json;
-    auto prof_arr = Array::make();
-    for (auto&& rec : profile) {
-        auto&& item = Array::make();
-        item->add(String::make(rec.first));
-        item->add(Number::make(rec.second));
-        prof_arr->add(item);
-    }
-    return prof_arr;
-}
 #endif // MGB_ENABLE_JSON
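For reference, the profiling hook exercised here is TensorRT's nvinfer1::IProfiler interface: when a profiler is attached to the execution context, TensorRT calls reportLayerTime() once per layer after a synchronous execution. Below is a minimal standalone sketch in the spirit of the TensorRTProfiler above, not part of this patch; the class name SimpleTRTProfiler is illustrative, TensorRT headers are assumed to be on the include path, and the noexcept on reportLayerTime matches TensorRT 8 (older releases declare it without noexcept).

#include <NvInfer.h>

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Collects the (layer name, time in ms) pairs reported by TensorRT.
class SimpleTRTProfiler final : public nvinfer1::IProfiler {
public:
    void reportLayerTime(const char* layer_name, float ms) noexcept override {
        m_records.emplace_back(layer_name, ms);
    }

    // Print per-layer times and the accumulated total, as print_layer_times() does.
    void print_layer_times() const {
        float total_time = 0.f;
        for (auto&& rec : m_records) {
            printf("%s: %4.3fms\n", rec.first.c_str(), rec.second);
            total_time += rec.second;
        }
        printf("Total time: %4.3fms\n", total_time);
    }

private:
    std::vector<std::pair<std::string, float>> m_records;
};

// Illustrative usage, given an existing IExecutionContext* context and bindings:
//     SimpleTRTProfiler profiler;
//     context->setProfiler(&profiler);
//     context->execute(batch, bindings);  // synchronous path, as in the opr above
//     profiler.print_layer_times();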
@@ -168,7 +157,7 @@ void TensorRTOpr::GpuAllocator::free(void* memory) {
 void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
                            CompNode comp_node_check,
                            nvinfer1::ICudaEngine* engine,
-                           size_t batch) {
+                           size_t batch, bool use_trt_profiler) {
     auto comp_node = opr->comp_node();
     // ICudaEngine is bound to the currently active device
@@ -180,22 +169,11 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
                   comp_node_check.to_string().c_str(),
                   comp_node.to_string().c_str());
     }
-#if MGB_ENABLE_JSON
-    auto pf_holder_pair =
-            opr->owner_graph()
-                    ->options()
-                    .user_data.get_user_data<opr_profile::OprProfileHolder>();
-    if (m_has_profiler && !pf_holder_pair.second) {
-        m_context.reset();
-        m_has_profiler = false;
-    }
-#endif
     auto workspace_ptr = opr->output().back()->dev_tensor().raw_ptr();
     bool should_reinit_device_memory =
             !m_context || m_device_workspace_memory_ptr != workspace_ptr;
     if (!m_context) {
         m_context = {engine->createExecutionContextWithoutDeviceMemory(), {}};
-        m_has_profiler = false;
     }
     m_trt_iobuf.resize(opr->input().size() + opr->output().size() - 1);
     bool is_trt_opr = false;
@@ -235,11 +213,7 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
     bool exec_success = false;
-#if MGB_ENABLE_JSON
-    if (!pf_holder_pair.second) {
-        mgb_assert(!m_has_profiler,
-                   "Invalid state of TensorRTRuntimeOpr: should not have "
-                   "profiler.");
+    if (!use_trt_profiler) {
 #if NV_TENSOR_RT_VERSION >= 6001
         if (is_trt_opr)
             exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
@@ -255,7 +229,6 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
     } else {
         TensorRTProfiler trt_profiler;
         m_context->setProfiler(&trt_profiler);
-        m_has_profiler = true;
         // TensorRT documentation stated that IExecutionContext->execute
         // "Synchronously execute inference on a batch", and it does not take a
         // cudaStream_t, we expect it do a device synchronize. But it seems like
@@ -272,24 +245,9 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
         exec_success = m_context->execute(batch, m_trt_iobuf.data());
 #endif
         mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
-        pf_holder_pair.first[0]->id2object_map[opr] = trt_profiler.to_json();
         printf("TRT profile info of opr %s:\n", opr->name().c_str());
         trt_profiler.print_layer_times();
     }
-#else
-#if NV_TENSOR_RT_VERSION >= 6001
-    if (is_trt_opr)
-        exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
-                                            env.cuda_env().stream, nullptr);
-    else
-        exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
-                                          env.cuda_env().stream, nullptr);
-#else
-    exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
-                                      env.cuda_env().stream, nullptr);
-#endif
-    mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
-#endif
 }
 /* ========================== TensorRTOpr ========================== */
......
@@ -50,11 +50,11 @@ class TensorRTManager {
     std::vector<void*> m_trt_iobuf;
     TensorRTUniquePtr<nvinfer1::IExecutionContext> m_context;
    void* m_device_workspace_memory_ptr;
-    bool m_has_profiler;
 public:
     void exec(cg::SingleCNOperatorNodeBase* opr, CompNode comp_node_check,
-              nvinfer1::ICudaEngine* engine, size_t batch = 1);
+              nvinfer1::ICudaEngine* engine, size_t batch = 1,
+              bool use_trt_profiler = false);
     void clear_trt_context() { m_context.reset(); }
......
@@ -28,50 +28,6 @@ using namespace mgb;
 using namespace nvinfer1;
 using namespace opr;
-TEST(TestOprTensorRT, Profile) {
-    REQUIRE_GPU(1);
-    intl::ConcatConvTensorRTNetwork net;
-    auto p = net.create_trt_network(true);
-    auto y2 = TensorRTOpr::make(TensorRTOpr::to_shared_ptr_builder(p.first),
-                                TensorRTOpr::to_shared_ptr_network(p.second),
-                                intl::TensorRTGraphFeatureBits::NCHW_FLOAT, {},
-                                {net.x0, net.x1})[0];
-    HostTensorND host_z1;
-    HostTensorND host_z2;
-    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
-                                    make_callback_copy(y2, host_z2)});
-    {
-        mgb::GraphProfiler profiler(net.graph.get());
-        func->execute();
-        profiler.to_json()->writeto_fpath(
-                output_file("TestOprTensorRT.Profile.FromProfiler.json"));
-        auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
-        auto record_obj =
-                *static_cast<json::Object*>(prof_obj["opr_internal_pf"].get());
-        auto opr_prof_arr = *static_cast<json::Array*>(
-                record_obj[y2.node()->owner_opr()->id_str()].get());
-        for (auto item_arr : opr_prof_arr.get_impl()) {
-            auto layer_info_arr = *static_cast<json::Array*>(item_arr.get());
-            auto layer_time =
-                    *static_cast<json::Number*>(layer_info_arr[1].get());
-            mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
-        }
-        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-    }
-    // Run it again after profiler is not in existance.
-    func->execute();
-    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-}
 TEST(TestOprTensorRT, Basic) {
     REQUIRE_GPU(1);
     intl::SimpleTensorRTNetwork net;
......
@@ -10,7 +10,6 @@
  */
 #include "megbrain/comp_node_env.h"
-#include "megbrain/plugin/profiler.h"
 #include "megbrain/test/autocheck.h"
 #include "megbrain/test/helper.h"
 #include "megbrain/test/megdnn_helper.h"
@@ -102,69 +101,6 @@ TEST(TestOprTensorRT, ConcatRuntimeBasic) {
     MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
 }
-TEST(TestOprTensorRT, RuntimeProfile) {
-    REQUIRE_GPU(1);
-    intl::ConcatConvTensorRTNetwork net;
-    SymbolVar y2;
-    {
-        auto p = net.create_trt_network(false);
-        TensorRTUniquePtr<INetworkDefinition> trt_net{p.second, {}};
-        TensorRTUniquePtr<IBuilder> builder{p.first, {}};
-        builder->setMaxBatchSize(5);
-#if NV_TENSOR_RT_VERSION >= 6001
-        TensorRTUniquePtr<IBuilderConfig> build_config{
-                builder->createBuilderConfig()};
-        auto cuda_engine =
-                builder->buildEngineWithConfig(*trt_net, *build_config);
-#else
-        auto cuda_engine = builder->buildCudaEngine(*trt_net);
-#endif
-        TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};
-        FILE* fout = fopen(output_file("trt_cuda_engine").c_str(), "wb");
-        auto wr = fwrite(mem->data(), 1, mem->size(), fout);
-        mgb_assert(wr == mem->size());
-        fclose(fout);
-        y2 = TensorRTRuntimeOpr::make(
-                TensorRTRuntimeOpr::to_shared_ptr_engine(cuda_engine), {},
-                {net.x0, net.x1})[0];
-    }
-    HostTensorND host_z1;
-    HostTensorND host_z2;
-    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
-                                    make_callback_copy(y2, host_z2)});
-    {
-        mgb::GraphProfiler profiler(net.graph.get());
-        func->execute();
-        profiler.to_json()->writeto_fpath(output_file(
-                "TestOprTensorRT.RuntimeProfile.FromProfiler.json"));
-        auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
-        auto record_obj =
-                *static_cast<json::Object*>(prof_obj["opr_internal_pf"].get());
-        auto opr_prof_arr = *static_cast<json::Array*>(
-                record_obj[y2.node()->owner_opr()->id_str()].get());
-        for (auto item_arr : opr_prof_arr.get_impl()) {
-            auto layer_info_arr = *static_cast<json::Array*>(item_arr.get());
-            auto layer_time =
-                    *static_cast<json::Number*>(layer_info_arr[1].get());
-            mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
-        }
-        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-    }
-    // Run it again after profiler is not in existance.
-    func->execute();
-    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-}
 TEST(TestOprTensorRT, RuntimeChangeBatchSize) {
     REQUIRE_GPU(1);
     intl::SimpleTensorRTNetwork net;
......