diff --git a/src/gopt/impl/framework.cpp b/src/gopt/impl/framework.cpp
index ce0ada788ef8a65bdf6eb4dd0f8aa5a80b2448ce..87abae825ec1339939ffe2f88403d3f2c2640c64 100644
--- a/src/gopt/impl/framework.cpp
+++ b/src/gopt/impl/framework.cpp
@@ -649,6 +649,7 @@ GraphOptimizer& GraphOptimizer::add_preset_passes(
     add_pass<ReorderArithChainPass>(cv_type);
     add_pass<FinalArithTransformPass>();
     add_pass<RemoveRedundantTypeCvtPass>();
+    add_pass<RemoveRedundantCopyPass>();
 
 #if MGB_JIT
     bool need_jit = false;
diff --git a/src/gopt/impl/misc.cpp b/src/gopt/impl/misc.cpp
index 6ab3f155bc7e3ef1bf115859fa96f940f9253a5a..043797724544e0014ae9c4714bc3ffb3f6b8e125 100644
--- a/src/gopt/impl/misc.cpp
+++ b/src/gopt/impl/misc.cpp
@@ -682,6 +682,69 @@ void RemoveRedundantTypeCvtPass::apply(OptState& opt) const {
     MIDOUT_E
 }
 
+/* ======================= RemoveRedundantCopyPass ====================== */
+
+const char* RemoveRedundantCopyPass::name() const {
+    return "remove_redundant_copy";
+}
+
+bool RemoveRedundantCopyPass::should_remove(const CompNode& A,
+                                            const CompNode& B) {
+    //! remove if A and B share the same memory node, or for cpu <-> atlas
+    //! and cpu <-> cuda pairs: only these comp nodes support cross-CompNode copy
+    if (A.mem_node() == B.mem_node() ||
+        ((A.device_type() == CompNode::DeviceType::CPU ||
+          A.device_type() == CompNode::DeviceType::MULTITHREAD) &&
+         (B.device_type() == CompNode::DeviceType::ATLAS ||
+          B.device_type() == CompNode::DeviceType::CUDA)) ||
+        ((B.device_type() == CompNode::DeviceType::CPU ||
+          B.device_type() == CompNode::DeviceType::MULTITHREAD) &&
+         (A.device_type() == CompNode::DeviceType::ATLAS ||
+          A.device_type() == CompNode::DeviceType::CUDA))) {
+        return true;
+    } else {
+        return false;
+    }
+}
+
+void RemoveRedundantCopyPass::apply(OptState& opt) const {
+    MIDOUT_B("RemoveRedundantCopyPass::apply")
+    auto rewriter = opt.graph().make_rewriter();
+
+    auto on_opr = [&](OperatorNodeBase* opr) {
+        if (auto copy0 = try_cast_as_op<opr::Copy>(opr)) {
+            auto inp0 = rewriter.get_var(copy0->input(0));
+            if (auto copy1 = try_cast_as_op<opr::Copy>(inp0)) {
+                auto inp1 = copy1->input(0);
+                if (should_remove(inp1->comp_node(),
+                                  copy0->output(0)->comp_node())) {
+                    mgb_assert(!rewriter.has_manual_replace(inp1));
+                    if (inp1->comp_node() == copy0->output(0)->comp_node()) {
+                        rewriter.replace_var(
+                                copy0->output(0), inp1,
+                                mgb_cstr_log("copy(copy(a0, a1), a0) -> "
+                                             "a0"));
+                        return;
+                    } else {
+                        auto fold = opr::Copy::make(
+                                inp1, copy0->output(0)->comp_node());
+                        rewriter.replace_var(
+                                copy0->output(0), fold.node(),
+                                mgb_cstr_log("copy(copy(a0, a1), a2) -> "
+                                             "copy(a0, a2)"));
+                        return;
+                    }
+                }
+            }
+        }
+        rewriter.auto_replace_outputs(opr);
+    };
+
+    opt.graph().iter(on_opr);
+    rewriter.apply_inplace();
+    MIDOUT_E
+}
+
 #if MGB_ENABLE_OPR_MM
 #include "megbrain/opr/collective_comm.h"
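The pass above is a peephole rewrite over chained opr::Copy nodes: copy(copy(a0, a1), a0) collapses back to a0, and copy(copy(a0, a1), a2) folds into copy(a0, a2) whenever should_remove() allows it. A minimal sketch of exercising the pass on its own, assuming the GraphOptimizer/SubGraph idiom used elsewhere in the gopt tests (x is an input var on cpu0; all names here are illustrative, not part of this diff):

    // Build a redundant chain: copy(copy(x, cpu1), cpu2).
    auto y = opr::Copy::make(opr::Copy::make(x, CompNode::load("cpu1")),
                             CompNode::load("cpu2"));
    // Run only the new pass; endpoint_vars()[0] is the rewritten endpoint.
    SymbolVar y_opt = gopt::GraphOptimizer{}
                              .add_pass<gopt::RemoveRedundantCopyPass>()
                              .apply({{y}})
                              .endpoint_vars()[0];
    // Expected outcome: a single copy(x, cpu2), as the tests below verify.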
diff --git a/src/gopt/include/megbrain/gopt/misc.h b/src/gopt/include/megbrain/gopt/misc.h
index e8a833b5b3d801764ae5ce30fdda730e24a2a5e8..c6aa63c1e43b6cd4b6ae654548eb88cb624d6635 100644
--- a/src/gopt/include/megbrain/gopt/misc.h
+++ b/src/gopt/include/megbrain/gopt/misc.h
@@ -85,6 +85,16 @@ namespace gopt {
             void apply(OptState &opt) const override;
     };
 
+    class RemoveRedundantCopyPass final : public Pass {
+        private:
+            //! remove copy chains of the form cpu -> cpu -> cpu or
+            //! cpu -> gpu -> cpu
+            static bool should_remove(const CompNode& A, const CompNode& B);
+        public:
+            const char* name() const override;
+            void apply(OptState& opt) const override;
+    };
+
     //! remove execution mask for const PPVs in conditional execution
     class CondExecConstPredicateFolding final : public Pass {
         public:
diff --git a/src/gopt/test/helper.h b/src/gopt/test/helper.h
index 2fcf4bb6cc255f400206ad78d9151db012717db3..6ebd92dc43ebfc27ce2ce8e442a3d32ddaf5bb0a 100644
--- a/src/gopt/test/helper.h
+++ b/src/gopt/test/helper.h
@@ -26,14 +26,16 @@ namespace mgb {
     HostTensorGenerator<> gen;
     std::shared_ptr<ComputingGraph> graph = ComputingGraph::make();
 
-    SymbolVar mkvar(const char *name, const TensorShape &shp = {1}) {
-        return opr::Host2DeviceCopy::make(
-                *graph, gen(shp)).rename(name);
+    SymbolVar mkvar(const char* name, const TensorShape& shp = {1},
+                    CompNode cn = CompNode::load("xpu0")) {
+        return opr::Host2DeviceCopy::make(*graph, gen(shp), cn)
+                .rename(name);
     }
 
-    SymbolVar mkcvar(const char *name, const TensorShape &shp = {1}) {
+    SymbolVar mkcvar(const char* name, const TensorShape& shp = {1},
+                     CompNode cn = CompNode::load("xpu0")) {
         return opr::SharedDeviceTensor::make(
-                *graph, *gen(shp)).rename(name);
+                *graph, *gen(shp), cn).rename(name);
     }
 
     template<typename... Args>
@@ -73,4 +75,3 @@ namespace mgb {
     TEST_F(TestGopt##pass, name)
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
-
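With the extra CompNode parameter, mkvar/mkcvar can now place test inputs on an explicit comp node instead of the default "xpu0". A hedged usage sketch inside a TestGopt fixture (the shape and the "cpu0:1" device:stream name are illustrative):

    // Input tensor pinned to cpu0 rather than the default xpu0.
    auto x = mkvar("x", {2, 3, 3}, CompNode::load("cpu0"));
    // Shared (constant) tensor on stream 1 of the same cpu device.
    auto w = mkcvar("w", {3, 3}, CompNode::load("cpu0:1"));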
diff --git a/src/gopt/test/misc.cpp b/src/gopt/test/misc.cpp
index 9cdeaf38efef3163b319091917fa8603b11f8df6..8f1688bc09e14838e1a774bd824a5232e973e0a0 100644
--- a/src/gopt/test/misc.cpp
+++ b/src/gopt/test/misc.cpp
@@ -16,6 +16,7 @@
 #include "megbrain/opr/basic_arith_wrapper.h"
 #include "megbrain/opr/blas.h"
 #include "megbrain/opr/cond.h"
+#include "megbrain/opr/io.h"
 #include "megbrain/opr/tensor_manip.h"
 #include "megbrain/opr/utility.h"
 
@@ -411,6 +412,71 @@ TEST_PASS(RemoveRedundantTypeCvtPass, Basic) {
     check(x_q8_q8, x_q8_fp32_q8_);
 }
 
+TEST_PASS(RemoveRedundantCopyPass, Basic) {
+    auto x = mkvar("x", {2, 3, 3}, CompNode::load("cpu0"));
+    {
+        auto x_cpu1 = opr::Copy::make(x, CompNode::load("cpu1"));
+        auto x_cpu0 = opr::Copy::make(x_cpu1, CompNode::load("cpu0"));
+        auto x_cpu2 = opr::Copy::make(x_cpu0, CompNode::load("cpu2"));
+        auto x_expected = opr::Copy::make(x, CompNode::load("cpu2"));
+        check(x, x_cpu0);
+        check(x_expected, x_cpu2);
+    }
+
+    {
+        auto x_cpu1 = opr::Copy::make(x, CompNode::load("cpu1"));
+        auto x_cpu2 = opr::Copy::make(x_cpu1, CompNode::load("cpu2"));
+        auto x_cpu3 = opr::Copy::make(x_cpu2, CompNode::load("cpu3"));
+        auto x_expected = opr::Copy::make(x, CompNode::load("cpu3"));
+        check(x_expected, x_cpu3);
+    }
+
+    {
+        auto x_cpu1 = opr::Copy::make(x, CompNode::load("cpu0:1"));
+        auto x_cpu2 = opr::Copy::make(x_cpu1, CompNode::load("cpu0:2"));
+        auto x_cpu3 = opr::Copy::make(x_cpu2, CompNode::load("cpu0:3"));
+        auto x_expected = opr::Copy::make(x, CompNode::load("cpu0:3"));
+        check(x_expected, x_cpu3);
+    }
+
+    {
+        auto x_cpu1 = opr::Copy::make(x, CompNode::load("cpu0:1"));
+        auto x_mt = opr::Copy::make(x_cpu1, CompNode::load("multithread8:0"));
+        auto x_cpu3 = opr::Copy::make(x_mt, CompNode::load("cpu0:3"));
+        auto x_expected = opr::Copy::make(x, CompNode::load("cpu0:3"));
+        check(x_expected, x_cpu3);
+    }
+
+#if MGB_ATLAS
+    {
+        auto x_atlas0 = opr::Copy::make(x, CompNode::load("atlas0"));
+        auto x_cpu2 = opr::Copy::make(x_atlas0, CompNode::load("cpu0:2"));
+        auto x_cpu3 = opr::Copy::make(x_cpu2, CompNode::load("cpu0:3"));
+        auto x_expected = opr::Copy::make(x, CompNode::load("cpu0:3"));
+        check(x_expected, x_cpu3);
+    }
+#endif
+
+#if MGB_CUDA
+    {
+        auto x_cuda0 = opr::Copy::make(x, CompNode::load("gpu0"));
+        auto x_cpu2 = opr::Copy::make(x_cuda0, CompNode::load("cpu0:2"));
+        auto x_cpu3 = opr::Copy::make(x_cpu2, CompNode::load("cpu0:3"));
+        auto x_expected = opr::Copy::make(x, CompNode::load("cpu0:3"));
+        check(x_expected, x_cpu3);
+    }
+
+    {
+        auto x_mt = opr::Copy::make(x, CompNode::load("multithread8:0"));
+        auto x_gpu = opr::Copy::make(x_mt, CompNode::load("gpu0:1"));
+        auto x_mt2 = opr::Copy::make(x_gpu, CompNode::load("multithread8:0"));
+        auto x_expected = opr::Copy::make(x, CompNode::load("multithread8:0"));
+        check(x_expected, x_mt2);
+    }
+
+#endif
+}
+
 #if MGB_ENABLE_OPR_MM
 #include "megbrain/opr/collective_comm.h"
 #include "../../opr-mm/test/mock_client.h"
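The check() helper verifies that the pass rewrites each chain into the expected graph; a numeric sanity check could additionally compile both endpoints and compare values. A sketch using MegBrain's stock test utilities make_callback_copy and MGB_ASSERT_TENSOR_EQ (y and y_opt as in the earlier sketch; assuming a CPU-capable build, so this is an assumption, not part of the diff):

    // Execute the original chain and the folded var in one function and
    // assert that removing the redundant copies does not change the values.
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);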