Commit 3bd8ef35 authored by Megvii Engine Team

feat(mgb/compnode): add atlas compnode

GitOrigin-RevId: 19f3c330039c3d0accd9787446c391495f425b6e
Parent aa147b74
@@ -143,6 +143,15 @@ if(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold")
endif()
option(MGE_WITH_JIT "Build MegEngine with JIT." ON)
option(MGE_WITH_HALIDE "Build MegEngine with Halide JIT" ON)
option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF)
option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON)
option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
if(NOT MGE_WITH_JIT)
if(MGE_WITH_HALIDE)
message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled")
......
@@ -84,6 +84,7 @@ megcoreStatus_t megcoreGetDeviceFlags(
unsigned int *flags);
megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle);
megcoreStatus_t megcoreDeactivate(megcoreDeviceHandle_t handle);
megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle,
void **devPtr, size_t sizeInBytes);
megcoreStatus_t megcoreFree(megcoreDeviceHandle_t handle,
......
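The new `megcoreMalloc`/`megcoreFree` declarations complete the device-handle C API: a handle is created for a platform/device pair, activated on the calling thread, then used for raw device allocations. A minimal usage sketch, assuming the `megcoreCreateDeviceHandle`/`megcoreDestroyDeviceHandle` pair declared elsewhere in `megcore.h` (the platform constant here is illustrative):

```cpp
#include <cassert>
#include "megcore.h"

int main() {
    megcoreDeviceHandle_t handle;
    // Create and activate a handle for device 0 of some platform.
    megcoreStatus_t st =
            megcoreCreateDeviceHandle(&handle, megcorePlatformCPU, 0, 0);
    assert(st == megcoreSuccess);
    assert(megcoreActivate(handle) == megcoreSuccess);

    // Allocate and release 256 bytes of device memory through the handle.
    void* dev_ptr = nullptr;
    assert(megcoreMalloc(handle, &dev_ptr, 256) == megcoreSuccess);
    assert(megcoreFree(handle, dev_ptr) == megcoreSuccess);

    assert(megcoreDestroyDeviceHandle(handle) == megcoreSuccess);
    return 0;
}
```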
@@ -86,6 +86,7 @@ if (BUILD_SHARED_LIBS)
else()
target_link_libraries(megdnn PRIVATE ${MGE_BLAS_LIBS})
endif()
if(CMAKE_THREAD_LIBS_INIT)
target_link_libraries(megdnn PRIVATE Threads::Threads)
endif()
......
@@ -38,6 +38,7 @@ class DeviceContext {
virtual size_t mem_alignment_in_bytes() const noexcept = 0;
virtual void activate() = 0;
virtual void deactivate() {}
virtual void *malloc(size_t size_in_bytes) = 0;
virtual void free(void *ptr) = 0;
......
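Each backend implements this per-device interface; the new `malloc`/`free` virtuals are what `megcoreMalloc`/`megcoreFree` ultimately dispatch to. A self-contained sketch of an atlas-style implementation, assuming the Ascend ACL runtime (`aclrtSetDevice`/`aclrtMalloc`/`aclrtFree`); the class name, the local `IDeviceContext` mirror of the interface above, and the alignment value are illustrative, not the commit's actual code:

```cpp
#include <acl/acl.h>
#include <cstddef>

// Local mirror of the virtual interface shown in the hunk above
// (the real class is megcore::DeviceContext).
struct IDeviceContext {
    virtual ~IDeviceContext() = default;
    virtual size_t mem_alignment_in_bytes() const noexcept = 0;
    virtual void activate() = 0;
    virtual void deactivate() {}
    virtual void* malloc(size_t size_in_bytes) = 0;
    virtual void free(void* ptr) = 0;
};

class AtlasDeviceContextSketch final : public IDeviceContext {
    int m_dev_id;
public:
    explicit AtlasDeviceContextSketch(int dev_id) : m_dev_id{dev_id} {}
    size_t mem_alignment_in_bytes() const noexcept override { return 64; }
    void activate() override {
        aclrtSetDevice(m_dev_id);    // bind the device to the calling thread
    }
    void deactivate() override {
        aclrtResetDevice(m_dev_id);  // release per-thread device resources
    }
    void* malloc(size_t size_in_bytes) override {
        void* ptr = nullptr;
        aclrtMalloc(&ptr, size_in_bytes, ACL_MEM_MALLOC_HUGE_FIRST);
        return ptr;
    }
    void free(void* ptr) override { aclrtFree(ptr); }
};
```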
@@ -74,6 +74,13 @@ megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle)
return megcoreSuccess;
}
megcoreStatus_t megcoreDeactivate(megcoreDeviceHandle_t handle)
{
megdnn_assert(handle);
handle->content->deactivate();
return megcoreSuccess;
}
megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle,
void **devPtr, size_t sizeInBytes)
{
......
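The hunk is cut off inside `megcoreMalloc`. Judging from the `megcoreDeactivate` implementation above, the body very likely just forwards to the device context; a plausible continuation (inferred, not the verbatim source):

```cpp
// Inferred from the megcoreDeactivate pattern above, not the verbatim source.
megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle,
        void **devPtr, size_t sizeInBytes)
{
    megdnn_assert(handle);
    *devPtr = handle->content->malloc(sizeInBytes);
    return megcoreSuccess;
}
```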
@@ -27,7 +27,6 @@ endif()
add_executable(megdnn_test ${SOURCES})
target_link_libraries(megdnn_test gtest)
target_link_libraries(megdnn_test megdnn ${MGE_BLAS_LIBS})
......
@@ -246,6 +246,7 @@ SymbolVarArray _Opr::tensor_rt_runtime(const SymbolVarArray& inputs,
}
#endif
SymbolVar _Opr::timestamp(SymbolVar input, PyObject* dest, size_t dest_off,
const OperatorNodeConfig& config) {
auto tensor = std::make_shared<HostTensorND>(
......
@@ -118,6 +118,8 @@ static SymbolVarArray tensor_rt_runtime(const SymbolVarArray& inputs,
PyObject* data_bytes,
const OperatorNodeConfig& config);
static SymbolVar timestamp(SymbolVar input, PyObject* dest, size_t dest_off,
const OperatorNodeConfig& config);
......
@@ -18,7 +18,6 @@
#if MGB_ENABLE_OPR_MM
#include "megbrain/opr/collective_comm.h"
#endif
using AxisIndexer = mgb::opr::indexing::AxisIndexer;
/*!
......
@@ -88,7 +88,7 @@ if (MGB_WITH_FLATBUFFERS)
${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs
COMMAND
${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs
DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY}
VERBATIM
)
add_custom_command(
@@ -124,7 +124,6 @@ if (MGB_WITH_FLATBUFFERS)
target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/serialization/include)
target_compile_definitions(megbrain PUBLIC MGB_ENABLE_FBS_SERIALIZATION=1)
target_link_libraries(megbrain PUBLIC flatbuffers)
set (GENERATED_FLATBUFFERS_CONVERTER_PATH ${CMAKE_CURRENT_BINARY_DIR}/genfiles)
set (GEN_FLATBUFFERS_CONVERTER_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_converter.py)
file (MAKE_DIRECTORY ${GENERATED_FLATBUFFERS_CONVERTER_PATH})
......
@@ -96,7 +96,7 @@ megcore::AsyncErrorInfo* MegDNNHandle::make_async_error_info(
cn.free_device(ptr);
}
};
megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0,0,0,0}};
megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0, 0, 0, 0}};
auto ptr = static_cast<megcore::AsyncErrorInfo*>(
env.comp_node().alloc_device(sizeof(zero_info)));
cn.copy_to_device(ptr, &zero_info, sizeof(zero_info));
@@ -106,7 +106,7 @@ megcore::AsyncErrorInfo* MegDNNHandle::make_async_error_info(
}
#endif
/* =================== misc =================== */
#if MGB_CUDA
......
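`make_async_error_info` stages a zero-filled `AsyncErrorInfo` on the host and copies it onto the compnode, so device code can report errors asynchronously. The same idiom in isolation, assuming a valid, already-loaded `mgb::CompNode` (a sketch; include paths assumed):

```cpp
#include "megbrain/comp_node.h"
#include "megcore_cdefs.h"  // megcore::AsyncErrorInfo

// Stage a zero-initialized POD struct in device memory, mirroring the
// alloc_device/copy_to_device idiom in the hunk above.
megcore::AsyncErrorInfo* make_zeroed_error_info(mgb::CompNode cn) {
    megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0, 0, 0, 0}};
    auto* dev_info = static_cast<megcore::AsyncErrorInfo*>(
            cn.alloc_device(sizeof(zero_info)));
    cn.copy_to_device(dev_info, &zero_info, sizeof(zero_info));
    cn.sync();        // ensure the copy lands before device code reads it
    return dev_info;  // the caller eventually calls cn.free_device(dev_info)
}
```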
@@ -123,9 +123,9 @@ StaticDeviceMemoryManager::make_default_impl() {
}
#endif // MGB_THREAD_SAFE
/* ==================== CUDAAsyncVarReleaser ==================== */
#if MGB_CUDA
class VarNodeMemManager::CUDAAsyncVarReleaser {
/* ==================== AsyncVarReleaser ==================== */
#if MGB_CUDA
class VarNodeMemManager::AsyncVarReleaser {
struct WaiterParam {
CompNode cn;
CompNode::Event *event;
@@ -133,10 +133,10 @@ class VarNodeMemManager::CUDAAsyncVarReleaser {
};
class Waiter final: public AsyncQueueSC<WaiterParam, Waiter> {
CUDAAsyncVarReleaser *m_par_releaser;
AsyncVarReleaser *m_par_releaser;
public:
Waiter(CUDAAsyncVarReleaser *releaser):
Waiter(AsyncVarReleaser *releaser):
m_par_releaser(releaser)
{
}
@@ -159,7 +159,7 @@ class VarNodeMemManager::CUDAAsyncVarReleaser {
Spinlock m_event_pool_lock;
public:
~CUDAAsyncVarReleaser() {
~AsyncVarReleaser() {
wait_release_finish();
}
@@ -247,15 +247,16 @@ bool VarNodeMemManager::ImpureMemPlanManager::check_need_realloc() {
VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph):
m_owner_graph(graph),
m_seq_mem_opt(graph)
#if MGB_CUDA
,m_cuda_asyn_var_releaser(new CUDAAsyncVarReleaser)
#if MGB_CUDA
,m_asyn_var_releaser(new AsyncVarReleaser)
#endif
{
auto on_comp_seq_finish = [this](const event::CompSeqExecFinished& ev) {
MGB_MARK_USED_VAR(ev);
// async release is only used for sync between multiple comp nodes, and
// does not wait for device to finish
#if MGB_CUDA
m_cuda_asyn_var_releaser->wait_release_finish();
#if MGB_CUDA
m_asyn_var_releaser->wait_release_finish();
#endif
m_cpu_async_release_barrier.wait_zero();
};
@@ -295,9 +296,10 @@ VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph):
graph->event().register_receiver_permanent<event::CompSeqExecError>(
on_comp_seq_error);
#if MGB_ENABLE_VAR_DEV_MEM_DEFRAGMENTER
#if MGB_ENABLE_VAR_DEV_MEM_DEFRAGMENTER && (MGB_CUDA \
)
auto on_mem_defrag_start = [this](const event::BeforeMemDefrag&) {
m_cuda_asyn_var_releaser->wait_release_finish();
m_asyn_var_releaser->wait_release_finish();
};
graph->event().register_receiver_permanent<event::BeforeMemDefrag>(
on_mem_defrag_start);
@@ -1341,7 +1343,7 @@ void VarNodeMemManager::decr_var_mem_refcnt(
}
#if MGB_CUDA
case DT::CUDA:
m_cuda_asyn_var_releaser->add(dispatch_cn, var);
m_asyn_var_releaser->add(dispatch_cn, var);
break;
#endif
default:
......
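The rename from `CUDAAsyncVarReleaser` to `AsyncVarReleaser` generalizes the deferred-release machinery to any compnode with events (which an atlas backend needs): a completion event is recorded on the dispatching compnode, and the var's memory refcount is dropped only once that event fires. A stripped-down sketch of the pattern, with simplified stand-ins for the real `CompNode::Event`/`VarNode`/`AsyncQueueSC` types:

```cpp
#include <deque>
#include <functional>
#include <memory>
#include <mutex>
#include <utility>

// Simplified stand-ins for CompNode::Event and VarNode.
struct Event {
    virtual ~Event() = default;
    virtual void record() = 0;    // capture the current queue position
    virtual bool finished() = 0;  // has the device passed that point?
};
struct Var { std::function<void()> release_mem; };

// Sketch of the AsyncVarReleaser idea: pair each var with an event recorded
// on its dispatch compnode, releasing memory only after the event completes.
class AsyncVarReleaserSketch {
    std::mutex m_mtx;
    std::deque<std::pair<std::unique_ptr<Event>, Var*>> m_queue;
public:
    void add(std::unique_ptr<Event> ev, Var* var) {
        ev->record();
        std::lock_guard<std::mutex> lg{m_mtx};
        m_queue.emplace_back(std::move(ev), var);
    }
    // Called from a background waiter (or wait_release_finish()): pop
    // entries whose events have completed and release their memory.
    void poll() {
        std::lock_guard<std::mutex> lg{m_mtx};
        while (!m_queue.empty() && m_queue.front().first->finished()) {
            m_queue.front().second->release_mem();
            m_queue.pop_front();
        }
    }
};
```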
@@ -431,10 +431,10 @@ class VarNodeMemManager {
SyncableCounter m_cpu_async_release_barrier;
#if MGB_CUDA
//! release dynamic var after cuda event finishes
class CUDAAsyncVarReleaser;
std::unique_ptr<CUDAAsyncVarReleaser> m_cuda_asyn_var_releaser;
#if MGB_CUDA
//! release dynamic var after compnode event finishes
class AsyncVarReleaser;
std::unique_ptr<AsyncVarReleaser> m_asyn_var_releaser;
#endif
VarDevMemDefragmenter m_var_dev_mem_defragmenter{this};
......
@@ -41,9 +41,9 @@
} \
} while (0)
#endif // MGB_ENABLE_LOGGING
#endif //MGB_ENABLE_LOGGING
#endif //MGB_CUDA
#endif
//! whether to enable asynchronous initialization for CompNode and CompNodeEnv
#define MGB_ENABLE_COMP_NODE_ASYNC_INIT (MGB_CUDA)
......
@@ -136,7 +136,6 @@ public:
* error message
*/
static std::string get_cuda_extra_info();
CudaError(const std::string& msg);
};
......
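`CudaError` attaches device-specific diagnostics (via `get_cuda_extra_info()`) to the base exception message. A new backend would presumably ship an analogous type; the sketch below is hypothetical (name, base class, and members are assumptions, not taken from this diff):

```cpp
#include <stdexcept>
#include <string>

// Hypothetical atlas analogue of CudaError; everything here is assumed.
class AtlasErrorSketch final : public std::runtime_error {
public:
    explicit AtlasErrorSketch(const std::string& msg)
            : std::runtime_error(msg + get_atlas_extra_info()) {}
    // Extra device state appended to the message, mirroring
    // CudaError::get_cuda_extra_info() in spirit.
    static std::string get_atlas_extra_info() { return ""; }
};
```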
@@ -59,9 +59,6 @@ TEST(TestCompNode, Parse) {
ASSERT_THROW(L::parse("cpu0:"), MegBrainError);
ASSERT_THROW(L::parse("cpu0:x"), MegBrainError);
ASSERT_THROW(L::parse("cpu2:23x"), MegBrainError);
ASSERT_THROW(L::parse("heaxgon0"), MegBrainError);
ASSERT_THROW(L::parse("rcom0"), MegBrainError);
ASSERT_THROW(L::parse("cmabricon0"), MegBrainError);
ASSERT_THROW(L::parse("multithread"), MegBrainError);
ASSERT_THROW(L::parse("multithread1:"), MegBrainError);
ASSERT_THROW(L::parse("multithread1:default"), MegBrainError);
......
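A hypothetical fragment for the same test body, not taken from the test file: if the new "atlas" device name follows the same locator grammar as "cpu"/"gpu", one would expect:

```cpp
// Hypothetical: assumes "atlasN" is a registered locator form after this
// commit, mirroring "cpu0"/"gpu0".
ASSERT_NO_THROW(L::parse("atlas0"));               // well-formed locator
ASSERT_THROW(L::parse("atlas0:"), MegBrainError);  // trailing ':' malformed
```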
@@ -53,6 +53,7 @@
#cmakedefine01 MEGDNN_THREADS_512
#cmakedefine01 MEGDNN_ENABLE_MULTI_THREADS
// whether cuda is available
#ifndef MGB_CUDA
#define MGB_CUDA 1
......
@@ -15,6 +15,7 @@ if (MGE_WITH_CUDA AND MGE_WITH_TRT)
list(APPEND SOURCES ${SOURCES_})
endif()
add_executable(megbrain_test ${SOURCES})
target_link_libraries(megbrain_test gtest)
target_link_libraries(megbrain_test megengine)
......
@@ -98,22 +98,48 @@ dtype, RandomDistribution::UNIFORM>::operator ()(
return ret;
}
template<typename dtype>
std::shared_ptr<HostTensorND> HostTensorGenerator<
dtype, RandomDistribution::CONSTANT>::operator ()(
const TensorShape &shape, CompNode cn) {
if (!cn.valid())
cn = CompNode::load("xpu0");
std::shared_ptr<HostTensorND> ret =
std::make_shared<HostTensorND>(cn, shape, dtype());
auto ptr = ret->ptr<ctype>();
for (size_t i = 0, it = shape.total_nr_elems(); i < it; ++ i) {
ptr[i] = m_default_val;
}
return ret;
}
// explicit instantiation of HostTensorGenerator
namespace mgb {
template class HostTensorGenerator<
dtype::Float32, RandomDistribution::GAUSSIAN>;
template class HostTensorGenerator<
dtype::Float32, RandomDistribution::UNIFORM>;
template class HostTensorGenerator<
dtype::Float32, RandomDistribution::CONSTANT>;
template class HostTensorGenerator<
dtype::Float16, RandomDistribution::GAUSSIAN>;
template class HostTensorGenerator<
dtype::Int8, RandomDistribution::UNIFORM>;
template class HostTensorGenerator<
dtype::Int8, RandomDistribution::CONSTANT>;
template class HostTensorGenerator<
dtype::Uint8, RandomDistribution::UNIFORM>;
template class HostTensorGenerator<
dtype::Uint8, RandomDistribution::CONSTANT>;
template class HostTensorGenerator<
dtype::Int16, RandomDistribution::UNIFORM>;
template class HostTensorGenerator<
dtype::Int16, RandomDistribution::CONSTANT>;
template class HostTensorGenerator<
dtype::Int32, RandomDistribution::UNIFORM>;
template class HostTensorGenerator<
dtype::Int32, RandomDistribution::CONSTANT>;
std::shared_ptr<HostTensorND>
HostTensorGenerator<dtype::QuantizedS8, RandomDistribution::UNIFORM>::
operator()(const TensorShape& shape, CompNode cn) {
......
@@ -175,7 +175,7 @@ class RNGxorshf {
};
enum class RandomDistribution {
GAUSSIAN, UNIFORM
GAUSSIAN, UNIFORM, CONSTANT
};
template<class dtype>
@@ -322,6 +322,26 @@ class HostTensorGenerator<dtype, RandomDistribution::UNIFORM> final:
ctype m_lo, m_hi;
};
//! const value
template<class dtype>
class HostTensorGenerator<dtype, RandomDistribution::CONSTANT> final:
public HostTensorGeneratorBase {
public:
using ctype = typename DTypeTrait<dtype>::ctype;
HostTensorGenerator(ctype default_val)
: HostTensorGeneratorBase{next_rand_seed()},
m_default_val{default_val} {}
std::shared_ptr<HostTensorND> operator ()(
const TensorShape &shape, CompNode cn = {}) override;
using HostTensorGeneratorBase::operator();
private:
ctype m_default_val;
};
template <>
class HostTensorGenerator<dtype::QuantizedS8, RandomDistribution::UNIFORM> final
: public HostTensorGeneratorBase {
......
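The new `CONSTANT` distribution fills every element with a fixed value, which is handy for deterministic test inputs. A usage sketch based on the declaration above (include path assumed):

```cpp
#include "megbrain/test/helper.h"  // include path assumed

using namespace mgb;

void constant_tensor_example() {
    // Every element equals the constructor argument.
    HostTensorGenerator<dtype::Float32, RandomDistribution::CONSTANT> gen{1.f};
    auto host_x = gen({2, 3});  // 2x3 tensor on the default "xpu0" compnode
    // host_x->ptr<float>()[i] == 1.f for every element
}
```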
@@ -21,8 +21,8 @@ pdef('PersistentOutputStorage').add_fields(
(pdef('ExecutionPolicy', 'specify how to select an algorithm for an operator').
add_enum('Strategy',
Doc('HEURISTIC', 'use heuristic to choose the fastest algorithm'),
Doc('HEURISTIC_REPRODUCIBLE', 'use heuristic to choose the fastest algorithm, '
'and the chosen algorithm is reproducible'),
Doc('PROFILE',
'run possible algorithms on real device to find the best'),
Doc('PROFILE_REPRODUCIBLE',
......
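These enum values surface in the generated `megdnn::param::ExecutionPolicy`. A sketch of selecting a strategy through the generated param (include path and generated-name spelling follow megdnn's usual param code generation, assumed rather than shown in this diff):

```cpp
// Sketch based on the pdef above; generated names are assumed.
#include "megdnn/opr_param_defs.h"

void pick_profile_strategy(megdnn::param::ExecutionPolicy& policy) {
    using S = megdnn::param::ExecutionPolicy::Strategy;
    policy.strategy = S::PROFILE;  // benchmark candidates on the real device
}
```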