/**
 * \file src/core/include/megbrain/graph/var_node.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#pragma once

#include "megbrain/graph/bases.h"
#include "megbrain/utils/enum_class_bit.h"
#include "megbrain/utils/comp_node_sync_manager.h"
#include "megbrain/utils/small_vector.h"
#include "megbrain/utils/mempool.h"
#include "megbrain/tensor.h"

#include <atomic>
#include <mutex>

namespace mgb {

namespace imperative {
class ProxyGraph;
namespace proxy_graph {
class ProxyGraph;
}
}  // namespace imperative

namespace cg {

namespace static_infer {
class StaticInferManagerImpl;
}

class VarDevMemDefragmenter;
class EagerEvalManager;

/*!
 * \brief memory allocation plan held by a variable
 *
 * A MemAllocPlan is a view (i.e. with offset and layout) of some Chunk; memory
 * sharing between vars is implemented by sharing a Chunk between their mem
 * plans.
 */
class MemAllocPlan final : public json::Serializable, public NonCopyableObj {
public:
    /*!
     * \brief identifier for allocated memory
     *
     * Each Chunk object corresponds to an allocated memory chunk. Memory
     * forwarding and force updating are implemented by sharing Chunk
     * objects between vars.
     *
     * If mem_alloc_status is not invalid, the memory region for this chunk
     * is owner_var->dev_tensor().storage().
     */
    class Chunk : public NonCopyableObj {
        friend class MemAllocPlan;
        friend class VarDevMemDefragmenter;
        std::atomic_size_t m_refcnt;
        size_t m_size;

    public:
        /*!
         * \brief memory allocation status for this chunk
         *
         * The allocation status can be INVALID, FROM_OWNER_VAR, or an offset
         * into a static allocation buffer. The status is compactly
         * represented by a single integer value. No error check is performed
         * in the accessors.
         *
         * Note that the static offset is set in
         * SeqMemOptimizer::plan_chunk_allocation() and accessed in
         * VarNodeMemManager::make_static_var_tensor_from_alloc_plan().
         */
        class MemAllocStatus {
            static constexpr size_t INVALID = 0, FROM_OWNER_VAR = 1, OFFSET = 2;
            size_t m_val = INVALID;

        public:
            //! whether memory is not allocated yet
            bool is_invalid() const { return m_val == INVALID; }

            //! whether memory comes from owner_var->dev_tensor()
            bool is_from_owner_var() const { return m_val == FROM_OWNER_VAR; }

            //! whether memory is statically allocated
            bool is_static_offset() const { return m_val >= OFFSET; }

            size_t static_offset() const { return m_val - OFFSET; }

            void set_invalid() { m_val = INVALID; }

            void set_from_owner_var() { m_val = FROM_OWNER_VAR; }

            void set_static_offset(size_t offset) { m_val = offset + OFFSET; }
        };

        //! var that first creates this chunk
        VarNode* const owner_var;

        MemAllocStatus mem_alloc_status;

        //! size of this chunk in bytes
        size_t size() const { return m_size; }

        //! update value of m_size; only used in dynamic var allocation
        void update_size_for_dynamic_alloc(size_t size) { m_size = size; }

        std::string id_str() const;

#if MGB_ENABLE_JSON
        std::shared_ptr<json::Value> to_json() const;
#endif

        explicit Chunk(VarNode* ov) : owner_var(ov) {}
    };

    explicit MemAllocPlan(VarNode* owner_var);

    bool valid() const { return m_chunk; }

    //! dtype of owner var
    inline DType dtype() const;

    //! tensor format of owner var
    inline TensorFormat format() const;
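    /*
     * A minimal sketch of the MemAllocStatus encoding described above (the
     * offset value is illustrative; real values are assigned by the static
     * memory planner):
     *
     *     MemAllocPlan::Chunk::MemAllocStatus st;
     *     mgb_assert(st.is_invalid());      // default: memory not allocated yet
     *     st.set_static_offset(64);         // stored internally as 64 + OFFSET
     *     mgb_assert(st.is_static_offset() && st.static_offset() == 64);
     *     st.set_from_owner_var();          // memory from owner_var->dev_tensor()
     *     mgb_assert(st.is_from_owner_var() && !st.is_static_offset());
     */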
    //! get associated chunk
    Chunk& chunk() {
        mgb_assert(valid());
        return *m_chunk;
    }

    bool is_invalid_cond_exec() const {
        return m_chunk == &sm_chunk_invalid_cond_exec_marker;
    }

    //! get offset in bytes of this MemAllocPlan in the associated chunk
    size_t offset_in_chunk_byte() const { return m_offset_byte; }

    const TensorLayout& layout() const { return m_layout; }

    MemAllocPlan& layout(const TensorLayout& dest, bool allow_shape_change = false);

#if MGB_ENABLE_JSON
    std::shared_ptr<json::Value> to_json() const override;
#endif

    /*!
     * \brief release the current chunk and decrease its refcnt
     *
     * The tensor storage is released if the refcnt drops to zero.
     */
    MemAllocPlan& release_chunk();

    /*!
     * \brief reset chunk to a privately owned chunk, set up offset and
     *      layout from the owner var, and clear the tensor storage
     */
    MemAllocPlan& reset_from_owner_var();

    /*!
     * \brief reset to a special marker that indicates this var is not
     *      computed in conditional execution
     *
     * This is used in VarNodeMemManager to detect whether the var is invalid
     * without adding a new field.
     */
    MemAllocPlan& reset_as_invalid_cond_exec() {
        m_chunk = &sm_chunk_invalid_cond_exec_marker;
        return *this;
    }

    /*!
     * \brief reset to uninitialized status
     *
     * This is called before calling OperatorNodeBase::init_output_mem_plan
     * and before memplan optimization.
     */
    MemAllocPlan& reset_to_uninitialized() {
        m_layout.ndim = 0;
        m_chunk = nullptr;
        m_readonly_fwd_list.reset();
        return *this;
    }

    //! assign layout, offset and chunk from another mem alloc plan
    MemAllocPlan& assign(const MemAllocPlan& src);

    //! assign for readonly forwarding
    MemAllocPlan& assign_for_forward(
            const MemAllocPlan& src, const SubTensorSpec& sub);

    /*!
     * \brief next readonly-forward reader of this MemAllocPlan
     *
     * All the readers of a MemAllocPlan form a singly-linked list which is
     * maintained by assign_for_forward().
     */
    MemAllocPlan* next_readonly_fwd_reader() const {
        return m_readonly_fwd_list.next();
    }

    //! the var that owns this mem plan
    VarNode* owner_var() const { return m_chunk_storage.owner_var; }

private:
    class ReadonlyFwdList {
        MemAllocPlan *m_prev = nullptr, *m_next = nullptr;
        static std::mutex list_mutex;

    public:
        MemAllocPlan* next() const { return m_next; }
        void reset();
        inline void insert_after(const MemAllocPlan& prev, MemAllocPlan* self);
        inline void remove_self();
    };

    static Chunk sm_chunk_invalid_cond_exec_marker;

    TensorLayout m_layout;      //!< actual layout; its shape must equal the var shape
    size_t m_offset_byte = -1;  //!< offset in m_chunk
    Chunk* m_chunk = nullptr;
    Chunk m_chunk_storage;
    mutable ReadonlyFwdList m_readonly_fwd_list;
};

class VarNodeMemManager;

/*!
 * \brief node for a variable
 *
 * It must be the output of exactly one OperatorNode and may be an input to
 * other OperatorNodes.
 *
 * Each variable has an owner: the operator that generates this variable as
 * one of its outputs.
 *
 * The VarNode class exposes the most commonly used memory management
 * interfaces.
 */
class VarNode final : public GraphNodeBase {
public:
    /*!
     * \brief this constructor should only be called by
     *      OperatorNodeBase::add_output
     *
     * implemented in core/impl/graph/operator_node.cpp
     */
    inline VarNode(Maybe<std::string> name, OperatorNodeBase* owner);

    /* ===================== memory optimization ===================== */

    using LayoutConstraintCallback = thin_function<bool(const TensorLayout& layout)>;

    /*!
     * \brief add a callback function to check the validity of a particular
     *      tensor layout
     *
     * If the callback returns true, this VarNode's dev_tensor with the given
     * layout may be forwarded to the opr directly; otherwise it would be
     * implicitly rearranged to a contiguous one.
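     *
     * A usage sketch (hypothetical opr code; `inp` is an input VarNode* and
     * the lambda body is illustrative):
     *
     *     inp->add_layout_constraint([](const TensorLayout& layout) {
     *         // accept only layouts whose innermost dimension is contiguous;
     *         // other layouts would be rearranged before being used
     *         return layout.ndim && layout.stride[layout.ndim - 1] == 1;
     *     });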
     */
    VarNode& add_layout_constraint(LayoutConstraintCallback callback);

    /*!
     * \brief require the layout to be contiguous
     *
     * Note: since many oprs require inputs to be contiguous, this is
     * implemented by marking a flag on the var rather than adding a
     * LayoutConstraintCallback to check whether it is contiguous. All
     * existing callbacks would be cleared and new callbacks would be
     * ignored after add_layout_constraint_contiguous() is invoked.
     */
    VarNode& add_layout_constraint_contiguous();

    /*!
     * \brief require the layout to be monotone while allowing broadcast
     *
     * Note: similar to add_layout_constraint_contiguous(), this is
     * implemented by marking a flag; however user-defined callbacks are
     * still invoked since they might impose stronger constraints.
     */
    VarNode& add_layout_constraint_monotone();

    /*!
     * \brief request that memory be readonly forwarded from another var
     *
     * Note that this function must be called from
     * OperatorNodeBase::mem_plan_fwd_in2out_readonly.
     *
     * \return whether this request could be satisfied
     */
    MGB_WARN_UNUSED_RESULT bool set_fwd_in2out_readonly(
            VarNode* input, const SubTensorSpec& sub);

    /*!
     * \brief request that this var share memory with another var, whose
     *      content would also be modified
     *
     * Note that this function must be called from
     * OperatorNodeBase::mem_plan_fwd_in2out_writable.
     */
    VarNode& set_fwd_in2out_writable(VarNode* input);

    /*!
     * \brief require this var to share memory with another var; only used
     *      for operators that have explicit updating semantics
     *
     * Note that this function must be called during operator node
     * initialization.
     */
    VarNode& set_fwd_in2out_writable_force(VarNode* input);

    /* ===================== getters and setters ===================== */

    OperatorNodeBase* owner_opr() const { return m_owner; }

    //! get name; if the name is not valid, get the name of the owner opr
    const std::string& name() const;

    //! get name as a C-string
    const char* cname() const { return name().c_str(); }

    //! whether the name has been explicitly set
    bool has_name_set() const { return m_has_name_set; }

    //! set name explicitly
    VarNode& name(std::string name);

    //! get data type of data in this var
    DType dtype() const { return m_dev_tensor.dtype(); }

    //! get tensor format in this var
    TensorFormat format() const { return m_dev_tensor.format(); }

    //! set dtype; this function can only be called once
    VarNode& dtype(DType dtype);

    //! set format; this function can only be called once
    VarNode& format(TensorFormat format);

    MemAllocPlan& mem_plan() { return m_mem_plan; }

    bool dev_tensor_valid() const {
        return m_mem_plan.valid() && m_mem_plan.layout().eq_shape(m_shape) &&
               m_dev_tensor.storage().comp_node_valid() &&
               m_dev_tensor.layout().eq_layout(m_mem_plan.layout()) &&
               m_dev_tensor.comp_node() == m_comp_node;
    }

    //! get the underlying device tensor to fill data
    const DeviceTensorND& dev_tensor() const;

    /*!
     * \brief get the underlying device tensor that can be modified (e.g. by
     *      resize())
     *
     * This should only be called from the owner opr of this var, and this
     * var must have the flag NO_SYS_MEM_ALLOC.
     */
    DeviceTensorND& mutable_dev_tensor();

    /*!
     * \brief previous dev ptr before deallocating dev_tensor; used for
     *      testing and debugging
     */
    const void* prev_dev_ptr() const { return m_prev_dev_ptr; }

    /*!
     * \brief get the comp node on which this var is computed
     */
    CompNode comp_node() const { return m_comp_node; }

    /*!
     * \brief set comp node; only the memory node could be changed if called
     *      multiple times
     */
    VarNode& comp_node(const CompNode& cn);

    const TensorShape& shape() const { return m_shape; }
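    /*
     * Usage sketch for set_fwd_in2out_readonly() above (hypothetical code in
     * an opr's mem_plan_fwd_in2out_readonly() override; `MyOpr` and
     * `m_rofwd_ok` are illustrative names):
     *
     *     void MyOpr::mem_plan_fwd_in2out_readonly() {
     *         // try to forward the whole input tensor; if forwarding fails,
     *         // a copy would be made at runtime instead
     *         m_rofwd_ok = output(0)->set_fwd_in2out_readonly(
     *                 input(0),
     *                 SubTensorSpec::make_from_layout(input(0)->layout()));
     *     }
     */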
    //! get current reference count; not thread safe, and only used for
    //! testing purposes
    size_t refcnt() const { return m_refcnt; }

    /*!
     * \brief reset VarNode shape
     */
    VarNode& shape(const TensorShape& shape);

    bool allow_shape_change() const { return m_allow_shape_change; }

    const TensorLayout& layout() const {
        mgb_assert(m_mem_plan.valid() && m_mem_plan.layout().eq_shape(m_shape));
        return m_mem_plan.layout();
    }

#if MGB_ENABLE_JSON
    std::shared_ptr<json::Value> to_json() const override;
#endif

    /*!
     * \brief add a callback to be executed when the shape of this var is
     *      updated
     * \param tag callback tag; each tag can have at most one callback
     */
    void add_shape_update_callback(void* tag, thin_function<void(VarNode*)> cb) {
        m_shape_update_callback[tag] = cb;
    }

    enum class Flag : uint32_t;

    VarNode& add_flag(Flag flag);

    inline bool contain_flag(Flag flag) const;

    /* ===================== dynamic memory ===================== */

    /*!
     * \brief set shape and alloc memory storage
     *
     * This function should only be called by this var's owner operator and
     * this var must have the NO_SYS_MEM_ALLOC flag; if the shape does not
     * increase and the original tensor storage is valid, it is guaranteed
     * that old data would be retained.
     *
     * \warning size_req bytes would be allocated if size_req != 0.
     */
    VarNode& shape_alloc(const TensorShape& shape, size_t size_req = 0);

    /*!
     * \brief directly reset device tensor from another var
     *
     * This function should only be called by this var's owner operator and
     * this var must have the NO_SYS_MEM_ALLOC flag. It can be used to forward
     * var values within the same graph or between graphs. If both \p src_var
     * and this var belong to the same graph, memory forwarding may fail (e.g.
     * when \p src_var is force updated by another opr).
     *
     * \param src_var the var node to provide the dev tensor, which must have
     *      been initialized, and does not have to be in the same computing
     *      graph. Its value must be contiguous. It can also be placed on a
     *      different comp node.
     *
     * \return whether memory forwarding succeeds; if false is returned, a
     *      new tensor would be allocated and its value copied from src
     */
    MGB_WARN_UNUSED_RESULT bool reset_dev_tensor_from_other_var(
            VarNode* src_var);

    /*!
     * \brief directly reset device tensor from a given tensor
     *
     * This function should only be called by this var's owner operator and
     * this var must have the NO_SYS_MEM_ALLOC flag.
     *
     * \param value the tensor to be used; it must be contiguous and be
     *      placed on the same comp node as this var.
     */
    VarNode& reset_dev_tensor_from_tensor(const DeviceTensorND& value);

    /*!
     * \brief add a var to which the RT_FORCE_DYNAMIC_MEM_ALLOC flag should be
     *      propagated if such a flag is added to this var
     *
     * The chains form a directed graph, and when RT_FORCE_DYNAMIC_MEM_ALLOC
     * is added to a var by VarNodeMemManager, all nodes in the connected
     * component would receive the flag as well.
     *
     * This method should be called from
     * OperatorNodeBase::init_rt_force_dynamic_mem_alloc_imply_chain impls.
     */
    VarNode& add_rt_force_dynamic_mem_alloc_imply_chain(VarNode* dest);

    /* ===================== graph compiler special ===================== */

    /*!
     * \brief initialize mem plan as a uniquely owned contiguous chunk
     *
     * This function should only be called from
     * OperatorNodeBase::init_output_mem_plan, and shape and comp_node must
     * have been set up.
     *
     * \param fixed_alloc if not null, it should be a tensor providing
     *      memory allocation for this var.
     */
    MemAllocPlan& init_mem_plan(const DeviceTensorND* fixed_alloc = nullptr);
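    /*
     * Dynamic-memory usage sketch (hypothetical code in the execution method
     * of an opr whose output has Flag::NO_SYS_MEM_ALLOC; `MyOpr` and
     * `infer_output_shape()` are illustrative names):
     *
     *     void MyOpr::scn_do_execute() {
     *         TensorShape oshp = infer_output_shape();
     *         output(0)->shape_alloc(oshp);  // set shape and allocate storage
     *         auto&& dst = output(0)->mutable_dev_tensor();
     *         // ... fill dst with the computed result ...
     *     }
     */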
private:
    //! whether its memory should be allocated by the mgb system during graph
    //! execution; initialized in VarNodeMemManager::reset_opr_seq()
    bool m_should_sys_alloc = false;

    bool m_has_name_set = false;

    //! whether to allow the shape to be modified; used by eager const shape
    //! in static infer
    bool m_allow_shape_change = true;

    Maybe<std::string> m_name;
    OperatorNodeBase* const m_owner;
    const void* m_prev_dev_ptr = nullptr;
    Flag m_flag = static_cast<Flag>(0);
    TensorShape m_shape;
    CompNode m_comp_node;
    DeviceTensorND m_dev_tensor;
    MemAllocPlan m_mem_plan{this};
    ThinHashMap<void*, thin_function<void(VarNode*)>> m_shape_update_callback;

    //! synchronizer that is managed by SeqCompNodeOptimizer
    CompNodeSyncManager* m_cn_sync_manager = nullptr;

    /*!
     * used by StaticInferManagerImpl to store the static infer trait
     * associated with this var.
     *
     * Almost every VarNode has an associated TagTraitContainer, so its
     * storage is inlined into VarNode.
     */
    std::tuple<void*, void*> m_static_infer_trait{nullptr, nullptr};

    /*!
     * number of readers that rely on the value of m_dev_tensor, used for
     * dynamic memory management. m_refcnt is initialized as m_refcnt_init.
     * For statically allocated vars and NO_MEM_RECLAIM vars, m_refcnt_init
     * is set to infinity; otherwise it is the total number of outputs of
     * reader oprs that have a DEV_VALUE dep on this var. After completion of
     * each opr on each comp node, or completion of a callback, m_refcnt is
     * decreased; when it reaches zero, m_dev_tensor and m_mem_plan are
     * released.
     */
    std::atomic_size_t m_refcnt{0};
    size_t m_refcnt_init = 0;

    std::vector<VarNode*> m_rt_force_dynamic_mem_alloc_imply_chain;

    void modify_flag(Flag delta, Flag new_flag);

    void assign_dev_tensor_from_tensor(const DeviceTensorND& value);

#if MGB_ENABLE_JSON
    std::shared_ptr<json::Value> dump_static_infer_info_to_json() const;
#endif

    friend class static_infer::StaticInferManagerImpl;
    friend class VarNodeMemManager;
    friend class VarDevMemDefragmenter;
    friend class EagerEvalManager;
    friend class MemAllocPlan;
    friend class imperative::ProxyGraph;
    friend class imperative::proxy_graph::ProxyGraph;
};

enum class VarNode::Flag : uint32_t {
    //! do not allocate memory by the system allocator even if the shape could
    //! be inferred
    NO_SYS_MEM_ALLOC = 1 << 0,

    //! do not allocate memory if the value of this var is not used (i.e.
    //! VarReceiverInfo::value_used() returns false)
    NO_ALLOC_IF_UNUSED = 1 << 1,

    /*!
     * do not allocate memory statically (it would be allocated dynamically if
     * possible); useful if a var in a subgraph would be directly forwarded to
     * a var in the owner graph (e.g. for the LAST output mode in Loop)
     */
    NO_SYS_STATIC_MEM_ALLOC = 1 << 2,

    /*!
     * do not reclaim memory:
     * if NO_SYS_MEM_ALLOC is set or this var has dynamic storage, memory
     * would not be reclaimed after all readers are processed;
     * if this var has static storage, its memory would not be reused by
     * others
     */
    NO_MEM_RECLAIM = 1 << 3,

    /*!
     * var node used as temporary storage, whose content should
     * not be read by others
     */
    VOLATILE_CONTENT = 1 << 4,

    /*!
     * allow this var to have an empty shape, which means it would not consume
     * any memory and has nullptr as the underlying pointer; vars without this
     * flag set would trigger an error during memory allocation to avoid
     * uninitialized output var shapes. This flag should be set by the owner
     * opr.
     */
    ALLOW_EMPTY_SHAPE = 1 << 5,

    /*!
     * value is always available on device even before the opr is executed
     * (e.g. SharedDeviceTensor), so various optimizations can be performed
     */
    PERSISTENT_DEVICE_VALUE = 1 << 6,

    /*!
     * disallow RT_FORCE_DYNAMIC_MEM_ALLOC to be added to this node during
     * memory optimization; this is only applicable when the operator manages
     * memory of this var manually and the memory is never reclaimed. Must be
     * used together with NO_MEM_RECLAIM.
     */
    DISALLOW_RT_FORCE_DYNAMIC_MEM_ALLOC = 1 << 7,

    /*!
     * disable sanity check for this VarNode
     *
     * This flag is added for swap_memory: the SwapInMS opr works as a trigger
     * that makes its output VarNode start copying from the host in parallel;
     * when SwapInMS finishes execute(), its output tensor is likely not to
     * have the exact content yet, so var_sanity_check must be disabled in
     * this case.
     */
    DISALLOW_VAR_SANITY_CHECK = 1 << 8,

    /*!
     * force dynamic memory allocation even if the shape could be statically
     * inferred; conflicts with NO_SYS_MEM_ALLOC
     *
     * Note that this is a runtime flag, which would be cleared and
     * re-evaluated on graph compiling; it is set up by VarNodeMemManager and
     * propagated through the imply chains registered via
     * add_rt_force_dynamic_mem_alloc_imply_chain().
     */
    RT_FORCE_DYNAMIC_MEM_ALLOC = 1 << 9,

    /*!
     * this flag indicates that the opr has been inserted into the graph and
     * certain flags can no longer be modified; only NO_MEM_RECLAIM,
     * NO_SYS_STATIC_MEM_ALLOC and RT_FORCE_DYNAMIC_MEM_ALLOC can be added
     * after FLAG_FREEZED is present
     */
    FLAG_FREEZED = 1 << 10,

    /*!
     * this flag indicates that the data of this var has been processed and is
     * not needed anymore, so it can be freed; this is used in weight
     * preprocessing to save memory
     */
    MEMORY_NO_NEED = 1 << 11,
};

MGB_DEF_ENUM_CLASS_BIT_OPR(VarNode::Flag)

bool VarNode::contain_flag(Flag flag) const {
    return static_cast<bool>(m_flag & flag);
}

using VarNodeSet = ThinHashSet<VarNode*>;

DType MemAllocPlan::dtype() const {
    return m_chunk_storage.owner_var->dtype();
}

TensorFormat MemAllocPlan::format() const {
    return m_chunk_storage.owner_var->format();
}

}  // namespace cg
}  // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
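// Flag usage sketch (hypothetical `var`): the enumerators are bit values and
// can be combined via the operators declared by MGB_DEF_ENUM_CLASS_BIT_OPR:
//     var->add_flag(VarNode::Flag::NO_SYS_MEM_ALLOC | VarNode::Flag::ALLOW_EMPTY_SHAPE);
//     mgb_assert(var->contain_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE));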