Commit a8c75ee5 authored by: M Megvii Engine Team

refactor(mge/functional): make docs better

GitOrigin-RevId: acb287d48b51024a4450f516565d8c5ed8ad2e9f
Parent 6c1dbd40
......@@ -14,7 +14,7 @@ from .math import *
from .nn import *
from .quantized import conv_bias_activation
from .tensor import *
from .utils import accuracy, copy, zero_grad
from .utils import accuracy, copy
from . import distributed # isort:skip
......
......@@ -33,6 +33,7 @@ __all__ = [
"eq",
"exp",
"expm1",
"fast_tanh",
"floor",
"floor_div",
"gt",
......@@ -67,7 +68,6 @@ __all__ = [
"sub",
"tan",
"tanh",
"fast_tanh",
]
......@@ -108,13 +108,37 @@ def _elemwise_multi_type(*args, mode, **kwargs):
def add(x, y):
"""Element-wise addition.
At least one operand should be tensor.
same for sub/mul/div/floor_div/pow/mod/atan2/eq/ne/lt/le/gt/ge/maximum/minmium.
Same for sub/mul/div/floor_div/pow/mod/atan2/eq/ne/lt/le/gt/ge/maximum/minimum.
:param x: input tensor.
:return: computed tensor.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.add(x, y)
print(out.numpy())
Outputs:
.. testoutput::
[[ 0. 2. 4.]
[ 6. 8. 10.]]
"""
return _elwise(x, y, mode="add")
def sub(x, y):
"""Element-wise subtract."""
"""Element-wise subtraction."""
return _elwise(x, y, mode="sub")
......@@ -173,24 +197,23 @@ def log1p(x):
return _elwise(x, mode="log1p")
def sqrt(inp: Tensor) -> Tensor:
"""
Return a new tensor with the square-root of the elements of ``inp``.
For negative value, return nan.
def sqrt(x: Tensor) -> Tensor:
"""Element-wise sqrt.
For negative input values, returns ``NaN``.
:param inp: The input tensor
:return: The computed tensor
:param x: input tensor.
:return: computed tensor.
Examples:
.. testcode::
import numpy as np
import megengine as mge
from megengine import tensor
import megengine.functional as F
data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.sqrt(data)
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.sqrt(x)
print(out.numpy())
Outputs:
......@@ -201,12 +224,12 @@ def sqrt(inp: Tensor) -> Tensor:
[1.7321 2. 2.2361]]
"""
return inp ** 0.5
return x ** 0.5
def square(inp: Tensor) -> Tensor:
def square(x: Tensor) -> Tensor:
"""
Return a new tensor with the square of the elements of ``inp``
Return a new tensor with the square of the elements of input tensor.
:param inp: The input tensor
:return: The computed tensor
......@@ -231,92 +254,129 @@ def square(inp: Tensor) -> Tensor:
[ 9. 16. 25.]]
"""
return inp ** 2
return x ** 2
def round(x):
"""Round tensor to int element-wise."""
"""Element-wise rounding to int."""
return _elwise(x, mode="round")
def ceil(x):
"""Return the ceil of the input, element-wise."""
"""Element-wise ceiling."""
return _elwise(x, mode="ceil")
def floor(x):
"""Calculate the floor element-wise"""
"""Element-wise floor."""
return _elwise(x, mode="floor")
def maximum(x, y):
"""Element-wise maximum of array elements."""
return _elwise(x, y, mode="max")
def minimum(x, y):
"""Element-wise minimum of array elements."""
return _elwise(x, y, mode="min")
# trigonometric functions
def cos(x):
"""Cosine, element-wise."""
"""Element-wise cosine.
:param x: input tensor.
:return: computed tensor.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.cos(x)
print(out.numpy())
Outputs:
.. testoutput::
[[ 1. 0.5403 -0.4161]
[-0.99 -0.6536 0.2837]]
"""
return _elwise(x, mode="cos")
def sin(x):
"""Sine, element-wise."""
"""Element-wise sine."""
return _elwise(x, mode="sin")
def tan(x):
"""Element-wise tangent."""
return sin(x) / cos(x)
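# Note: tan is realized here as the ratio sin(x) / cos(x); the result grows
# without bound as cos(x) approaches zero (near odd multiples of pi/2).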
def acos(x):
"""Inverse cosine, element-wise."""
"""Element-wise inverse cosine."""
return _elwise(x, mode="acos")
def asin(x):
"""Inverse sine, element-wise."""
"""Element-wise inverse sine."""
return _elwise(x, mode="asin")
def atan(x):
"""Element-wise inverse tangent."""
return _elwise(x, 1, mode="atan2")
def atan2(y, x):
"""Element-wise 2-argument arctangent."""
return _elwise(y, x, mode="atan2")
def cosh(x):
r"""Compute element-wise hyperbolic cosine."""
r"""Element-wise hyperbolic cosine."""
return 0.5 * (exp(x) + exp(-x))
def sinh(x):
r"""Compute element-wise hyperbolic sine."""
r"""Element-wise hyperbolic sine."""
u = expm1(x)
return 0.5 * u / (u + 1) * (u + 2)
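# Derivation of the expm1-based form above (reduces cancellation for small |x|):
# with u = expm1(x) = e**x - 1 we have e**(-x) = 1 / (u + 1), so
#   sinh(x) = (e**x - e**(-x)) / 2
#           = 0.5 * ((u + 1) - 1 / (u + 1))
#           = 0.5 * u * (u + 2) / (u + 1)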
def tanh(x):
r"""Compute element-wise hyperbolic tangent."""
r"""Element-wise hyperbolic tangent."""
return _elwise(x, mode="tanh")
def asinh(x):
r"""Compute element-wise inverse hyperbolic sine."""
r"""Element-wise inverse hyperbolic sine."""
return log(x + (x ** 2 + 1) ** 0.5)
def acosh(x):
r"""Compute element-wise inverse hyperbolic cosine."""
r"""Element-wise inverse hyperbolic cosine."""
return log(x + (x ** 2 - 1) ** 0.5)
def atanh(x):
r"""Compute element-wise inverse hyperbolic tangent."""
r"""Element-wise inverse hyperbolic tangent."""
return log1p(2 * x / (1 - x)) / 2
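# Derivation of the log1p-based form above:
#   atanh(x) = 0.5 * ln((1 + x) / (1 - x))
# and (1 + x) / (1 - x) = 1 + 2 * x / (1 - x), hence the log1p call.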
def fast_tanh(x):
r"""Compute element-wise fast tanh; this is an approximation:
r"""Element-wise fast tanh; this is an approximation:
.. math::
\text{fast_tanh}(x) = x * (27. + x * x) / (27. + 9. * x * x)
......@@ -328,29 +388,60 @@ def fast_tanh(x):
def left_shift(x, y):
"""Element-wise bitwise binary: x << y.
:param x: input tensor, should be int.
:param y: number of bits to shift left.
:return: computed tensor.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.arange(0, 6, dtype=np.int32).reshape(2, 3))
out = F.left_shift(x, 2)
print(out.numpy())
Outputs:
.. testoutput::
[[ 0 4 8]
[12 16 20]]
"""
return _elwise(x, y, mode="shl")
def right_shift(x, y):
return _elwise(x, y, mode="shl")
"""Element-wise bitwise binary: x >> y."""
return _elwise(x, y, mode="shr")
# logical functions
def logical_and(x, y):
"""Element-wise logical and: x && y."""
return _elwise(x, y, mode="AND")
def logical_not(x):
"""Element-wise logical not: ~x."""
return _elwise(x, mode="NOT")
def logical_or(x, y):
"""Element-wise logical or: x || y."""
return _elwise(x, y, mode="OR")
def logical_xor(x, y):
"""Element-wise logical xor: x ^ y."""
return _elwise(x, y, mode="XOR")
......@@ -358,72 +449,112 @@ def logical_xor(x, y):
def eq(x, y):
"""Return (x == y) element-wise."""
"""Element-wise (x == y).
:param x: input tensor 1.
:param y: input tensor 2.
:return: computed tensor.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.eq(x, y)
print(out.numpy())
Outputs:
.. testoutput::
[[1. 1. 1.]
[1. 1. 1.]]
"""
return _elwise(x, y, mode="eq")
def ne(x, y):
"""Element-wise (x != y)."""
return x != y
def lt(x, y):
"""Return (x < y) element-wise."""
"""Element-wise (x < y)."""
return _elwise(x, y, mode="lt")
def le(x, y):
"""Return (x =< y) element-wise."""
"""Element-wise (x <= y)."""
return _elwise(x, y, mode="leq")
def gt(x, y):
"""Return (x > y) element-wise."""
"""Element-wise (x > y)."""
return _elwise(y, x, mode="lt")
def ge(x, y):
"""Return (x >= y) element-wise"""
"""Element-wise (x >= y)."""
return _elwise(y, x, mode="leq")
# other functions
def hswish(x):
"""Return x * relu6(x + 3) / 6 element-wise"""
"""Element-wise x * relu6(x + 3) / 6.
:param x: input tensor.
:return: computed tensor.
Example:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.arange(5).astype(np.float32))
out = F.hswish(x)
print(out.numpy())
.. testoutput::
[0. 0.6667 1.6667 3. 4. ]
"""
return _elwise(x, mode="h_swish")
def hsigmoid(x):
"""Return relu6(x + 3) / 6 element-wise"""
"""Element-wise relu6(x + 3) / 6."""
return relu6(x + 3) / 6
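# Illustrative values (same input as the hswish example above; rounded):
#   F.hsigmoid(tensor(np.arange(5).astype(np.float32))) -> [0.5, 0.6667, 0.8333, 1.0, 1.0]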
def relu(x):
"""Return `max(x, 0)` element-wise."""
"""Element-wise `max(x, 0)`."""
return _elwise(x, mode="relu")
def relu6(x):
"""Return min(max(x, 0), 6) element-wise."""
"""Element-wise min(max(x, 0), 6)."""
return minimum(maximum(x, 0), 6)
def sigmoid(x):
"""Return 1 / ( 1 + exp( -x ) ) element-wise."""
"""Element-wise 1 / ( 1 + exp( -x ) )."""
return _elwise(x, mode="sigmoid")
def maximum(x, y):
"""Element-wise maximum of array elements."""
return _elwise(x, y, mode="max")
def minimum(x, y):
"""Element-wise minimum of array elements."""
return _elwise(x, y, mode="min")
def clamp(inp: Tensor, lower=None, upper=None) -> Tensor:
r"""
Clamp all elements in :attr:`inp` into the range `[` :attr:`lower`, :attr:`upper` `]` and return
def clamp(x: Tensor, lower=None, upper=None) -> Tensor:
r"""Clamps all elements in input tensor into the range `[` :attr:`lower`, :attr:`upper` `]` and returns
a resulting tensor:
.. math::
......@@ -433,9 +564,10 @@ def clamp(inp: Tensor, lower=None, upper=None) -> Tensor:
\text{upper} & \text{if } x_i > \text{upper}
\end{cases}
:param inp: the input tensor.
:param lower: lower-bound of the range to be clamped to
:param upper: upper-bound of the range to be clamped to
:param x: input tensor.
:param lower: lower-bound of the range to be clamped to.
:param upper: upper-bound of the range to be clamped to.
:return: output clamped tensor.
Examples:
......@@ -444,12 +576,10 @@ def clamp(inp: Tensor, lower=None, upper=None) -> Tensor:
import numpy as np
from megengine import tensor
import megengine.functional as F
a = tensor(np.arange(5).astype(np.int32))
a = tensor(np.arange(5).astype(np.int32))
print(F.clamp(a, 2, 4).numpy())
print(F.clamp(a, lower=3).numpy())
print(F.clamp(a, upper=3).numpy())
Outputs:
......@@ -467,8 +597,8 @@ def clamp(inp: Tensor, lower=None, upper=None) -> Tensor:
if lower is not None:
if upper is not None:
assert lower <= upper, "clamp lower bound is bigger than upper bound"
return minimum(maximum(inp, lower), upper)
return minimum(maximum(x, lower), upper)
else:
return maximum(inp, lower)
return maximum(x, lower)
else:
return minimum(inp, upper)
return minimum(x, upper)
......@@ -9,22 +9,22 @@
# pylint: disable=too-many-lines
from typing import List
from ..core import Tensor
from ..tensor import Tensor
def cambricon_subgraph(
inputs: List[Tensor], data: bytes, symbol: str, tensor_dim_mutable: bool,
) -> List[Tensor]:
"""Load a serialized Cambricon subgraph (i.e. cnrtModel_t) and
"""Loads a serialized Cambricon subgraph (i.e. cnrtModel_t) and
execute the operations defined in the subgraph.
:param inputs: List of input tensors of the subgraph.
:param data: The serialized subgraph.
:param symbol: The name of the function in the subgraph.
:param inputs: list of input tensors of the subgraph.
:param data: the serialized subgraph.
:param symbol: the name of the function in the subgraph.
The function is corresponding to a cnmlFusionOp
which is added to the cnmlModel_t/cnrtModel_t.
:param tensor_dim_mutable: Whether the input tensors' shapes are mutalbe
in cnrtModel_t
:param tensor_dim_mutable: whether the input tensors' shapes are mutable
in cnrtModel_t.
"""
raise NotImplementedError
......@@ -32,13 +32,13 @@ def cambricon_subgraph(
def extern_opr_subgraph(
inputs, output_shapes: List[tuple], dump_name: str, dump_data: bytes,
) -> List[Tensor]:
"""Load a serialized extern opr subgraph and fake execute the operator
"""Loads a serialized extern opr subgraph and fake execute the operator.
:param inputs: Tensor or list of input tensors.
:param output_shapes: The output shapes.
:param dump_name: The serialized subgraph name.
:param dump_data: The serialized subgraph.
:param inputs: tensor or list of input tensors.
:param output_shapes: the output shapes.
:param dump_name: the serialized subgraph name.
:param dump_data: the serialized subgraph.
:return: List of tensors
:return: list of tensors.
"""
raise NotImplementedError
......@@ -9,7 +9,7 @@
import collections
from typing import Iterable, Optional, Union
from ..core.tensor import Tensor
from ..tensor import Tensor
def add_update(
......@@ -20,7 +20,7 @@ def add_update(
beta: Union[Tensor, float, int] = 1.0,
bias: Union[Tensor, float, int] = 0.0
):
r"""Inplace modify ``dest`` as follows:
r"""Modify ``dest`` inplace as follows:
.. math::
dest = alpha * dest + beta * delta + bias
......
......@@ -11,9 +11,8 @@ import numpy as np
from ..core.tensor.utils import make_shape_tuple
from ..tensor import Tensor
from .elemwise import abs, eq, exp, log, maximum, pow, relu
from .nn import assert_equal, indexing_one_hot
from .nn import indexing_one_hot
from .tensor import where
from .utils import zero_grad
__all__ = [
"l1_loss",
......@@ -25,8 +24,7 @@ __all__ = [
def l1_loss(pred: Tensor, label: Tensor) -> Tensor:
r"""
Calculates the mean absolute error (MAE) between
r"""Calculates the mean absolute error (MAE) between
each element in the pred :math:`x` and label :math:`y`.
The mean absolute error can be described as:
......@@ -43,8 +41,9 @@ def l1_loss(pred: Tensor, label: Tensor) -> Tensor:
:math:`x` and :math:`y` are tensors of arbitrary shapes with a total
of :math:`N` elements each. :math:`N` is the batch size.
:param pred: The predicted result from model.
:param label: The ground truth to compare.
:param pred: predicted result from model.
:param label: ground truth to compare.
:return: loss value.
Examples:
......@@ -53,9 +52,10 @@ def l1_loss(pred: Tensor, label: Tensor) -> Tensor:
import numpy as np
import megengine as mge
import megengine.functional as F
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.l1_loss(ipt,tgt)
loss = F.l1_loss(ipt, tgt)
print(loss.numpy())
Outputs:
......@@ -70,8 +70,7 @@ def l1_loss(pred: Tensor, label: Tensor) -> Tensor:
def square_loss(pred: Tensor, label: Tensor) -> Tensor:
r"""
Calculates the mean squared error (squared L2 norm) between
r"""Calculates the mean squared error (squared L2 norm) between
each element in the pred :math:`x` and label :math:`y`.
The mean squared error can be described as:
......@@ -88,13 +87,33 @@ def square_loss(pred: Tensor, label: Tensor) -> Tensor:
:math:`x` and :math:`y` are tensors of arbitrary shapes with a total
of :math:`N` elements each. :math:`N` is the batch size.
:param pred: The predicted result from model.
:param label: The ground truth to compare.
:param pred: predicted result from model.
:param label: ground truth to compare.
:return: loss value.
Shape:
- pred: :math:`(N, *)` where :math:`*` means any number of additional
dimensions
- label: :math:`(N, *)`. Same shape as ``pred``
dimensions.
- label: :math:`(N, *)`. Same shape as ``pred``.
Examples:
.. testcode::
import numpy as np
import megengine as mge
import megengine.functional as F
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.square_loss(ipt, tgt)
print(loss.numpy())
Outputs:
.. testoutput::
[9.75]
"""
diff = pred - label
......@@ -104,8 +123,7 @@ def square_loss(pred: Tensor, label: Tensor) -> Tensor:
def cross_entropy_with_softmax(
pred: Tensor, label: Tensor, axis: int = 1, label_smooth: float = 0
) -> Tensor:
r"""
Returns loss after applying :func:`~.softmax` + :func:`~.cross_entropy`.
r"""Returns loss after applying :func:`~.softmax` + :func:`~.cross_entropy`.
It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`.
......@@ -116,10 +134,33 @@ def cross_entropy_with_softmax(
where :math:`y^{LS}` and :math:`y` are new label distribution and origin label distribution respectively.
k is the index of label distribution. :math:`\alpha` is label_smooth and :math:`K` is the number of classes.
:param pred: The input tensor representing the predicted probability.
:param label: The input tensor representing the classification label.
:param axis: An axis along which softmax will be applied. Default: 1.
:param label_smooth: A label smoothing of parameter that can re-distribute target distribution. Default: 0.
:param pred: input tensor representing the predicted probability.
:param label: input tensor representing the classification label.
:param axis: an axis along which softmax will be applied. Default: 1
:param label_smooth: a label smoothing of parameter that can re-distribute target distribution. Default: 0
:return: loss value.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
data_shape = (1, 2)
label_shape = (1, )
pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(data_shape))
label = tensor(np.ones(label_shape, dtype=np.int32))
loss = F.cross_entropy_with_softmax(pred, label)
print(loss.numpy())
Outputs:
.. testoutput::
[0.6931]
"""
n0 = pred.ndim
n1 = label.ndim
......@@ -147,26 +188,44 @@ def cross_entropy_with_softmax(
def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor:
r"""Function that measures the Binary Cross Entropy between the target and the prediction.
:param pred: (N,*) where * means, any number of additional dimensions.
:param label: (N,*), same shape as the input.
:param pred: `(N, *)` where `*` means any number of additional dimensions.
:param label: `(N, *)`, same shape as the input.
:return: loss value.
"""
assert make_shape_tuple(pred.shape) == make_shape_tuple(label.shape)
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(1, 2))
label = tensor(np.ones((1, 2), dtype=np.float32))
loss = F.binary_cross_entropy(pred, label)
print(loss.numpy())
Outputs:
.. testoutput::
[0.6931]
"""
return -1.0 * (label * log(pred) + (1.0 - label) * log(1 - pred)).mean()
def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor:
r"""
Caculate the hinge loss which is often used in SVMs.
r"""Caculate the hinge loss which is often used in SVMs.
The hinge loss can be described as:
.. math:: loss(x, y) = \frac{1}{N}\sum_i\sum_j(max(0, 1 - x_i_j*y_i_j))
.. math:: loss(x, y) = \frac{1}{N}\sum_i\sum_j(\max(0, 1 - x_{ij}*y_{ij}))
:param pred: The input tensor representing the predicted probability, shape is (N, C).
:param label: The input tensor representing the binary classification label, shape is (N, C).
:param norm: Specify the norm to caculate the loss, should be "L1" or "L2".
:param pred: input tensor representing the predicted probability, shape is `(N, C)`.
:param label: input tensor representing the binary classification label, shape is `(N, C)`.
:param norm: specify the norm to calculate the loss, should be "L1" or "L2".
:return: loss value.
Examples:
......@@ -177,9 +236,7 @@ def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor:
pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32")
label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32")
loss = F.hinge_loss(pred, label)
print(loss.numpy())
Outputs:
......
......@@ -18,7 +18,7 @@ from ..core.tensor import utils
from ..core.tensor.core import apply
from ..tensor import Tensor
from .elemwise import clamp, exp, log, log1p
from .tensor import remove_axis, reshape
from .tensor import add_axis, remove_axis, reshape
__all__ = [
"argmax",
......@@ -42,10 +42,10 @@ __all__ = [
def isnan(inp: Tensor) -> Tensor:
r"""Returns a new tensor representing if each element is NaN or not.
r"""Returns a new tensor representing if each element is ``NaN`` or not.
:param: inp
:return: a new tensor representing if each element in :attr:`inp` is NaN or not.
:param inp: input tensor.
:return: a new tensor representing if each element in inp is NaN or not.
Examples:
......@@ -55,7 +55,6 @@ def isnan(inp: Tensor) -> Tensor:
import megengine.functional as F
x = tensor([1, float("nan"), 0])
print(F.isnan(x).numpy())
Outputs:
......@@ -69,10 +68,10 @@ def isnan(inp: Tensor) -> Tensor:
def isinf(inp: Tensor) -> Tensor:
r"""Returns a new tensor representing if each element is Inf or not.
r"""Returns a new tensor representing if each element is ``Inf`` or not.
:param: inp
:return: a new tensor representing if each element in :attr:`inp` is Inf or not.
:param inp: input tensor.
:return: a new tensor representing if each element in inp is Inf or not.
Examples:
......@@ -82,7 +81,6 @@ def isinf(inp: Tensor) -> Tensor:
import megengine.functional as F
x = tensor([1, float("inf"), 0])
print(F.isinf(x).numpy())
Outputs:
......@@ -96,10 +94,10 @@ def isinf(inp: Tensor) -> Tensor:
def sign(inp: Tensor):
r"""Returns sign of each element in the input tensor.
r"""Returns a new tensor representing the sign of each element in input tensor.
:param: inp
:return: a sign tensor.
:param inp: input tensor.
:return: the sign of input tensor.
Examples:
......@@ -109,8 +107,9 @@ def sign(inp: Tensor):
import megengine.functional as F
x = tensor([1, -1, 0])
print(F.sign(x).numpy())
Outputs:
.. testoutput::
......@@ -125,14 +124,15 @@ def sum(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""Returns the sum of each row of the ``inp`` tensor in the given ``axis``.
r"""Returns the sum of input tensor along given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor.
:param axis: The dimension to reduce. If None, all the dimensions will be reduced.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced.
Default: None
:param keepdims: Whether the output tensor has ``axis`` retained or not.
:param keepdims: whether the output tensor has axis retained or not.
Default: False
:return: The output tensor
:return: output tensor.
Examples:
......@@ -142,12 +142,12 @@ def sum(
from megengine import tensor
import megengine.functional as F
data = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3))
out = F.sum(data)
x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3))
out = F.sum(x)
print(out.numpy())
Outputs:
.. testoutput::
[21]
......@@ -159,13 +159,13 @@ def sum(
def prod(
inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None, keepdims=False
) -> Tensor:
r"""
Returns the element product of input tensor along given *axis*.
r"""Returns the product of input tensor along given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor
:param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: ``None``
:param keepdims: Whether the output tensor has *axis* retained or not. Default: ``False``
:return: The output tensor
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
Examples:
......@@ -175,8 +175,8 @@ def prod(
from megengine import tensor
import megengine.functional as F
data = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3))
out = F.prod(data)
x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3))
out = F.prod(x)
print(out.numpy())
Outputs:
......@@ -194,13 +194,14 @@ def mean(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
"""Returns the mean value of each row of the ``inp`` tensor in
the given ``axis``. If axis is a list of dimensions,
"""Returns the mean value of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor
:param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: Whether the output tensor has ``axis`` retained or not. Default: False
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
Examples:
......@@ -210,8 +211,8 @@ def mean(
from megengine import tensor
import megengine.functional as F
data = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3))
out = F.mean(data)
x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3))
out = F.mean(x)
print(out.numpy())
Outputs:
......@@ -224,27 +225,19 @@ def mean(
return inp.astype("float32").mean(axis=axis, keepdims=keepdims)
def median(
inp: Tensor,
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
raise NotImplementedError
def var(
inp: Tensor,
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
"""Returns the variance value of input tensor along
given ``axis``. If axis is a list of dimensions,
given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor.
:param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: ``None``.
:param keepdims: Whether the output tensor has ``axis`` retained or not. Default: ``False``.
:return: The output tensor.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
Examples:
......@@ -278,13 +271,13 @@ def std(
keepdims: bool = False,
) -> Tensor:
"""Returns the standard deviation of input tensor along
given ``axis``. If axis is a list of dimensions,
given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor.
:param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: ``None``.
:param keepdims: Whether the output tensor has ``axis`` retained or not. Default: ``False``.
:return: The output tensor.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
Examples:
......@@ -312,13 +305,14 @@ def min(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""
Returns the min value of input tensor along given *axis*.
r"""Returns the min value of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor
:param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: Whether the output tensor has *axis* retained or not. Default: False
:return: The output tensor
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
Examples:
......@@ -329,8 +323,8 @@ def min(
import megengine.functional as F
x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
y = F.min(x)
print(y.numpy())
out = F.min(x)
print(out.numpy())
Outputs:
......@@ -347,12 +341,14 @@ def max(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""Returns the max value of the input tensor along given *axis*.
r"""Returns the max value of the input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor
:param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: Whether the output tensor has *axis* retained or not. Default: False
:return: The output tensor
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
Examples:
......@@ -363,8 +359,8 @@ def max(
import megengine.functional as F
x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
y = F.max(x)
print(y.numpy())
out = F.max(x)
print(out.numpy())
Outputs:
......@@ -382,13 +378,15 @@ def norm(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims=False,
):
"""Calculate ``p``-norm of input tensor along certain axis.
"""Calculates ``p``-norm of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor
:param p: power of value ``p`` applied to ``inp``. Default: 2
:param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: Whether the output tensor has ``axis`` retained or not. Default: False
:return: The output tensor
:param inp: input tensor.
:param p: power of value applied to inp. Default: 2
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
Examples:
......@@ -399,8 +397,8 @@ def norm(
import megengine.functional as F
x = tensor(np.arange(-3, 3, dtype=np.float32).reshape(2,3))
y = F.norm(x)
print(y.numpy())
out = F.norm(x)
print(out.numpy())
Outputs:
......@@ -423,12 +421,14 @@ def argmin(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""Returns the indices of the minimum values along an axis
r"""Returns the indices of the minimum values along
given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor
:param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: Whether the output tensor has *axis* retained or not. Default: False
:return: The output tensor
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
Examples:
......@@ -439,8 +439,8 @@ def argmin(
import megengine.functional as F
x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
y = F.argmin(x)
print(y.numpy())
out = F.argmin(x)
print(out.numpy())
Outputs:
......@@ -479,12 +479,14 @@ def argmax(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""Returns the indices of the maximum values along an axis
r"""Returns the indices of the maximum values along
given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: The input tensor
:param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: Whether the output tensor has *axis* retained or not. Default: False
:return: The output tensor
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
Examples:
......@@ -495,9 +497,9 @@ def argmax(
import megengine.functional as F
x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
y = F.argmax(x)
print(y.numpy())
out = F.argmax(x)
print(out.numpy())
Outputs:
.. testoutput::
......@@ -536,21 +538,22 @@ def normalize(
axis: Optional[Union[int, Sequence[int]]] = None,
eps: float = 1e-12,
) -> Tensor:
r"""Perform :math:`L_p` normalization of input tensor along certain axis.
r"""Performs :math:`L_p` normalization of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
For a tensor :attr:`inp` of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each
For a tensor inp of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each
:math:`n_{dim}` -element vector :math:`v` along dimension :attr:`axis` is transformed as:
.. math::
v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}.
:param inp: the input tensor
:param p: power of value ``p`` applied to ``inp``. Default: 2
:param axis: The dimension to reduce. If None, all the dimensions will be reduced
:param inp: input tensor.
:param p: power of value applied to inp. Default: 2
:param axis: dimension to reduce. If None, all the dimensions will be reduced
to calculate the norm. Default: None
:param eps: a small value to avoid division by zero. Default: 1e-12
:return: the normalized output tensor
:return: normalized output tensor.
"""
if axis is None:
return inp / clamp(norm(inp, p, axis), lower=eps)
......@@ -559,12 +562,11 @@ def normalize(
def argsort(inp: Tensor, descending: bool = False) -> Tensor:
r"""
Sort the target 2d matrix by row, return both the sorted tensor and indices.
r"""Sorts the target 2d matrix by row, return both the sorted tensor and indices.
:param inp: The input tensor, if 2d, each row will be sorted
:param descending: Sort in descending order, where the largest comes first. Default: ``False``
:return: Tuple of two tensors (sorted_tensor, indices_of_int32)
:param inp: input tensor, if 2d, each row will be sorted.
:param descending: Sort in descending order, where the largest comes first. Default: False
:return: Tuple of two tensors `(sorted_tensor, indices_of_int32)`.
Examples:
......@@ -573,8 +575,9 @@ def argsort(inp: Tensor, descending: bool = False) -> Tensor:
import numpy as np
from megengine import tensor
import megengine.functional as F
data = tensor(np.array([1,2], dtype=np.float32))
indices = F.argsort(data)
x = tensor(np.array([1,2], dtype=np.float32))
indices = F.argsort(x)
print(indices.numpy())
Outputs:
......@@ -622,15 +625,14 @@ def topk(
kth_only: bool = False,
no_sort: bool = False,
) -> Tuple[Tensor, Tensor]:
r"""
Selected the Top-K (by default) smallest elements of 2d matrix by row.
r"""Selects the ``Top-K(by default)`` smallest elements of 2d matrix by row.
:param inp: The input tensor, if 2d, each row will be sorted
:param k: The number of elements needed
:param descending: If true, return the largest elements instead. Default: ``False``
:param kth_only: If true, only the k-th element will be returned. Default: ``False``
:param no_sort: If true, the returned elements can be unordered. Default: ``False``
:return: Tuple of two tensors (topk_tensor, indices_of_int32)
:param inp: input tensor, if 2d, each row will be sorted.
:param k: number of elements needed.
:param descending: if true, return the largest elements instead. Default: False
:param kth_only: if true, only the k-th element will be returned. Default: False
:param no_sort: if true, the returned elements can be unordered. Default: False
:return: tuple of two tensors `(topk_tensor, indices_of_int32)`.
Examples:
......@@ -639,8 +641,9 @@ def topk(
import numpy as np
from megengine import tensor
import megengine.functional as F
data = tensor(np.array([2, 4, 6, 8, 7, 5, 3, 1], dtype=np.float32))
top, indices = F.topk(data, 5)
x = tensor(np.array([2, 4, 6, 8, 7, 5, 3, 1], dtype=np.float32))
top, indices = F.topk(x, 5)
print(top.numpy(), indices.numpy())
Outputs:
......
......@@ -29,32 +29,29 @@ def conv_bias_activation(
conv_mode="CROSS_CORRELATION",
compute_mode="DEFAULT",
) -> Tensor:
""" convolution bias with activation operation, only for inference.
"""Convolution bias with activation operation, only for inference.
:param inp: The feature map of the convolution operation
:param weight: The convolution kernel
:param bias: The bias added to the result of convolution
:param stride: Stride of the 2D convolution operation. Default: 1
:param padding: Size of the paddings added to the input on both sides of its
:param inp: feature map of the convolution operation.
:param weight: convolution kernel.
:param bias: bias added to the result of convolution.
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: Dilation of the 2D convolution operation. Default: 1
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups to divide input and output channels into,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be ``(groups, out_channel // groups,
in_channels // groups, height, width)``.
:type conv_mode: string or :class:`P.Convolution.Mode`
:param conv_mode: Supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default:
'CROSS_CORRELATION'.
:param dtype: Support for np.dtype, Default:
np.int8.
:param scale: scale if use quantization, Default:
0.0.
:param zero_point: scale if use quantization quint8, Default:
0.0.
so as to perform a "grouped convolution". When groups is not 1,
in_channels and out_channels must be divisible by groups,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
:type conv_mode: string or :class:`P.Convolution.Mode`.
:param conv_mode: supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default:
'CROSS_CORRELATION'
:param dtype: support for np.dtype. Default: np.int8
:param scale: scale if use quantization. Default: 0.0
:param zero_point: zero point if use quint8 quantization. Default: 0.0
:type compute_mode: string or
:class:`P.Convolution.ComputeMode`
:param compute_mode: When set to 'DEFAULT', no special requirements will be
:class:`P.Convolution.ComputeMode`.
:param compute_mode: when set to 'DEFAULT', no special requirements will be
placed on the precision of intermediate results. When set to 'FLOAT32',
Float32 would be used for accumulator and intermediate result, but only
effective when input and output are of Float16 dtype.
......
......@@ -36,12 +36,14 @@ __all__ = [
"broadcast",
"concat",
"cond_take",
"dimshuffle",
"expand_dims",
"transpose",
"add_axis",
"eye",
"flatten",
"full",
"full_like",
"gather",
"identity",
"linspace",
"ones",
"ones_like",
......@@ -50,7 +52,6 @@ __all__ = [
"reshape",
"remove_axis",
"split",
"squeeze",
"stack",
"scatter",
"transpose",
......@@ -60,16 +61,14 @@ __all__ = [
]
def eye(n: int, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor:
"""
Returns a 2D tensor with ones on the diagonal and zeros elsewhere.
def eye(shape, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor:
"""Returns a 2D tensor with ones on the diagonal and zeros elsewhere.
:param n: The number of rows
:param m: The number of columns. Default: None
:param dtype: The data type. Default: None
:param device: Compute node of the matrix. Default: None
:param comp_graph: Compute graph of the matrix. Default: None
:return: The eye matrix
:param shape: expected shape of output tensor; a pair `(n, m)` gives `n` rows and `m` columns.
:param dtype: data type. Default: float32
:param device: compute node of the matrix. Default: None
:return: eye matrix.
Examples:
......@@ -79,8 +78,7 @@ def eye(n: int, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor
import megengine.functional as F
data_shape = (4, 6)
n, m = data_shape
out = F.eye([n, m], dtype=np.float32)
out = F.eye(data_shape, dtype=np.float32)
print(out.numpy())
Outputs:
......@@ -94,11 +92,13 @@ def eye(n: int, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor
"""
op = builtin.Eye(k=0, dtype=dtype, comp_node=device)
(result,) = apply(op, Tensor(n, dtype="int32", device=device))
(result,) = apply(op, Tensor(shape, dtype="int32", device=device))
return result
def full(shape, value, dtype="float32", device=None):
"""Returns a tensor with given shape and value.
"""
if isinstance(shape, int):
shape = (shape,)
if device is None:
......@@ -110,18 +110,42 @@ def full(shape, value, dtype="float32", device=None):
def ones(shape, dtype="float32", device=None):
"""Returns a ones tensor with given shape.
:param shape: expected shape of output tensor.
:return: output ones tensor.
Examples:
.. testcode::
import megengine.functional as F
out = F.ones((2, 1))
print(out.numpy())
Outputs:
.. testoutput::
[[1.]
[1.]]
"""
return full(shape, 1.0, dtype=dtype, device=device)
def zeros(shape, dtype="float32", device=None):
"""Returns a zero tensor with given shape.
"""
return full(shape, 0.0, dtype=dtype, device=device)
def zeros_like(inp: Tensor) -> Tensor:
r"""
Returns a zero tensor with the same shape as input tensor
"""Returns a zero tensor with the same shape as input tensor.
:param inp: input tensor
:param inp: input tensor.
:return: output zero tensor.
Examples:
......@@ -147,26 +171,36 @@ def zeros_like(inp: Tensor) -> Tensor:
def ones_like(inp: Tensor) -> Tensor:
r"""
Returns a identity tensor with the same shape as input tensor
"""Returns a identity tensor with the same shape as input tensor.
"""
return ones(inp.shape, dtype=inp.dtype, device=inp.device)
def full_like(inp: Tensor, value: Union[int, float]) -> Tensor:
r"""
Returns a tensor filled with value val with the same shape as input tensor
"""Returns a tensor filled with given value with the same shape as input tensor.
"""
return full(inp.shape, value, dtype=inp.dtype, device=inp.device)
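# Minimal usage sketch (illustration only): F.full_like(tensor([[1, 2], [3, 4]]), 7)
# yields a tensor of the same (2, 2) shape and dtype, filled with 7.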
def identity(inp: Tensor) -> Tensor:
"""Applies an identity transform to the input tensor.
:param inp: input tensor.
:return: output tensor.
"""
op = builtin.Identity()
(data,) = utils.convert_inputs(inp)
(output,) = apply(op, data)
return output
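# Usage note: identity forwards its input through an Identity op unchanged, e.g.
# F.identity(tensor([1, 2])) holds the same values [1 2]; it can serve as a
# no-op placeholder in a graph.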
def broadcast(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor:
"""
Broadcast a tensor to ``shape``
Broadcasts a tensor to given shape.
:param inp: The input tensor
:param shape: The target shape
:return: The output tensor
:param inp: input tensor.
:param shape: target shape.
:return: output tensor.
Examples:
......@@ -206,10 +240,10 @@ def concat(inps: Iterable[Tensor], axis: int = 0, device=None) -> Tensor:
r"""
Concatenates some tensors.
:param inps: Input tensors to concat
:param axis: the dimension over which the tensors are concatenated. Default: 0
:param device: The comp node output on. Default: None
:return: The output tensor
:param inps: input tensors to concat.
:param axis: dimension over which the tensors are concatenated. Default: 0
:param device: comp node the output will be on. Default: None
:return: output tensor.
Examples:
......@@ -254,10 +288,10 @@ def stack(inps, axis=0, device=None):
"""Concats a sequence of tensors along a new axis.
The input tensors must have the same shape.
:param inps: The input tensors.
:param axis: Which axis will be concatenated.
:param inps: input tensors.
:param axis: which axis will be concatenated.
:param device: The comp node the output will be on. Default: None
:return: The output concatenated tensor.
:return: output concatenated tensor.
Examples:
......@@ -296,10 +330,10 @@ def split(inp, nsplits_or_sections, axis=0):
"""Splits the input tensor into several smaller tensors.
When nsplits_or_sections is int, the last tensor may be smaller than others.
:param inp: The input tensor.
:param nsplits_or_sections: Number of sub tensors or section information list.
:param axis: Which axis will be splited.
:return: The output tensor list.
:param inp: input tensor.
:param nsplits_or_sections: number of sub tensors or section information list.
:param axis: which axis will be split.
:return: output tensor list.
Examples:
......@@ -377,8 +411,7 @@ def _get_idx(index, axis):
def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor:
r"""
Gather data from :attr:`inp` on :attr:`axis` using :attr:`index`.
r"""Gathers data from inp on axis using index.
For a 3-D tensor, the output is specified by::
......@@ -386,16 +419,16 @@ def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor:
out[i][j][k] = inp[i][index[i][j][k]][k] # if axis == 1
out[i][j][k] = inp[i][j][index[i][j][k]] # if axis == 2
if :attr:`inp` is an n-dimensional tensor with size
if inp is an n-dimensional tensor with size
:math:`(x_0,x_1,...,x_{i-1},x_i,x_{i+1},...,x_{n-1})` and axis=i,
then :attr:`index` must be an n-dimensional tensor with size
then index must be an n-dimensional tensor with size
:math:`(x_0,x_1,...,x_{i-1},y,x_{i+1},...,x_{n-1})` where :math:`y\ge 1` and
output will have the same size as :attr:`index`.
output will have the same size as index.
:param inp: the source tensor
:param axis: the axis along which to index
:param index: the indices of elements to gather
:param inp: input tensor.
:param axis: axis along which to index.
:param index: indices of elements to gather.
:return: output tensor.
Examples:
......@@ -449,25 +482,25 @@ def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor:
def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor:
r"""
Writes all values from the tensor :attr:`source` into :attr:`inp` at the indices specified in the :attr:`index` tensor.
r"""Writes all values from the tensor source into inp
at the indices specified in the index tensor.
For each value in :attr:`source`, its output index is specified by its index
in :attr:`source` for ``axis != dimension`` and by the corresponding value in
:attr:`index` for ``axis = dimension``.
For each value in source, its output index is specified by its index
in source for ``axis != dimension`` and by the corresponding value in
index for ``axis = dimension``.
For a 3-D tensor, :attr:`inp` is updated as::
For a 3-D tensor, inp is updated as::
inp[index[i][j][k]][j][k] = source[i][j][k] # if axis == 0
inp[i][index[i][j][k]][k] = source[i][j][k] # if axis == 1
inp[i][j][index[i][j][k]] = source[i][j][k] # if axis == 2
:attr:`inp`, :attr:`index` and :attr:`source` should have same number of dimensions.
inp, index and source should have same number of dimensions.
It is also required that ``source.shape(d) <= inp.shape(d)`` and ``index.shape(d) == source.shape(d)``
for all dimensions ``d``.
Moreover, the values of :attr:`index` must be between ``0`` and ``inp.shape(axis) - 1`` inclusive.
Moreover, the values of index must be between ``0`` and ``inp.shape(axis) - 1`` inclusive.
.. note::
Please notice that, due to performance issues, the result is uncertain on the GPU device
......@@ -478,10 +511,11 @@ def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor:
from source[0][2] whose value is 0.2256 or source[1][2] whose value is 0.5339
if index[1][2] is changed from 1 to 0.
:param inp: the inp tensor which to be scattered
:param axis: the axis along which to index
:param index: the indices of elements to scatter
:param source: the source element(s) to scatter
:param inp: input tensor to be scattered.
:param axis: axis along which to index.
:param index: indices of elements to scatter.
:param source: source element(s) to scatter.
:return: output tensor.
Examples:
......@@ -553,16 +587,16 @@ def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor:
def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor:
r"""
Select elements either from Tensor x or Tensor y, according to mask.
r"""Selects elements either from Tensor x or Tensor y, according to mask.
.. math::
\textrm{out}_i = x_i \textrm{ if } \textrm{mask}_i \textrm{ is True else } y_i
:param mask: a mask used for choosing x or y
:param x: the first choice
:param y: the second choice
:param mask: a mask used for choosing x or y.
:param x: first choice.
:param y: second choice.
:return: output tensor.
Examples:
......@@ -620,8 +654,8 @@ def cond_take(mask: Tensor, x: Tensor) -> Tensor:
and the second is the indices corresponding to those elements;
they are both 1-dimensional. High-dimension input would first be flattened.
:param mask: condition param; must be the same shape with data
:param x: input tensor from which to take elements
:param mask: condition param; must have the same shape as data.
:param x: input tensor from which to take elements.
Examples:
......@@ -657,12 +691,13 @@ def cond_take(mask: Tensor, x: Tensor) -> Tensor:
return v, index
def dimshuffle(inp: Tensor, pattern: Iterable[int]) -> Tensor:
def transpose(inp: Tensor, pattern: Iterable[int]) -> Tensor:
r"""
Swap shapes and strides according to given pattern
Swaps shapes and strides according to given pattern.
:param inp: Input tensor
:param pattern: a list of integers including 0, 1, ... , ``ndim``-1, and any number of ``'x'`` char in dimensions where this tensor should be broadcasted. For examples:
:param inp: input tensor.
:param pattern: a list of integers including 0, 1, ... , ``ndim``-1,
and any number of ``'x'`` char in dimensions where this tensor should be broadcasted. For examples:
* (``'x'``) -> make a 0d (scalar) into a 1d vector
* (0, 1) -> identity for 2d vectors
......@@ -674,7 +709,7 @@ def dimshuffle(inp: Tensor, pattern: Iterable[int]) -> Tensor:
* (1, ``'x'``, 0) -> AxB to Bx1xA
* (1,) -> This removes dimension 0. It must be a broadcastable dimension (1xA to A)
:return: The output tensor
:return: output tensor.
Examples:
......@@ -684,7 +719,7 @@ def dimshuffle(inp: Tensor, pattern: Iterable[int]) -> Tensor:
from megengine import tensor
import megengine.functional as F
x = tensor(np.array([[1, 1], [0, 0]], dtype=np.int32))
out = F.dimshuffle(x, (1, 0))
out = F.transpose(x, (1, 0))
print(out.numpy())
Outputs:
......@@ -701,15 +736,15 @@ def dimshuffle(inp: Tensor, pattern: Iterable[int]) -> Tensor:
return result
transpose = dimshuffle
dimshuffle = transpose
def reshape(inp: Tensor, target_shape: Iterable[int]) -> Tensor:
r"""
Reshape a tensor to given target shape; total number of logical elements must
Reshapes a tensor to given target shape; total number of logical elements must
remain unchanged.
:param inp: Input tensor
:param inp: input tensor.
:param target_shape: target shape, the components would be concatenated to form the
target shape, and it can contain an element of -1 representing unspec_axis.
......@@ -764,13 +799,51 @@ AxisAddRemove = builtin.AxisAddRemove
AxisDesc = AxisAddRemove.AxisDesc
def flatten(inp: Tensor, start_axis: int = 0, end_axis: int = -1) -> Tensor:
r"""Reshapes the tensor by flattening the sub-tensor from dimension ``start_axis`` to dimension ``end_axis``.
:param inp: input tensor.
:param start_axis: start dimension of the sub-tensor to be flattened. Default: 0
:param end_axis: end dimension of the sub-tensor to be flattened. Default: -1
:return: output tensor.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
inp_shape = (2, 2, 3, 3)
x = tensor(
np.arange(36, dtype=np.int32).reshape(inp_shape),
)
out = F.flatten(x, 2)
print(x.numpy().shape)
print(out.numpy().shape)
Outputs:
.. testoutput::
(2, 2, 3, 3)
(2, 2, 9)
"""
target_shape = tuple(inp.shape[i] for i in range(start_axis)) + (-1,)
if end_axis != -1:
target_shape += (*inp.shape[end_axis + 1 :],)
return inp.reshape(*target_shape)
def add_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
r"""
Add dimension before given axis.
Adds dimension before given axis.
:param inp: Input tensor
:param axis: Place of new axes
:return: The output tensor
:param inp: input tensor.
:param axis: place of new axes.
:return: output tensor.
Examples:
......@@ -779,6 +852,7 @@ def add_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor([1, 2])
out = F.add_axis(x, 0)
print(out.shape)
......@@ -790,7 +864,7 @@ def add_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
(1, 2)
"""
Param = AxisAddRemove.Param
Param = builtin.AxisAddRemove.Param
def get_axes():
try:
......@@ -803,24 +877,24 @@ def add_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
ndim = inp.ndim + len(axis)
axis = sorted(i + ndim if i < 0 else i for i in axis)
param = Param(*map(AxisDesc.make_add, axis))
op = AxisAddRemove(param=param)
param = Param(*map(builtin.AxisAddRemove.AxisDesc.make_add, axis))
op = builtin.AxisAddRemove(param=param)
(result,) = apply(op, inp)
return result
expand_dims = add_axis
add_axis = add_axis
def remove_axis(
inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None
) -> Tensor:
r"""
Remove dimension of shape 1.
Removes dimension of shape 1.
:param inp: Input tensor
:param axis: Place of axis to be removed, if None, all axis=1 will be removed. Default: None
:return: The output tensor
:param inp: input tensor.
:param axis: place of axis to be removed. If None, all dimensions with size 1 will be removed. Default: None
:return: output tensor.
Examples:
......@@ -829,6 +903,7 @@ def remove_axis(
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.array([1, 2], dtype=np.int32).reshape(1, 1, 2, 1))
out = F.remove_axis(x, 3)
print(out.shape)
......@@ -840,7 +915,7 @@ def remove_axis(
(1, 1, 2)
"""
Param = AxisAddRemove.Param
Param = builtin.AxisAddRemove.Param
def get_axes():
if axis is None:
......@@ -855,15 +930,12 @@ def remove_axis(
axis = sorted(i + inp.ndim if i < 0 else i for i in axis)
axis = [a - i for i, a in enumerate(axis)]
param = Param(*map(AxisDesc.make_remove, axis))
op = AxisAddRemove(param=param)
param = Param(*map(builtin.AxisAddRemove.AxisDesc.make_remove, axis))
op = builtin.AxisAddRemove(param=param)
(result,) = apply(op, inp)
return result
squeeze = remove_axis
def linspace(
start: Union[int, float, Tensor],
stop: Union[int, float, Tensor],
......@@ -871,14 +943,13 @@ def linspace(
dtype="float32",
device: Optional[CompNode] = None,
) -> Tensor:
r"""
Return equally spaced numbers over a specified interval
r"""Returns equally spaced numbers over a specified interval.
:param start: Starting value of the squence, shoule be scalar
:param stop: The last value of the squence, shoule be scalar
:param num: number of values to generate
:param dtype: result data type
:return: The generated tensor
:param start: starting value of the sequence, should be scalar.
:param stop: last value of the sequence, should be scalar.
:param num: number of values to generate.
:param dtype: result data type.
:return: generated tensor.
Examples:
......@@ -916,14 +987,13 @@ def arange(
dtype="float32",
device: Optional[CompNode] = None,
) -> Tensor:
r"""
Returns a Tensor with values from `start` to `end` with adjacent interval `step`
r"""Returns a Tensor with values from start to end with adjacent interval step.
:param start: starting value of the squence, shoule be scalar
:param end: ending value of the squence, shoule be scalar
:param step: the gap between each pair of adjacent values. Default 1
:param dtype: result data type
:return: The generated tensor
:param start: starting value of the sequence, should be scalar.
:param end: ending value of the sequence, should be scalar.
:param step: gap between each pair of adjacent values. Default: 1
:param dtype: result data type.
:return: generated tensor.
Examples:
......@@ -937,9 +1007,11 @@ def arange(
Outputs:
Outputs:
.. testoutput::
[1. 2. 3. 4.]
[0. 1. 2. 3. 4.]
"""
if end is None:
......@@ -964,12 +1036,12 @@ def param_pack_split(inp: Tensor, offsets: List, shapes: List) -> Tensor:
Returns a list of Tensors split from the input Tensor as described by offsets and shapes,
only used for parampack.
:param inp: Input tensor
:param inp: input tensor.
:param offsets: offsets of outputs, length of 2 * n,
where n is the number of tensors you want to split,
format [begin0, end0, begin1, end1].
:param shapes: tensor shapes of outputs
:return: split tensors
format `[begin0, end0, begin1, end1]`.
:param shapes: tensor shapes of outputs.
:return: split tensors.
Examples:
......@@ -1004,8 +1076,8 @@ def param_pack_concat(inps: List, offsets: Tensor, offsets_val: List) -> Tensor:
r"""
Returns a concatenated Tensor, only used for parampack.
:param inps: Input tensors
:param offsets: device value of offsets
:param inps: input tensors.
:param offsets: device value of offsets.
:param offsets_val: offsets of inputs, length of 2 * n,
format [begin0, end0, begin1, end1].
:return: concat tensors
......
......@@ -10,12 +10,15 @@ import collections
import functools
def get_ndtuple(value, *, n, allow_zero=True):
r"""Converts possibly 1D tuple to nd tuple
def get_ndtuple(value, *, n, allow_zero: bool = True):
r"""Converts possibly 1D tuple to n-dim tuple.
:type allow_zero: bool
:param allow_zero: whether to allow zero tuple value"""
if not isinstance(value, collections.abc.Iterable):
:param value: value will be filled in generated tuple.
:param n: how many elements will the tuple have.
:param allow_zero: whether to allow zero tuple value.
:return: a tuple.
"""
if not isinstance(value, collections.abc.Iterable):
value = int(value)
value = tuple([value for i in range(n)])
else:
......
......@@ -15,7 +15,7 @@ from ..core.ops.builtin import Copy
from ..core.tensor import Tensor
from ..core.tensor.core import apply
from .math import topk as _topk
from .tensor import dimshuffle as _dimshuffle
from .tensor import transpose as _transpose
def accuracy(
......@@ -24,11 +24,11 @@ def accuracy(
r"""
Calculate the classification accuracy given predicted logits and ground-truth labels.
:param logits: Model predictions of shape [batch_size, num_classes],
:param logits: model predictions of shape `[batch_size, num_classes]`,
representing the probability (likelihood) of each class.
:param target: Ground-truth labels, 1d tensor of int32
:param topk: Specifies the topk values, could be an int or tuple of ints. Default: 1
:return: Tensor(s) of classification accuracy between 0.0 and 1.0
:param target: ground-truth labels, 1d tensor of int32.
:param topk: specifies the topk values, could be an int or tuple of ints. Default: 1
:return: tensor(s) of classification accuracy between 0.0 and 1.0.
Examples:
......@@ -54,7 +54,7 @@ def accuracy(
_, pred = _topk(logits, k=max(topk), descending=True)
accs = []
for k in topk:
correct = pred[:, :k].detach() == _dimshuffle(target, (0, "x")).broadcast(
correct = pred[:, :k].detach() == _transpose(target, (0, "x")).broadcast(
target.shape[0], k
)
accs.append(correct.astype(np.float32).sum() / target.shape[0])
......@@ -63,12 +63,25 @@ def accuracy(
return accs
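A usage sketch for accuracy, assuming it is exposed through megengine.functional; the synthetic logits below make the expected values easy to check by hand:

import numpy as np
from megengine import tensor
import megengine.functional as F

logits = tensor(np.arange(80, dtype=np.float32).reshape(8, 10))
target = tensor(np.arange(8, dtype=np.int32))
# with monotonically increasing logits the top-1 prediction is always class 9,
# so top-1 accuracy is 0.0 and top-5 accuracy is 3/8 = 0.375
top1, top5 = F.accuracy(logits, target, topk=(1, 5))
print(top1.numpy(), top5.numpy())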
def zero_grad(inp: Tensor) -> Tensor:
r"""
Returns a tensor which is treated as constant during backward gradient calculation,
i.e. its gradient is zero.
:param inp: Input tensor.
See implementation of :func:`~.softmax` for example.
"""
print("zero_grad is obsoleted, please use detach instead")
raise NotImplementedError
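Since zero_grad now only raises, a minimal sketch of the suggested replacement, Tensor.detach(), assuming the usual detach semantics (the result is treated as a constant during backward):

import numpy as np
from megengine import tensor

x = tensor(np.ones((2, 2), dtype=np.float32))
y = x.detach()   # no gradient flows back through y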
def copy(inp, cn):
r"""
Copy tensor to another device.
:param inp: Input tensor.
:param cn: device that you copy to
:param inp: input tensor.
:param cn: device that you copy to.
Examples:
......
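For the copy helper above, a hypothetical usage sketch; the device string "cpu0" is a placeholder and not taken from the original (elided) example:

import numpy as np
from megengine import tensor
import megengine.functional as F

x = tensor(np.arange(4, dtype=np.float32))
y = F.copy(x, "cpu0")   # copy x onto the device named "cpu0"
print(y.numpy())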
......@@ -234,32 +234,33 @@ class BatchNorm2d(_BatchNorm):
less than 4D.
:type eps: float
:param eps: a value added to the denominator for numerical stability.
Default: 1e-5.
Default: 1e-5
:type momentum: float
:param momentum: the value used for the `running_mean` and `running_var`
computation.
Default: 0.9
:type affine: bool
:param affine: a boolean value that when set to ``True``, this module has
learnable affine parameters. Default: ``True``
:param affine: a boolean value that when set to True, this module has
learnable affine parameters. Default: True
:type track_running_stats: bool
:param track_running_stats: when set to ``True``, this module tracks the
running mean and variance. When set to ``False``, this module does not
:param track_running_stats: when set to True, this module tracks the
running mean and variance. When set to False, this module does not
track such statistics and always uses batch statistics in both training
and eval modes. Default: ``True``.
and eval modes. Default: True
:type freeze: bool
:param freeze: when set to ``True``, this module does not update the
:param freeze: when set to True, this module does not update the
running mean and variance, and uses the running mean and variance instead of
the batch mean and batch variance to normalize the input. The parameter takes effect
only when the module is initialized with ``track_running_stats`` as ``True`` and
only when the module is initialized with track_running_stats as True and
the module is in training mode.
Default: ``False``.
Default: False
Examples:
.. testcode::
import numpy as np
import megengine as mge
import megengine.module as M
......@@ -268,13 +269,13 @@ class BatchNorm2d(_BatchNorm):
inp = mge.tensor(np.random.rand(1, 4, 3, 3).astype("float32"))
oup = m(inp)
print(m.weight.numpy(), m.bias.numpy())
# Without Learnable Parameters
# Without Learnable Parameters
m = M.BatchNorm2d(4, affine=False)
oup = m(inp)
print(m.weight, m.bias)
Outputs:
.. testoutput::
[1. 1. 1. 1.] [0. 0. 0. 0.]
......
......@@ -88,8 +88,8 @@ class Conv2d(_ConvNd):
:math:`H` is the height of input planes in pixels, and :math:`W` is
the width in pixels.
When ``groups == in_channels`` and ``out_channels == K * in_channels``,
where `K` is a positive integer, this operation is also known as depthwise
When `groups == in_channels` and `out_channels == K * in_channels`,
where K is a positive integer, this operation is also known as depthwise
convolution.
In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`,
......@@ -98,27 +98,47 @@ class Conv2d(_ConvNd):
:param in_channels: number of input channels.
:param out_channels: number of output channels.
:param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
:param kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``. Default: 1
`(kernel_size, kernel_size)`. Default: 1
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups to divide input and output channels into,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
so as to perform a "grouped convolution". When groups is not 1,
in_channels and out_channels must be divisible by groups,
and there would be an extra dimension at the beginning of the weight's
shape. Specifically, the shape of weight would be ``(groups,
out_channel // groups, in_channels // groups, *kernel_size)``.
shape. Specifically, the shape of weight would be `(groups,
out_channel // groups, in_channels // groups, *kernel_size)`.
:param bias: whether to add a bias onto the result of convolution. Default:
True
:param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default:
`CROSS_CORRELATION`.
`CROSS_CORRELATION`
:param compute_mode: When set to `DEFAULT`, no special requirements will be
placed on the precision of intermediate results. When set to `FLOAT32`,
float32 would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.
Examples:
.. testcode::
import numpy as np
import megengine as mge
import megengine.module as M
m = M.Conv2d(in_channels=3, out_channels=1, kernel_size=3)
inp = mge.tensor(np.arange(0, 96).astype("float32").reshape(2, 3, 4, 4))
oup = m(inp)
print(oup.shape)
Outputs:
.. testoutput::
(2, 1, 2, 2)
"""
_conv_mode_type = P.Convolution.Mode
......@@ -226,7 +246,7 @@ class ConvTranspose2d(_ConvNd):
:param bias: whether to add a bias onto the result of convolution. Default:
True
:param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default:
`CROSS_CORRELATION`.
`CROSS_CORRELATION`
:param compute_mode: When set to `DEFAULT`, no special requirements will be
placed on the precision of intermediate results. When set to `FLOAT32`,
float32 would be used for accumulator and intermediate result, but only
......@@ -314,17 +334,17 @@ class LocalConv2d(Conv2d):
:param out_channels: number of output channels.
:param input_height: the height of the input images.
:param input_width: the width of the input images.
:param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
:param kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``. Default: 1
`(kernel_size, kernel_size)`. Default: 1
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param groups: number of groups to divide input and output channels into,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``.
The shape of weight is ``(groups, output_height, output_width,
in_channels // groups, *kernel_size, out_channels // groups)``.
so as to perform a "grouped convolution". When groups is not 1,
in_channels and out_channels must be divisible by groups.
The shape of weight is `(groups, output_height, output_width,
in_channels // groups, *kernel_size, out_channels // groups)`.
"""
_conv_mode_type = P.Convolution.Mode
......
......@@ -11,7 +11,8 @@ from .module import Module
class Dropout(Module):
r"""Randomly set input elements to zeros with the probability :math:`drop\_prob` during training. Commonly used in large networks to prevent overfitting.
r"""Randomly set input elements to zeros with the probability :math:`drop\_prob` during training.
Commonly used in large networks to prevent overfitting.
Note that we perform dropout only during training, we also rescale(multiply) the output tensor
by :math:`\frac{1}{1 - drop\_prob}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`.
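A small sketch of the training-time rescaling described above, assuming Dropout takes drop_prob as its constructor argument and that a freshly created module is in training mode:

import numpy as np
import megengine as mge
import megengine.module as M

m = M.Dropout(drop_prob=0.2)
inp = mge.tensor(np.ones(10, dtype=np.float32))
oup = m(inp)
# surviving elements are rescaled by 1 / (1 - 0.2) = 1.25,
# so the output contains only 0.0 and 1.25
print(oup.numpy())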
......
......@@ -67,6 +67,10 @@ class Elemwise(Module):
* "H_SWISH": h_swish
* "FUSE_ADD_H_SWISH": h_swish(x+y)
* "H_SWISH_GRAD": h_swish_grad
* "AND": bool binary: x && y
* "OR": bool binary: x || y
* "XOR": bool binary: x ^ y
* "NOT": bool unary: ~x
"""
_elemwise_mode_type = P.Elemwise.Mode
......
......@@ -78,7 +78,7 @@ def calculate_gain(
Sigmoid :math:`1`
Tanh :math:`\frac{5}{3}`
ReLU :math:`\sqrt{2}`
Leaky Relu :math:`\sqrt{\frac{2}{1 + \text{negative_{slope}}^2}}`
Leaky Relu :math:`\sqrt{\frac{2}{1 + {\text{negative}_\text{slope}}^2}}`
================= ====================================================
:param nonlinearity: Name of the non-linear function
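A quick check of the Leaky ReLU entry in the table above, assuming calculate_gain lives in megengine.module.init and takes the negative slope as its second argument:

import math
from megengine.module import init

gain = init.calculate_gain("leaky_relu", 0.01)
print(gain, math.sqrt(2.0 / (1 + 0.01 ** 2)))   # both approximately 1.4141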
......
......@@ -28,6 +28,25 @@ class Linear(Module):
:param bias: If set to ``False``, the layer will not learn an additive bias.
Default: ``True``
Examples:
.. testcode::
import numpy as np
import megengine as mge
import megengine.module as M
m = M.Linear(in_features=3, out_features=1)
inp = mge.tensor(np.arange(0, 6).astype("float32").reshape(2, 3))
oup = m(inp)
print(oup.shape)
Outputs:
.. testoutput::
(2, 1)
"""
def __init__(
......
......@@ -48,8 +48,29 @@ class MaxPool2d(_PoolNd):
both sides for :attr:`padding` number of points.
:param kernel_size: the size of the window to take a max over.
:param stride: the stride of the window. Default value is ``kernel_size``.
:param stride: the stride of the window. Default value is kernel_size.
:param padding: implicit zero padding to be added on both sides.
Examples:
.. testcode::
import numpy as np
import megengine as mge
import megengine.module as M
m = M.MaxPool2d(kernel_size=3, stride=1, padding=0)
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4))
oup = m(inp)
print(oup.numpy())
Outputs:
.. testoutput::
[[[[10. 11.]
[14. 15.]]]]
"""
def forward(self, inp):
......@@ -72,8 +93,29 @@ class AvgPool2d(_PoolNd):
both sides for :attr:`padding` number of points.
:param kernel_size: the size of the window.
:param stride: the stride of the window. Default value is ``kernel_size``.
:param stride: the stride of the window. Default value is kernel_size.
:param padding: implicit zero padding to be added on both sides.
Examples:
.. testcode::
import numpy as np
import megengine as mge
import megengine.module as M
m = M.AvgPool2d(kernel_size=3, stride=1, padding=0)
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4))
oup = m(inp)
print(oup.numpy())
Outputs:
.. testoutput::
[[[[ 5. 6.]
[ 9. 10.]]]]
"""
def forward(self, inp):
......
......@@ -23,12 +23,13 @@ class Sequential(Module):
.. testcode::
import numpy as np
from megengine import tensor
import megengine as mge
import megengine.module as M
import megengine.functional as F
batch_size = 64
data = tensor(np.zeros((batch_size, 1, 28, 28)), dtype=np.float32)
label = tensor(np.zeros(batch_size,), dtype=np.int32)
data = mge.tensor(np.zeros((batch_size, 1, 28, 28)), dtype=np.float32)
label = mge.tensor(np.zeros(batch_size,), dtype=np.int32)
data = data.reshape(batch_size, -1)
net = M.Sequential(
......
......@@ -192,7 +192,7 @@ def unpack_getitem(inp, tuple_val, *, allow_newaxis=True):
return inp, tensors, items
def dimshuffle(*args, **kwargs):
def transpose(*args, **kwargs):
op = all_ops.Dimshuffle(**kwargs).to_c()
return invoke_op(op, args)
......@@ -274,10 +274,10 @@ def batched_incr_mesh_indexing(input, value, tuple_val):
return invoke_op(op, (input, value, *tensors))
def test_dimshuffle():
def test_transpose():
x = np.arange(10).reshape(2, 5).astype("int32")
xx = as_raw_tensor(x)
(yy,) = dimshuffle(xx, pattern="1x0")
(yy,) = transpose(xx, pattern="1x0")
np.testing.assert_equal(np.expand_dims(x.transpose(), axis=1), yy.numpy())
......
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import pytest
from megengine.core import Tensor
# from megengine.core.interpreter.hints import function
@pytest.mark.skip(reason="under rewrite")
def test_1():
@function
def f(x, p):
x = x + 1
if p:
return x * x
return x * 2
x = Tensor(0)
for _ in range(5):
assert f(x, 0).numpy() == 2
assert f(x, 1).numpy() == 1
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import os
import sys
import numpy as np
import pytest
from megengine.data.dataset import ArrayDataset, Dataset, MapDataset, StreamDataset
def test_abstract_cls():
with pytest.raises(TypeError):
Dataset()
with pytest.raises(TypeError):
MapDataset()
with pytest.raises(TypeError):
StreamDataset()
def test_array_dataset():
size = (10,)
data_shape = (3, 256, 256)
label_shape = (1,)
data = np.random.randint(0, 255, size + data_shape)
label = np.random.randint(0, 9, size + label_shape)
dataset = ArrayDataset(data, label)
assert dataset[0][0].shape == data_shape
assert dataset[0][1].shape == label_shape
assert len(dataset) == size[0]
def test_array_dataset_dim_error():
data = np.random.randint(0, 255, (10, 3, 256, 256))
label = np.random.randint(0, 9, (1,))
with pytest.raises(ValueError):
ArrayDataset(data, label)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import copy
import os
import sys
import numpy as np
import pytest
from megengine.data.dataset import ArrayDataset
from megengine.data.sampler import RandomSampler, ReplacementSampler, SequentialSampler
def test_sequential_sampler():
indices = list(range(100))
sampler = SequentialSampler(ArrayDataset(indices))
assert indices == list(each[0] for each in sampler)
def test_RandomSampler():
indices = list(range(20))
indices_copy = copy.deepcopy(indices)
sampler = RandomSampler(ArrayDataset(indices_copy))
sample_indices = sampler
assert indices != list(each[0] for each in sample_indices)
assert indices == sorted(list(each[0] for each in sample_indices))
def test_random_sampler_seed():
seed = [0, 1]
indices = list(range(20))
indices_copy1 = copy.deepcopy(indices)
indices_copy2 = copy.deepcopy(indices)
indices_copy3 = copy.deepcopy(indices)
sampler1 = RandomSampler(ArrayDataset(indices_copy1), seed=seed[0])
sampler2 = RandomSampler(ArrayDataset(indices_copy2), seed=seed[0])
sampler3 = RandomSampler(ArrayDataset(indices_copy3), seed=seed[1])
assert indices != list(each[0] for each in sampler1)
assert indices != list(each[0] for each in sampler2)
assert indices != list(each[0] for each in sampler3)
assert indices == sorted(list(each[0] for each in sampler1))
assert indices == sorted(list(each[0] for each in sampler2))
assert indices == sorted(list(each[0] for each in sampler3))
assert list(each[0] for each in sampler1) == list(each[0] for each in sampler2)
assert list(each[0] for each in sampler1) != list(each[0] for each in sampler3)
def test_ReplacementSampler():
num_samples = 30
indices = list(range(20))
weights = list(range(20))
sampler = ReplacementSampler(
ArrayDataset(indices), num_samples=num_samples, weights=weights
)
assert len(list(each[0] for each in sampler)) == num_samples
def test_sampler_drop_last_false():
batch_size = 5
drop_last = False
indices = list(range(24))
sampler = SequentialSampler(
ArrayDataset(indices), batch_size=batch_size, drop_last=drop_last
)
assert len([each for each in sampler]) == len(sampler)
def test_sampler_drop_last_true():
batch_size = 5
drop_last = True
indices = list(range(24))
sampler = SequentialSampler(
ArrayDataset(indices), batch_size=batch_size, drop_last=drop_last
)
assert len([each for each in sampler]) == len(sampler)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import numpy as np
from megengine.data.transform import *
data_shape = (100, 100, 3)
label_shape = (4,)
ToMode_target_shape = (3, 100, 100)
CenterCrop_size = (90, 70)
CenterCrop_target_shape = CenterCrop_size + (3,)
RandomResizedCrop_size = (50, 50)
RandomResizedCrop_target_shape = RandomResizedCrop_size + (3,)
def generate_data():
return [
(
(np.random.rand(*data_shape) * 255).astype(np.uint8),
np.random.randint(10, size=label_shape),
)
for _ in range(*label_shape)
]
def test_ToMode():
t = ToMode(mode="CHW")
aug_data = t.apply_batch(generate_data())
aug_data_shape = [(a.shape, b.shape) for a, b in aug_data]
target_shape = [(ToMode_target_shape, label_shape)] * 4
assert aug_data_shape == target_shape
def test_CenterCrop():
t = CenterCrop(output_size=CenterCrop_size)
aug_data = t.apply_batch(generate_data())
aug_data_shape = [(a.shape, b.shape) for a, b in aug_data]
target_shape = [(CenterCrop_target_shape, label_shape)] * 4
assert aug_data_shape == target_shape
def test_ColorJitter():
t = ColorJitter()
aug_data = t.apply_batch(generate_data())
aug_data_shape = [(a.shape, b.shape) for a, b in aug_data]
target_shape = [(data_shape, label_shape)] * 4
assert aug_data_shape == target_shape
def test_RandomHorizontalFlip():
t = RandomHorizontalFlip(prob=1)
aug_data = t.apply_batch(generate_data())
aug_data_shape = [(a.shape, b.shape) for a, b in aug_data]
target_shape = [(data_shape, label_shape)] * 4
assert aug_data_shape == target_shape
def test_RandomVerticalFlip():
t = RandomVerticalFlip(prob=1)
aug_data = t.apply_batch(generate_data())
aug_data_shape = [(a.shape, b.shape) for a, b in aug_data]
target_shape = [(data_shape, label_shape)] * 4
assert aug_data_shape == target_shape
def test_RandomResizedCrop():
t = RandomResizedCrop(output_size=RandomResizedCrop_size)
aug_data = t.apply_batch(generate_data())
aug_data_shape = [(a.shape, b.shape) for a, b in aug_data]
target_shape = [(RandomResizedCrop_target_shape, label_shape)] * 4
assert aug_data_shape == target_shape
def test_Normalize():
t = Normalize()
aug_data = t.apply_batch(generate_data())
aug_data_shape = [(a.shape, b.shape) for a, b in aug_data]
target_shape = [(data_shape, label_shape)] * 4
assert aug_data_shape == target_shape
def test_RandomCrop():
t = RandomCrop((150, 120), padding_size=10, padding_value=[1, 2, 3])
aug_data = t.apply_batch(generate_data())
aug_data_shape = [(a.shape, b.shape) for a, b in aug_data]
target_shape = [((150, 120, 3), label_shape)] * 4
assert aug_data_shape == target_shape
def test_Compose():
t = Compose(
[
CenterCrop(output_size=CenterCrop_size),
RandomHorizontalFlip(prob=1),
ToMode(mode="CHW"),
]
)
aug_data = t.apply_batch(generate_data())
aug_data_shape = [(a.shape, b.shape) for a, b in aug_data]
print(aug_data_shape)
target_shape = [((3, 90, 70), label_shape)] * 4
assert aug_data_shape == target_shape
......@@ -83,48 +83,6 @@ def opr_test(cases, func, compare_fn=_default_compare_fn, ref_fn=None, **kwargs)
check_results(results, outp)
def test_flatten():
data0_shape = (2, 3, 4, 5)
data1_shape = (4, 5, 6, 7)
data0 = np.random.random(data0_shape).astype(np.float32)
data1 = np.random.random(data1_shape).astype(np.float32)
def compare_fn(x, y):
assert x.numpy().shape == y
output0 = (2 * 3 * 4 * 5,)
output1 = (4 * 5 * 6 * 7,)
cases = [
{"input": data0, "output": (output0,)},
{"input": data1, "output": (output1,)},
]
opr_test(cases, F.flatten, compare_fn=compare_fn)
output0 = (2, 3 * 4 * 5)
output1 = (4, 5 * 6 * 7)
cases = [
{"input": data0, "output": (output0,)},
{"input": data1, "output": (output1,)},
]
opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1)
output0 = (2, 3, 4 * 5)
output1 = (4, 5, 6 * 7)
cases = [
{"input": data0, "output": (output0,)},
{"input": data1, "output": (output1,)},
]
opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=2)
output0 = (2, 3 * 4, 5)
output1 = (4, 5 * 6, 7)
cases = [
{"input": data0, "output": (output0,)},
{"input": data1, "output": (output1,)},
]
opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1, end_axis=2)
def test_where():
maskv0 = np.array([[1, 0], [0, 1]], dtype=np.bool_)
xv0 = np.array([[1, np.inf], [np.nan, 4]], dtype=np.float32)
......@@ -155,45 +113,6 @@ def test_where():
opr_test(cases, F.where, ref_fn=np.where)
def test_matmul():
shape1 = 3
shape2 = 3
shape3 = (3, 5)
shape4 = (5, 6)
data1 = np.random.random(shape1).astype("float32")
data2 = np.random.random(shape2).astype("float32")
data3 = np.random.random(shape3).astype("float32")
data4 = np.random.random(shape4).astype("float32")
cases = [
{"input": [data1, data2]},
{"input": [data2, data3]},
{"input": [data3, data4]},
]
opr_test(cases, F.matmul, ref_fn=np.matmul)
batch_size = 10
shape1 = (batch_size, 2, 3)
shape2 = (batch_size, 3, 4)
shape3 = (batch_size, 10, 4, 5)
data1 = np.random.random(shape1).astype("float32")
data2 = np.random.random(shape2).astype("float32")
data3 = np.random.random(shape3).astype("float32")
cases = [{"input": [data1, data2]}, {"input": [data2, data3]}]
for i in range(0, batch_size):
def compare_fn(x, y):
assert np.allclose(x.numpy()[i, ...], y)
opr_test(
cases,
F.matmul,
compare_fn=compare_fn,
ref_fn=lambda x, y: np.matmul(x[i, ...], y[i, ...]),
)
def test_interpolate():
def linear_interpolate():
inp = tensor(np.arange(1, 3, dtype=np.float32).reshape(1, 1, 2))
......@@ -303,28 +222,28 @@ def test_roi_pooling():
assert make_shape_tuple(inp_feat.grad.shape) == make_shape_tuple(inp_feat.shape)
# def test_one_hot():
# def onehot_low_dimension():
# inp = tensor(np.arange(1, 4, dtype=np.int32))
# out = F.one_hot(inp, num_classes=4)
# assertTensorClose(
# out.numpy(), np.eye(4, dtype=np.int32)[np.arange(1, 4, dtype=np.int32)]
# )
def test_one_hot():
def onehot_low_dimension():
inp = tensor(np.arange(1, 4, dtype=np.int32))
out = F.one_hot(inp, num_classes=4)
assertTensorClose(
out.numpy(), np.eye(4, dtype=np.int32)[np.arange(1, 4, dtype=np.int32)]
)
# def onehot_high_dimension():
# arr = np.array(
# [[3, 2, 4, 4, 2, 4, 0, 4, 4, 1], [4, 1, 1, 3, 2, 2, 4, 2, 4, 3]], dtype=np.int32
# )
def onehot_high_dimension():
arr = np.array(
[[3, 2, 4, 4, 2, 4, 0, 4, 4, 1], [4, 1, 1, 3, 2, 2, 4, 2, 4, 3]],
dtype=np.int32,
)
# inp = tensor(arr)
# out = F.one_hot(inp, 10)
inp = tensor(arr)
out = F.one_hot(inp, 10)
# assertTensorClose(out.numpy(), np.eye(10, dtype=np.int32)[arr])
assertTensorClose(out.numpy(), np.eye(10, dtype=np.int32)[arr])
# onehot_low_dimension()
# onehot_high_dimension()
onehot_low_dimension()
onehot_high_dimension()
def test_add_update():
......@@ -554,7 +473,7 @@ def test_conv_bias():
var = F.reshape(
var, (var.shape[0], var.shape[1] // 4, 4, var.shape[2], var.shape[3])
)
var = F.dimshuffle(var, (0, 1, 3, 4, 2))
var = F.transpose(var, (0, 1, 3, 4, 2))
return var
def run_conv2d(inp, w, b):
......@@ -591,7 +510,7 @@ def test_conv_bias():
"float32"
)
if format == "NCHW4":
result = F.dimshuffle(result, (0, 1, 4, 2, 3))
result = F.transpose(result, (0, 1, 4, 2, 3))
expected = F.flatten(expected)
result = F.flatten(result)
assertTensorClose(result.numpy(), expected.numpy(), max_err=outp_scale)
......@@ -608,22 +527,6 @@ def test_conv_bias():
run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, True, "RELU")
# def test_softplus():
# x = np.arange(1000).astype(np.float32)
# out = F.softplus(tensor(x))
# mask = x <= 20
# with np.errstate(over="ignore"):
# expected = np.where(mask, np.log(1 + np.exp(x)), x)
# assertTensorClose(out, expected)
# beta = 2
# out = F.softplus(tensor(x), beta=beta, threshold=30)
# mask = beta * x <= 30
# # ignore overflow
# with np.errstate(over="ignore"):
# expected = np.where(mask, np.log(1 + np.exp(x * beta)) / beta, x)
# assertTensorClose(out, expected)
def test_condtake():
x = np.array([[1, 2, 3], [4, 5, 6]])
y = np.array([[True, False, True], [False, True, True]])
......
......@@ -12,7 +12,6 @@ import megengine.functional as F
from megengine import tensor
# XXX need to test label_smooth
def test_cross_entropy_with_softmax():
data = tensor([1, 100]).astype(np.float32).reshape((1, 2))
label = tensor([1]).astype(np.int32)
......
......@@ -14,8 +14,6 @@ import megengine.functional as F
from megengine import tensor
from megengine.test import assertTensorClose
# from helpers import opr_test
def _default_compare_fn(x, y):
assertTensorClose(x.numpy(), y)
......@@ -207,6 +205,45 @@ def test_normalize():
opr_test(cases, partial(F.normalize, axis=3), ref_fn=partial(np_normalize, axis=3))
def test_matmul():
shape1 = 3
shape2 = 3
shape3 = (3, 5)
shape4 = (5, 6)
data1 = np.random.random(shape1).astype("float32")
data2 = np.random.random(shape2).astype("float32")
data3 = np.random.random(shape3).astype("float32")
data4 = np.random.random(shape4).astype("float32")
cases = [
{"input": [data1, data2]},
{"input": [data2, data3]},
{"input": [data3, data4]},
]
opr_test(cases, F.matmul, ref_fn=np.matmul)
batch_size = 10
shape1 = (batch_size, 2, 3)
shape2 = (batch_size, 3, 4)
shape3 = (batch_size, 10, 4, 5)
data1 = np.random.random(shape1).astype("float32")
data2 = np.random.random(shape2).astype("float32")
data3 = np.random.random(shape3).astype("float32")
cases = [{"input": [data1, data2]}, {"input": [data2, data3]}]
for i in range(0, batch_size):
def compare_fn(x, y):
assert np.allclose(x.numpy()[i, ...], y)
opr_test(
cases,
F.matmul,
compare_fn=compare_fn,
ref_fn=lambda x, y: np.matmul(x[i, ...], y[i, ...]),
)
# def test_logsumexp():
# x = np.arange(10).astype(np.float32)
# expected = np.log(np.sum(np.exp(x)))
......
......@@ -165,7 +165,7 @@ def test_squeeze():
for axis in [None, 3, -4, (3, -4)]:
y = np.squeeze(x, axis)
yy = F.squeeze(xx, axis)
yy = F.remove_axis(xx, axis)
np.testing.assert_equal(y, yy.numpy())
......@@ -175,7 +175,7 @@ def test_expand_dims():
for axis in [2, -3, (3, -4), (1, -4)]:
y = np.expand_dims(x, axis)
yy = F.expand_dims(xx, axis)
yy = F.add_axis(xx, axis)
np.testing.assert_equal(y, yy.numpy())
......@@ -258,6 +258,48 @@ def test_round():
opr_test(cases, F.round, ref_fn=np.round)
def test_flatten():
data0_shape = (2, 3, 4, 5)
data1_shape = (4, 5, 6, 7)
data0 = np.random.random(data0_shape).astype(np.float32)
data1 = np.random.random(data1_shape).astype(np.float32)
def compare_fn(x, y):
assert x.numpy().shape == y[0]
output0 = (2 * 3 * 4 * 5,)
output1 = (4 * 5 * 6 * 7,)
cases = [
{"input": data0, "output": (output0,)},
{"input": data1, "output": (output1,)},
]
opr_test(cases, F.flatten, compare_fn=compare_fn)
output0 = (2, 3 * 4 * 5)
output1 = (4, 5 * 6 * 7)
cases = [
{"input": data0, "output": (output0,)},
{"input": data1, "output": (output1,)},
]
opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1)
output0 = (2, 3, 4 * 5)
output1 = (4, 5, 6 * 7)
cases = [
{"input": data0, "output": (output0,)},
{"input": data1, "output": (output1,)},
]
opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=2)
output0 = (2, 3 * 4, 5)
output1 = (4, 5 * 6, 7)
cases = [
{"input": data0, "output": (output0,)},
{"input": data1, "output": (output1,)},
]
opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1, end_axis=2)
def test_broadcast():
input1_shape = (20, 30)
output1_shape = (30, 20, 30)
......