fix(jit): fix jit doc and add NCHW44_DOT

GitOrigin-RevId: 5f5feae8e727dd111615022f2a21c7ede647156a

fix(jit): fix jit doc and add NCHW44_DOT
GitOrigin-RevId: 5f5feae8e727dd111615022f2a21c7ede647156a
5257991e · Megvii Engine Team · Xu Xinran · cdf25c4a · 5257991e
隐藏空白更改
内联并排

Showing with 32 additions and 10 deletions

python_module/megengine/jit/__init__.py python_module/megengine/jit/__init__.py +32 -10

未找到文件。
--- a/python_module/megengine/jit/__init__.py
+++ b/python_module/megengine/jit/__init__.py
@@ -442,17 +442,38 @@ class trace:
        Serialize trace to file system.

        :param fpath: positional only argument. Path of output file.
-        :param arg_names: names of the input tensors in the traced function
-        :param append: whether output is appended to ``fpath``
-        :param f16_io_f32_comp: whether to use float16 for I/O between oprs and use
+        :param arg_names: names of the input tensors in the traced function.
+        :param append: whether output is appended to ``fpath``.
+        :param optimize_for_inference: whether to enable optimize_for_inference
+            pass before dump.
+
+        :param enable_io16xc32: whether to use float16 for I/O between oprs and use
            float32 as internal computation precision. Note the output var would be
-            changed to float16
-        :param f16_io_comp: whether to use float16 for both I/O and computation
-            precision
-        :param use_nhwcd4: whether to use NHWCD4 data format. This is faster on some
-            OpenCL devices
-        :param fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty
-            into one opr. This is supported only in NHWCD4 format.
+            changed to float16.
+        :param enable_ioc16: whether to use float16 for both I/O and computation
+            precision.
+
+        :param enable_nhwcd4: whether to use NHWCD4 data layout. This is faster on some
+            OpenCL backends.
+        :param enable_nchw88: whether to use NCHW88 data layout. It is currently
+            used in the X86 AVX backend.
+        :param enable_nchw44: whether to use NCHW44 data layout. It is currently
+            used in the arm backend.
+        :param enable_nchw44_dot: whether to use NCHW44_DOT data layout. It is currently
+            used in the armv8.2+dotprod backend.
+        :param enable_nchw4: whether to use NCHW4 data layout. It is currently
+            used in the nvidia backend (based on cudnn).
+        :param enable_nchw32: whether to use NCHW32 data layout. It is currently
+            used in the nvidia backend with tensorcore (based on cudnn).
+        :param enable_chwn4: whether to use CHWN4 data layout. It is currently
+            used in the nvidia backend with tensorcore.
+
+        :param enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearity
+            into one opr.
+        :param enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
+            input for inference on nvidia backend (this optimization pass will
+            result in mismatch of the precision of output of training and
+            inference).
        """
        if self._status != self._FINISHED:
            raise ValueError("not traced")
@@ -475,6 +496,7 @@ class trace:
            "enable_nchw88": "use_nchw88",
            "enable_nchw32": "use_nchw32",
            "enable_nchw44": "use_nchw44",
+            "enable_nchw44_dot": "use_nchw44_dot",
            "enable_chwn4": "use_chwn4",
            "enable_fuse_conv_bias_nonlinearity": "fuse_conv_bias_nonlinearity",
            "enable_fuse_conv_bias_with_z": "fuse_conv_bias_with_z",