Commit 7152fe04 authored by mindspore-ci-bot, committed by Gitee

!5783 GraphKernel supports GPU

Merge pull request !5783 from DeshiChen/graph_kernel_1.0
akg @ d237aa7d
Subproject commit 3bb6264188d0b1d6ff776a35a571bc7190df0800
Subproject commit d237aa7d8e9d3fb709bda9f30205b02129bc2b59
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""init"""
from .splitter import split_with_json
from .expander import get_op_expander
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""generate json desc for graph kernel ops"""
import json
import json.decoder as jd
import traceback
from mindspore import log as logger
import mindspore._extends.graph_kernel.expanders as expanders
def get_op_expander(json_str: str):
"""get op expander by json info"""
try:
kernel_info = json.loads(json_str)
expand_info = kernel_info['expand_info']
if 'name' not in expand_info:
logger.error("expand info have no op name")
return None
if 'process' not in expand_info:
logger.error("expand info have no processor info")
return None
processor = expand_info['process']
op_name = str(expand_info['name']).lower()
expand_op_func_name = 'expand_' + op_name
if not hasattr(expanders, expand_op_func_name):
logger.error("Generator do not support op: {}".format(op_name))
return None
expand_op_func = getattr(expanders, expand_op_func_name)
# generate graph desc.
graph = expand_op_func(expand_info)
if graph is None:
logger.error("Failed to generate graph of: {}".format(op_name))
return None
graph.set_processor(processor)
# dump graph to json desc.
desc = graph.dump()
return json.dumps(desc)
except jd.JSONDecodeError:
logger.error("Failed to generate graph kernel op")
logger.error(traceback.format_exc())
return None
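

# A minimal usage sketch (illustrative only: the real caller is the C++ side,
# and 'aicore' as the processor string is an assumption for this example).
if __name__ == '__main__':
    example_info = {'expand_info': {
        'name': 'Square',
        'process': 'aicore',
        'input_desc': [{'shape': [16, 16], 'data_type': 'float32', 'format': 'DefaultFormat'}],
    }}
    print(get_op_expander(json.dumps(example_info)))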
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""expanders init"""
from .gelu import expand_gelu
from .layernorm import expand_layernorm
from .softmax import expand_softmax
from .square import expand_square
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""generate json desc for gelu"""
from mindspore._extends.graph_kernel.model import model_builder as builder
CSVALUE = 0.044715
CSVALUE_A = 1.5957691 # 2*np.sqrt(2/np.pi)
def expand_gelu(expand_info):
"""Gelu expander"""
# get op info.
input_desc = expand_info['input_desc'][0]
graph_builder = builder.GraphBuilder()
# generate a graph.
with graph_builder.graph_scope('main') as graph_scope:
# create tensor input.
input_x = graph_builder.tensor(input_desc['shape'], input_desc['data_type'], input_desc['format'])
dtype = input_x.dtype
if dtype == 'float16':
input_x = graph_builder.emit('Cast', [input_x], attrs={'dst_type': 'float32'})
        # calculate the tanh argument: x + CSVALUE * x^3 (scaled by CSVALUE_A below).
mul_0 = graph_builder.emit('Mul', [input_x, input_x])
pow_0 = graph_builder.emit('Mul', [mul_0, input_x])
const_csvalue = graph_builder.value(pow_0.dtype, CSVALUE, input_desc['format'])
mul_1 = graph_builder.emit('Mul', [pow_0, const_csvalue])
tanh_res = graph_builder.emit('TensorAdd', [input_x, mul_1])
const_csvalue_a = graph_builder.value(tanh_res.dtype, CSVALUE_A, input_desc['format'])
mul_0 = graph_builder.emit('Mul', [tanh_res, const_csvalue_a])
const_zero = graph_builder.value(mul_0.dtype, 0.0, input_desc['format'])
mul_0_min = graph_builder.emit('Minimum', [mul_0, const_zero])
right_mul = graph_builder.emit('Exp', [mul_0_min])
mul_0_abs = graph_builder.emit('Abs', [mul_0])
const_neg_one = graph_builder.value(mul_0_abs.dtype, -1.0, input_desc['format'])
mul_0_abs_neg = graph_builder.emit('Mul', [mul_0_abs, const_neg_one])
mul_0_abs_neg_exp = graph_builder.emit('Exp', [mul_0_abs_neg])
const_one = graph_builder.value(mul_0_abs_neg_exp.dtype, 1.0, input_desc['format'])
mul_0_abs_neg_exp_add = graph_builder.emit('TensorAdd', [mul_0_abs_neg_exp, const_one])
left_mul = graph_builder.emit('RealDiv', [input_x, mul_0_abs_neg_exp_add])
result = graph_builder.emit('Mul', [left_mul, right_mul])
if dtype == 'float16':
result = graph_builder.emit('Cast', [result], attrs={'dst_type': 'float16'})
# set graph output.
graph_scope.set_output(result)
graph = graph_builder.get()[0]
return graph
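

# A reference sketch of the expansion above (illustrative, assumes NumPy is
# available; not part of the module API). The graph realizes the tanh
# approximation of GELU through the identity 0.5 * (1 + tanh(z)) == sigmoid(2*z),
# and evaluates the sigmoid in the numerically stable form
# exp(min(y, 0)) / (1 + exp(-|y|)) with y = CSVALUE_A * (x + CSVALUE * x^3).
def _gelu_reference(x):
    """NumPy mirror of the emitted graph, for checking the math only."""
    import numpy as np
    y = CSVALUE_A * (x + CSVALUE * x ** 3)
    return x * np.exp(np.minimum(y, 0.0)) / (1.0 + np.exp(-np.abs(y)))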
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""generate json desc for LayerNorm"""
from mindspore._extends.graph_kernel.model import model_builder as builder
def expand_layernorm(expand_info):
"""LayerNorm expander"""
# get op info.
input_desc_0 = expand_info['input_desc'][0]
input_desc_1 = expand_info['input_desc'][1]
input_desc_2 = expand_info['input_desc'][2]
attrs = expand_info['attr']
begin_norm_axis = None
epsilon = None
for item in attrs:
if 'begin_norm_axis' in item:
begin_norm_axis = item['begin_norm_axis']
if 'epsilon' in item:
epsilon = item['epsilon']
graph_builder = builder.GraphBuilder()
# generate a graph.
with graph_builder.graph_scope('main') as graph_scope:
# create tensor input.
input_x = graph_builder.tensor(input_desc_0['shape'], input_desc_0['data_type'], input_desc_0['format'])
input_gamma = graph_builder.tensor(input_desc_1['shape'], input_desc_1['data_type'], input_desc_1['format'])
input_beta = graph_builder.tensor(input_desc_2['shape'], input_desc_2['data_type'], input_desc_2['format'])
        # Compute the averaging coefficient over the normalized axes
shape_x = input_desc_0['shape']
if begin_norm_axis < 0:
begin_norm_axis += len(shape_x)
reduce_axis = ()
for i, _ in enumerate(shape_x):
            if i >= begin_norm_axis:
reduce_axis = reduce_axis + (i,)
reduce_elts = 1.0
for i in reduce_axis:
reduce_elts *= shape_x[i]
mean_cof = 1.0 / reduce_elts
mean_cof_v = graph_builder.value(input_x.dtype, mean_cof, input_x.data_format)
# Calculate mean
mean_red = graph_builder.emit('ReduceSum', [input_x], attrs={'reduce_axis': reduce_axis, 'keep_dims': True})
mean = graph_builder.emit('Mul', [mean_red, mean_cof_v])
# Calculate variance
variance_sub = graph_builder.emit('Sub', [input_x, mean])
variance_mul = graph_builder.emit('Mul', [variance_sub, variance_sub])
variance_red = graph_builder.emit('ReduceSum', [variance_mul],
attrs={'reduce_axis': reduce_axis, 'keep_dims': True})
variance = graph_builder.emit('Mul', [variance_red, mean_cof_v])
        # Calculate normalization: (x - mean) * rsqrt(variance + epsilon)
normalize_sub = graph_builder.emit('Sub', [input_x, mean])
epsilon_v = graph_builder.value(input_x.dtype, epsilon, input_x.data_format)
normalize_add = graph_builder.emit('TensorAdd', [variance, epsilon_v])
normalize_log = graph_builder.emit('Log', [normalize_add])
        const_neg_half = graph_builder.value(input_x.dtype, -0.5, input_x.data_format)
        normalize_log_mul = graph_builder.emit('Mul', [normalize_log, const_neg_half])
normalize_exp = graph_builder.emit('Exp', [normalize_log_mul])
normalize_mul = graph_builder.emit('Mul', [normalize_sub, normalize_exp])
# Calculate scale and translate
scale_mul = graph_builder.emit('Mul', [input_gamma, normalize_mul])
res = graph_builder.emit('TensorAdd', [scale_mul, input_beta])
# set graph output.
graph_scope.set_output(res, mean, variance)
graph = graph_builder.get()[0]
return graph
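

# A reference sketch of the expansion above (illustrative, assumes NumPy; not
# part of the module API). Mean and variance are reduced over axes
# [begin_norm_axis, ndim) with keep_dims, and rsqrt(variance + epsilon) is
# realized in the graph as exp(-0.5 * log(variance + epsilon)).
def _layernorm_reference(x, gamma, beta, begin_norm_axis, epsilon):
    """NumPy mirror of the emitted graph; returns (y, mean, variance)."""
    import numpy as np
    axes = tuple(range(begin_norm_axis % x.ndim, x.ndim))
    mean = x.mean(axis=axes, keepdims=True)
    variance = ((x - mean) ** 2).mean(axis=axes, keepdims=True)
    y = gamma * (x - mean) * np.exp(-0.5 * np.log(variance + epsilon)) + beta
    return y, mean, variance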
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""generate json desc for softmax"""
from mindspore._extends.graph_kernel.model import model_builder as builder
def expand_softmax(expand_info):
"""Softmax expander"""
# get op info.
input_desc = expand_info['input_desc'][0]
attrs = expand_info['attr']
axis = None
for item in attrs:
if 'axis' in item:
axis = item['axis']
graph_builder = builder.GraphBuilder()
# generate a graph.
with graph_builder.graph_scope('main') as graph_scope:
# create tensor input.
input_x = graph_builder.tensor(input_desc['shape'], input_desc['data_type'], input_desc['format'])
        # calculate softmax.
if input_x.dtype == 'float32':
input_x_cast = graph_builder.emit('Cast', [input_x], attrs={'dst_type': 'float16'})
max_x = graph_builder.emit('ReduceMax', [input_x_cast], attrs={'reduce_axis': axis, 'keep_dims': True})
max_x = graph_builder.emit('Cast', [max_x], attrs={'dst_type': 'float32'})
else:
max_x = graph_builder.emit('ReduceMax', [input_x], attrs={'reduce_axis': axis, 'keep_dims': True})
data_sub = graph_builder.emit('Sub', [input_x, max_x])
data_exp = graph_builder.emit('Exp', [data_sub])
data_expsum = graph_builder.emit('ReduceSum', [data_exp], attrs={'reduce_axis': axis, 'keep_dims': True})
result = graph_builder.emit('RealDiv', [data_exp, data_expsum])
# set graph output.
graph_scope.set_output(result)
graph = graph_builder.get()[0]
return graph
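

# A reference sketch of the expansion above (illustrative, assumes NumPy; not
# part of the module API). Subtracting the running max before exp() keeps the
# exponentials from overflowing; for float32 inputs the graph additionally
# routes the max reduction through float16.
def _softmax_reference(x, axis):
    """NumPy mirror of the emitted graph (without the float16 max detour)."""
    import numpy as np
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)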
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""generate json desc for square"""
from mindspore._extends.graph_kernel.model import model_builder as builder
def expand_square(expand_info):
"""Square expander"""
# get op info.
input_desc = expand_info['input_desc'][0]
graph_builder = builder.GraphBuilder()
# generate a graph.
with graph_builder.graph_scope('main') as graph_scope:
# create tensor input.
input_x = graph_builder.tensor(input_desc['shape'], input_desc['data_type'], input_desc['format'])
# create op.
result = graph_builder.emit('Mul', [input_x, input_x])
# set graph output.
graph_scope.set_output(result)
graph = graph_builder.get()[0]
return graph
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""GraphKernel cost model init"""
from .graph_split import split
from .model_builder import GraphBuilder, load_composite
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""Cost model splitter"""
from .model import PrimLib, Graph
class GraphSplitByPattern:
"""Graph split by pattern"""
def __init__(self, graph):
self.graph = graph
self.groups = []
self.op_group = {}
for op in self.graph.ops:
g = [op]
self.groups.append(g)
self.op_group[op] = g
self.ids = {}
for i, op in enumerate(graph.ops):
self.ids[op] = i
self.doms = self.post_dom(graph.ops)
_, outputs = graph.deduce_parameters()
self.outputs = set(outputs)
def post_dom(self, ops):
"""Post dom"""
doms, i_doms = {}, {}
for i in range(len(ops) - 1, -1, -1):
op = ops[i]
doms[op] = {op}
i_dom = None
if op.output.to_ops:
suc_dom = set(doms[op.output.to_ops[0]])
for to in op.output.to_ops[1:]:
suc_dom.intersection_update(doms[to])
doms[op].update(suc_dom)
for dom in suc_dom:
if i_dom is None or self.ids[dom] < self.ids[i_dom]:
i_dom = dom
i_doms[op] = i_dom
return i_doms
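
    # Illustrative trace of post_dom: for a diamond a -> {b, c} -> d, walking
    # ops in reverse topological order gives
    #   doms[d] = {d}; doms[b] = {b, d}; doms[c] = {c, d}
    #   doms[a] = {a, d}   (d is the intersection of doms[b] and doms[c])
    # so i_doms maps a, b and c to d: the immediate post-dominator is the first
    # op that every path from a node must pass through, which bounds how far a
    # group may be fused along the dominance chain.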
def get_pattern(self, op, i):
"""Get pattern"""
pattern = PrimLib.UNKNOWN
_, elem_relation = PrimLib.input_relation(op, i)
for pat in elem_relation:
if pat and pat > pattern:
pattern = pat
return pattern
def fuse(self, check_fun):
"""Fuse ops"""
def _get_path(op, dom):
path_ops, visited = [], set()
def _get_path_depth(p):
visited.add(p)
if self.op_group[p][0] == p:
path_ops.append(p)
for to in p.output.to_ops:
if to != dom and to not in visited:
_get_path_depth(to)
_get_path_depth(op)
return path_ops
changed = True
while changed:
for group in self.groups:
op = group[0]
dom = self.doms[op]
if dom is None or op.output in self.outputs:
continue
ops = _get_path(op, dom)
if check_fun(op, dom, ops):
dom_group = self.op_group[dom]
fused = []
for fop in ops:
f_group = self.op_group[fop]
for p in f_group:
self.op_group[p] = dom_group
fused.append(f_group)
dom_group += f_group
for g in fused:
self.groups.remove(g)
break
else:
changed = False
def to_subgraphs(self):
"""Transform op groups to subgraphs"""
subgraphs = []
for i, group in enumerate(self.groups):
group.sort(key=lambda op: self.ids[op])
subgraphs.append(Graph('{}_{}'.format(self.graph.name, i), group))
return subgraphs
def split(self):
"""Split graph"""
def _buddy(op, dom, path_ops):
"""Fuse buddy together"""
# pylint: disable=unused-argument
group = self.op_group[op]
for p in group:
# p is buddy
if p.output.buddy is not None and p.output.buddy.members[0].op not in group:
return True
# p's output is buddy
for to in p.output.to_ops:
if to.output.buddy is not None and to not in group:
return True
return False
def _injective(pattern, limit):
def _checker(op, dom, path_ops):
# pylint: disable=unused-argument
for p in op.output.to_ops:
if p not in self.op_group[dom]:
return False
if PrimLib.iter_type(op) in (PrimLib.ELEMWISE, PrimLib.BROADCAST):
for i, t in enumerate(dom.inputs):
if t == op.output:
return self.get_pattern(dom, i) == pattern and len(self.op_group[op]) < limit
return False
return _checker
def _diamond(op, dom, path_ops):
if PrimLib.iter_type(op) not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or \
PrimLib.iter_type(dom) in (PrimLib.UNKNOWN, PrimLib.TRANSFORM):
return False
return len(path_ops) == 1 and op.output not in dom.inputs
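
        # Pass order note (describing the calls below): buddies are fused
        # first, then injective chains (elemwise up to 100 ops, broadcast and
        # reduce up to 6 ops per group), and finally "diamond" heads that reach
        # their post-dominator through exactly one group without feeding it
        # directly.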
self.fuse(_buddy)
self.fuse(_injective(PrimLib.ELEMWISE, 100))
self.fuse(_injective(PrimLib.BROADCAST, 6))
self.fuse(_injective(PrimLib.REDUCE, 6))
self.fuse(_diamond)
return self.to_subgraphs()
def split(graph):
return GraphSplitByPattern(graph).split()
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""GraphKernel cost model"""
class Utils:
"""Model utils"""
@staticmethod
def get_attr_type(attr):
"""Get attr type"""
if isinstance(attr, bool):
return 'bool'
if isinstance(attr, str):
return 'str'
if isinstance(attr, int):
return 'int'
        if isinstance(attr, float):
            return 'float'
if isinstance(attr, (list, tuple)):
if not attr:
raise ValueError("Length of attr is 0")
if isinstance(attr[0], int):
return 'listInt'
if isinstance(attr[0], str):
return 'listStr'
raise ValueError("Unknown type of attr: {}".format(attr))
class DataFormat:
"""DataFormat"""
DEFAULT = "DefaultFormat"
NC1KHKWHWC0 = "NC1KHKWHWC0"
ND = "ND"
NCHW = "NCHW"
NHWC = "NHWC"
HWCN = "HWCN"
NC1HWC0 = "NC1HWC0"
FRAC_Z = "FracZ"
FRAC_NZ = "FRACTAL_NZ"
C1HWNCOC0 = "C1HWNCoC0"
NC1HWC0_C04 = "NC1HWC0_C04"
FRACTAL_Z_C04 = "FRACTAL_Z_C04"
NDHWC = "NDHWC"
class Config:
R0 = 8.0
UB_SIZE = 256 * 1024
MAX_BLOCK = 32
class PrimLib:
"""Prim lib"""
UNKNOWN = 0
ELEMWISE = 1
BROADCAST = 2
REDUCE = 3
TRANSFORM = 4
CONTROL = 5
class Prim:
"""Prim"""
def __init__(self, iter_type, calibrate=1, relation_func=None):
self.iter_type = iter_type
self.calibrate = calibrate
self.relation_func = relation_func
if relation_func is None:
self.relation_func = lambda *x: self.default_relation_func[iter_type](self, *x)
def default_elemwise_broadcast_relation(self, op, input_idx):
"""Process elemwise and broadcast relation"""
out_shape = op.output.shape
in_shape = op.inputs[input_idx].shape
assert len(out_shape) >= len(in_shape)
axis_relation, elem_relation = [], []
delta = len(out_shape) - len(in_shape)
if delta > 0:
for i in range(0, delta):
axis_relation.append(None)
elem_relation.append(None)
for i, _ in enumerate(in_shape):
axis_relation.append(i)
elem_relation.append(
PrimLib.ELEMWISE if out_shape[i + delta] == in_shape[i] else PrimLib.BROADCAST)
return axis_relation, elem_relation
def default_reduce_relation(self, op, input_idx):
"""Process reduce relation"""
axis_relation, elem_relation = self.default_elemwise_broadcast_relation(op, input_idx)
for i in op.attrs['reduce_axis']:
elem_relation[i] = PrimLib.REDUCE
return axis_relation, elem_relation
def unknown_relation(self, op, input_idx):
"""Process unknown relation"""
out_shape = op.output.shape
in_shape = op.inputs[input_idx].shape
all_relation = list(range(len(in_shape)))
axis_relation = [all_relation for i in range(0, len(out_shape))]
elem_relation = [PrimLib.UNKNOWN for i in range(0, len(out_shape))]
return axis_relation, elem_relation
default_relation_func = [
unknown_relation,
default_elemwise_broadcast_relation,
default_elemwise_broadcast_relation,
default_reduce_relation,
unknown_relation,
unknown_relation,
]
primtives = {
'TensorAdd': Prim(ELEMWISE),
'Abs': Prim(ELEMWISE),
'Neg': Prim(ELEMWISE),
'Mul': Prim(ELEMWISE),
'Sub': Prim(ELEMWISE),
'Log': Prim(ELEMWISE),
'Exp': Prim(ELEMWISE),
'Rsqrt': Prim(ELEMWISE),
'Sqrt': Prim(ELEMWISE),
'RealDiv': Prim(ELEMWISE),
'Cast': Prim(ELEMWISE),
'Pow': Prim(ELEMWISE),
'Minimum': Prim(ELEMWISE),
'Maximum': Prim(ELEMWISE),
'Reciprocal': Prim(ELEMWISE),
'Equal': Prim(ELEMWISE),
'Greater': Prim(ELEMWISE),
'GreaterEqual': Prim(ELEMWISE),
'Less': Prim(ELEMWISE),
'LessEqual': Prim(ELEMWISE),
'Square': Prim(ELEMWISE),
'AddN': Prim(ELEMWISE),
'Select': Prim(ELEMWISE, 8),
'ReduceSum': Prim(REDUCE),
'ReduceMax': Prim(REDUCE),
'ReduceMin': Prim(REDUCE),
'make_tuple': Prim(CONTROL),
'ControlDepend': Prim(CONTROL),
'@ReduceInit': Prim(ELEMWISE),
}
default_primtive = Prim(UNKNOWN)
@classmethod
def get_prim(cls, op):
prim = cls.primtives.get(op.prim, None)
if prim is None:
            print('[WARN] primitive is not registered: ' + op.prim)
prim = cls.default_primtive
return prim
@classmethod
def input_relation(cls, op, input_idx):
return cls.get_prim(op).relation_func(op, input_idx)
@classmethod
def iter_type(cls, op):
return cls.get_prim(op).iter_type
@classmethod
def is_reduce(cls, op):
return cls.get_prim(op).iter_type == cls.REDUCE
@classmethod
def calibrate_iter_size(cls, op, iter_size):
return cls.get_prim(op).calibrate * iter_size
@classmethod
def dtype_bytes(cls, dtype):
bits, unit = 1, 1
for i in range(len(dtype) - 1, 0, -1):
if dtype[i].isdecimal():
bits += int(dtype[i]) * unit
unit *= 10
else:
break
return bits // 8
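
    # The digit suffix of the dtype name is parsed right to left, so e.g.
    # dtype_bytes('float32') -> 4 and dtype_bytes('int64') -> 8; the initial
    # bits = 1 rounds 'float16' up (17 // 8 == 2 bytes).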
@classmethod
def inplace_reuse(cls, op, input_idx, start_axis=0):
if cls.dtype_bytes(op.output.dtype) > cls.dtype_bytes(op.inputs[input_idx].dtype):
return False
_, elem_relation = cls.get_prim(op).relation_func(op, input_idx)
for i in range(start_axis, len(elem_relation)):
if elem_relation[i] != cls.ELEMWISE:
return False
return True
class Tensor:
"""Tensor"""
PARA_NONE = 0
PARA_INPUT = 1
PARA_OUTPUT = 2
class Buddy:
def __init__(self, leader):
self.members = [leader]
def __init__(self, name, shape, dtype, data_format=DataFormat.DEFAULT, para_type=0):
self.name = name
self.shape = shape
self.dtype = dtype
self.data_format = data_format
self.para_type = para_type
self.op = None
self.to_ops = []
self.buddy = None
def __str__(self):
return self.name + str(list(self.shape))
def __repr__(self):
return "%s.%s%s" % (self.name, self.dtype, str(list(self.shape)))
def get_size(self):
"""Get size"""
size = PrimLib.dtype_bytes(self.dtype)
for i in self.shape:
size *= i
return size
def add_buddy(self, tensor):
"""Add buddy"""
if self.buddy is None:
self.buddy = self.Buddy(self)
self.buddy.members.append(tensor)
tensor.buddy = self.buddy
class Value:
"""Value"""
def __init__(self, name, dtype, value, data_format=DataFormat.DEFAULT):
self.name = name
self.shape = [1]
self.dtype = dtype
self.value = value
self.data_format = data_format
def __str__(self):
return self.name + str(list(self.shape)) + str(self.value)
def __repr__(self):
return "%s.%s%s%s" % (self.name, self.dtype, str(list(self.shape)), str(self.value))
def get_size(self):
return 1
class Operator:
"""Operator"""
    def __init__(self, primitive, inputs, output, attrs):
        self.prim = primitive
self.inputs = inputs
self.output = output
self.attrs = attrs
for t in inputs:
t.to_ops.append(self)
if output.op is None:
output.op = self
self.all_inputs = [] # include Tensor inputs and Value inputs.
def __str__(self):
args = ', '.join([str(t) for t in self.all_inputs])
expr = "%s = %s.%s(%s)" % (
str(self.output), self.prim, self.output.dtype, args)
return expr if not self.attrs else '%s // %s' % (expr, str(self.attrs))
def __repr__(self):
return str(self)
class Graph:
"""Graph"""
def __init__(self, name, ops):
self.name = name
self.ops = ops # in topo order, can not use set
self.outputs = []
def set_processor(self, processor):
"""Set processor"""
self.processor = processor
def add(self, ops):
"""Add ops"""
if isinstance(ops, Operator):
self.ops.append(ops)
else:
self.ops.extend(ops)
def extract_subgraph(self, graph_name, tensor_names, difference=False):
"""Extract subgraph from this graph"""
graph = Graph(graph_name, [])
outputs = set(tensor_names)
if difference:
for op in self.ops:
if op.output.name not in outputs:
graph.add(op)
else:
for op in self.ops:
if op.output.name in outputs:
graph.add(op)
outputs.remove(op.output.name)
for name in outputs:
raise ValueError("invalid input tensor : " + name)
return graph
def deduce_parameters(self):
"""Deduce parameters"""
inputs, outputs = [], []
for op in self.ops:
for t in op.inputs:
if t not in inputs and t.op not in self.ops:
inputs.append(t)
if op.output not in outputs:
if op.output.para_type == Tensor.PARA_OUTPUT or not op.output.to_ops:
outputs.append(op.output)
else:
for d in op.output.to_ops:
if d not in self.ops:
outputs.append(op.output)
break
if self.outputs:
outputs = self.outputs
return inputs, outputs
def __str__(self):
inputs, outputs = self.deduce_parameters()
para_str = ', '.join([repr(t) for t in inputs])
out_str = ', '.join([repr(t) for t in outputs])
lines = []
lines.append("%s(%s) -> %s {" % (self.name, para_str, out_str))
for op in self.ops:
lines.append(' ' + str(op))
lines.append('}')
return '\n'.join(lines)
def __repr__(self):
return str(self)
def dump(self):
"""Dump Graph to json"""
attr_name = {'reduce_axis': 'axis'}
inputs, outputs = self.deduce_parameters()
input_desc, output_desc, op_desc = [], [], []
for t in inputs:
input_desc.append([{'data_type': t.dtype, 'shape': t.shape,
'tensor_name': t.name, 'format': t.data_format}])
for t in outputs:
output_desc.append({'data_type': t.dtype, 'shape': t.shape,
'tensor_name': t.name, 'format': t.data_format})
for op in self.ops:
attrs, in_desc = [], []
for a in op.attrs:
name = attr_name.get(a, a)
attrs.append(
{'name': name, 'value': op.attrs[a], 'data_type': Utils.get_attr_type(op.attrs[a])})
for t in op.all_inputs:
if isinstance(t, Tensor):
in_desc.append([{'data_type': t.dtype, 'name': '', 'shape': t.shape,
'tensor_name': t.name, 'format': t.data_format}])
else:
in_desc.append([{'data_type': t.dtype, 'value': t.value, 'name': '', 'shape': t.shape,
'tensor_name': t.name, 'format': t.data_format}])
            out_desc = [{'data_type': op.output.dtype, 'name': '', 'shape': op.output.shape,
                         'tensor_name': op.output.name, 'format': op.output.data_format}]
op_desc.append({'attr': attrs, 'impl_path': '',
'input_desc': in_desc, 'name': op.prim, 'output_desc': out_desc})
graph_desc = {'composite': True, 'composite_graph': '', 'id': 0,
'input_desc': input_desc, 'op': self.name, 'op_desc': op_desc, 'output_desc': output_desc,
'platform': 'AKG', 'process': self.processor}
return graph_desc
class GraphVisitor:
"""Graph visitor"""
def __init__(self, forward=True, once_mode=True):
self.forward = forward
self.once_mode = once_mode
if self.once_mode:
self.visited = set()
def visit_graph(self, graph):
"""Visit graph"""
inputs, outputs = graph.deduce_parameters()
if self.forward:
for tensor in inputs:
for op in tensor.to_ops:
self.visit(op)
else:
for tensor in outputs:
if not tensor.to_ops:
self.visit(tensor.op)
def visit(self, op):
"""Visit op"""
next_ops = op.output.to_ops if self.forward else [
t.op for t in op.inputs if t.op is not None]
if self.once_mode:
self.visited.add(op)
for n in next_ops:
if n not in self.visited:
self.visit(n)
else:
for n in next_ops:
self.visit(n)
class AlignShape(GraphVisitor):
"""Align shape"""
def __init__(self):
super().__init__(once_mode=False)
def visit(self, op):
prim = PrimLib.get_prim(op)
if prim.iter_type in (PrimLib.ELEMWISE, PrimLib.BROADCAST, PrimLib.REDUCE):
out_dim = len(op.output.shape)
align_dim = out_dim
for t in op.inputs:
if len(t.shape) > align_dim:
align_dim = len(t.shape)
if align_dim > out_dim:
op.output.shape = [1] * (align_dim - out_dim) + op.output.shape
super().visit(op)
class AddControlBuddy(GraphVisitor):
"""Add control buddy"""
def __init__(self):
super().__init__()
self.buddies = {} # {op : [ctrl_op]}
def visit(self, op):
if PrimLib.iter_type(op) == PrimLib.CONTROL:
assert len(op.output.to_ops) == 1
owner = op.output.to_ops[0]
if owner in self.buddies:
self.buddies[owner].append(op)
else:
self.buddies[owner] = [op]
if op in self.buddies:
ops = self.buddies.pop(op)
self.buddies[owner].extend(ops)
super().visit(op)
def visit_graph(self, graph):
super().visit_graph(graph)
for owner in self.buddies:
for op in self.buddies[owner]:
owner.add_buddy(op.output)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""GraphKernel model builder"""
import copy
from .model import PrimLib, Tensor, Value, Operator, Graph, AlignShape, AddControlBuddy
class OpInfer:
"""Op infer"""
@staticmethod
def default_reduce_infer(inputs, attrs):
shape = copy.deepcopy(inputs[0].shape)
for i in attrs['reduce_axis']:
shape[i] = 1
return shape
default_infer_shape_func = [
None,
lambda inputs, attrs: max([t.shape for t in inputs]),
lambda inputs, attrs: max([t.shape for t in inputs]),
default_reduce_infer.__func__,
None,
lambda inputs, attrs: [1], # control op
]
@staticmethod
def default_infer_dtype_func(inputs, attrs):
"""Infer dtype"""
# pylint: disable=unused-argument
return inputs[0].dtype
@staticmethod
def default_infer_format_func(inputs, attrs):
"""Infer format"""
# pylint: disable=unused-argument
return inputs[0].data_format
infer_shape_func = {
# add special infer func here
}
infer_dtype_func = {
# add special infer func here
'Cast': lambda inputs, attrs: attrs['dst_type'],
}
infer_format_func = {
# add special infer func here
}
@classmethod
def infer(cls, prim_name, inputs, attrs):
prim = PrimLib.primtives[prim_name]
infer_shape = cls.infer_shape_func.get(
prim_name, cls.default_infer_shape_func[prim.iter_type])
infer_dtype = cls.infer_dtype_func.get(
prim_name, cls.default_infer_dtype_func)
infer_format = cls.infer_format_func.get(
prim_name, cls.default_infer_format_func)
return infer_shape(inputs, attrs), infer_dtype(inputs, attrs), infer_format(inputs, attrs)
class GraphBuilder:
"""Graph builder"""
class GraphWrapper:
def __init__(self, name):
self.graph = Graph(name, [])
def set_output(self, *para):
for t in para:
t.para_type = Tensor.PARA_OUTPUT
self.graph.outputs.append(t)
def __init__(self):
self.graphs = []
self.current = None
self.name_id = 0
def _alloc_tensor_name(self):
tid = self.name_id
self.name_id += 1
return "t%d" % (tid)
def graph_scope(self, name):
"""The graph scope to be processed"""
class GraphScope:
def __init__(self, gb):
self.gb = gb
def __enter__(self):
return self.gb.current
def __exit__(self, ptype, value, trace):
self.gb.graphs.append(self.gb.current.graph)
self.gb.current = None
assert self.current is None
self.current = self.GraphWrapper(name)
return GraphScope(self)
def tensor(self, shape, dtype, data_format="DefaultFormat", name=None, para_type=Tensor.PARA_NONE):
"""Create a new Tensor"""
if name in (None, ''):
name = self._alloc_tensor_name()
if not shape:
shape = [1]
return Tensor(name, shape, dtype, data_format, para_type=para_type)
def value(self, dtype, value, data_format, name=None):
"""Create a new Value"""
if name in (None, ''):
name = self._alloc_tensor_name()
return Value(name, dtype, value, data_format)
def op(self, prim, output, inputs, attrs=None):
"""Insert an operator into graph"""
if attrs is None:
attrs = {}
if isinstance(inputs, Tensor):
inputs = [inputs]
tensor_inputs = [t for t in inputs if isinstance(t, Tensor)]
node = Operator(prim, tensor_inputs, output, attrs)
node.all_inputs = inputs
self.current.graph.add(node)
def emit(self, prim, inputs, name=None, attrs=None):
"""Emit a new operation"""
if attrs is None:
attrs = {}
if isinstance(inputs, Tensor):
inputs = [inputs]
tensor_inputs = [t for t in inputs if isinstance(t, Tensor)]
out_shape, out_dtype, out_format = OpInfer.infer(prim, tensor_inputs, attrs)
output = self.tensor(out_shape, out_dtype, out_format, name)
self.op(prim, output, inputs, attrs)
return output
def get(self):
return self.graphs
class CompositeGraph:
"""Composite Graph"""
def __init__(self):
self.graph = None
self.desc = None
self.tensors = {} # name : Tensor
def refine(self):
"""Refine Graph"""
AlignShape().visit_graph(self.graph)
AddControlBuddy().visit_graph(self.graph)
def load(self, desc):
"""Load Graph from json"""
def _attr_of(op, inputs, output):
attr = {}
if op['name'] not in ('ReduceSum', 'ReduceMax', 'ReduceMin'):
return attr
for a in op['attr']:
if a['name'] == 'axis':
red_axis, dim_size = [], len(inputs[0].shape)
if not a['value']:
assert len(output.shape) == len(inputs[0].shape)
for i in range(len(output.shape)):
if output.shape[i] == 1 and inputs[0].shape[i] > 1:
red_axis.append(i)
else:
for i in a['value']:
red_axis.append(i if i >= 0 else dim_size + i)
attr['reduce_axis'] = red_axis
break
return attr
builder = GraphBuilder()
with builder.graph_scope(desc['op']):
for in_desc in desc['input_desc']:
name, shape, dtype, data_format = in_desc[0]['tensor_name'], in_desc[
0]['shape'], in_desc[0]['data_type'], in_desc[0]['format']
self.tensors[name] = builder.tensor(
shape, dtype, data_format, name=name, para_type=Tensor.PARA_INPUT)
for out_desc in desc['output_desc']:
name, shape, dtype, data_format = out_desc['tensor_name'], out_desc[
'shape'], out_desc['data_type'], out_desc['format']
self.tensors[name] = builder.tensor(
shape, dtype, data_format, name=name, para_type=Tensor.PARA_OUTPUT)
cur_fusion = None
for op in desc['op_desc']:
inputs = [self.tensors[d[0]['tensor_name']]
for d in op['input_desc'] if 'value' not in d[0]]
out_desc = op['output_desc']
name, shape, dtype, data_format = out_desc[0]['tensor_name'], out_desc[
0]['shape'], out_desc[0]['data_type'], out_desc[0]['format']
if op['name'] == 'InplaceAssign':
inputs[0].add_buddy(inputs[1])
inputs[1].para_type = Tensor.PARA_OUTPUT
output = inputs[2]
self.tensors[name] = output
else:
output = self.tensors.get(name, None)
if not output:
output = builder.tensor(
shape, dtype, data_format, name=name)
self.tensors[name] = output
builder.op(op['name'], output, inputs,
attrs=_attr_of(op, inputs, output))
if 'fusion' in op:
if cur_fusion is None:
cur_fusion = output
else:
cur_fusion.add_buddy(output)
if op['fusion'].endswith('_end'):
cur_fusion = None
self.graph = builder.get()[0]
self.desc = desc
def dump(self, subgraph):
"""Dump Graph to json"""
desc = {}
inputs, outputs = subgraph.deduce_parameters()
graph_ops = set(subgraph.ops)
inplace_assign = {} # y_name, output_name
inplace_assign_z = None
for op in self.desc['op_desc']:
if op['name'] == 'InplaceAssign':
inplace_assign[op['input_desc'][1][0]['tensor_name']] = op['output_desc'][0]['tensor_name']
if inplace_assign:
for t in outputs:
if t.name not in inplace_assign:
inplace_assign_z = t
for key in self.desc:
if key == 'input_desc':
desc[key] = [
[{'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name}] for t in inputs]
elif key == 'output_desc':
out_desc = []
for t in outputs:
if t.name in inplace_assign:
z = inplace_assign_z if inplace_assign_z is not None else self.tensors[t.name]
out_desc.append(
{'data_type': z.dtype, 'shape': z.shape, 'tensor_name': inplace_assign[t.name]})
else:
out_desc.append(
{'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name})
desc[key] = out_desc
elif key == 'op_desc':
op_desc = []
for d in self.desc[key]:
if d['name'] == 'InplaceAssign':
y = d['input_desc'][1][0]['tensor_name']
if self.tensors[y].op in graph_ops:
z, fake = (inplace_assign_z, False) if inplace_assign_z is not None else (
self.tensors[y], True)
inplace_desc = copy.deepcopy(d)
inplace_desc['attr'] = {'name': 'fake_output', 'value': fake}
                        z_desc, out_desc = inplace_desc['input_desc'][2][0], inplace_desc['output_desc'][0]
z_desc['shape'] = z.shape
z_desc['data_type'] = z.dtype
z_desc['tensor_name'] = z.name
out_desc['shape'] = z.shape
out_desc['data_type'] = z.dtype
op_desc.append(inplace_desc)
else:
op = self.tensors[d['output_desc'][0]['tensor_name']].op
if op in graph_ops:
op_desc.append(d)
desc[key] = op_desc
elif key == 'op':
desc[key] = subgraph.name
else:
desc[key] = self.desc[key]
return desc
def load_composite(desc):
"""Load composite kernel"""
composite = CompositeGraph()
composite.load(desc)
composite.refine()
return composite
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""GraphKernel splitter"""
import json
import json.decoder as jd
import traceback
from mindspore import log as logger
from . import model
def split_with_json(json_str: str):
"""Call costmodel to split GraphKernel"""
try:
graph_desc = json.loads(json_str)
comp = model.load_composite(graph_desc)
graph_split = model.split(comp.graph)
is_multi_graph = len(graph_split) > 1
graph_list = list(map(comp.dump, graph_split))
result = {"multi_graph": is_multi_graph, "graph_desc": graph_list}
return json.dumps(result)
except jd.JSONDecodeError:
logger.error(traceback.format_exc())
return None
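

# A minimal round-trip sketch (illustrative; the field names below match what
# CompositeGraph.dump() emits, but the exact payload is produced by the C++
# caller):
#
#   result = split_with_json(json.dumps(composite_graph_desc))
#   if result is not None:
#       for sub_desc in json.loads(result)["graph_desc"]:
#           ...  # one self-contained kernel desc per subgraph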
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
PYTHONPATH="$(pwd)/..:${PYTHONPATH}"
export PYTHONPATH
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""graph kernel split"""
import json
import getopt
import sys
import model
def print_usage():
print('Usage: graph_kernel_split.py [OPTION] <JSON_FILE>')
print('Options:')
print(' -s <config/auto>\tsplit graph with config')
print(' -e \t\testimate graph')
print(' -i \t\tnaive estimate')
print(' -o <prefix>\toutput split graphs')
print(' -v \t\tverbose mode')
print(' -h \t\tprint this help')
print('Report bugs to xiong.gao@huawei.com')
class Option:
"""Options"""
def __init__(self):
self.split = None
self.estimate = False
self.estimate_naive = False
self.output = None
self.verbose = False
self.help = False
def parse(self, options):
"""parse options"""
for name, val in options:
if name == '-h':
self.help = True
elif name == '-v':
self.verbose = True
elif name == '-o':
self.output = val
elif name == '-e':
self.estimate = True
elif name == '-s':
self.split = val
elif name == '-i':
self.estimate_naive = True
opt = Option()
def estimate(graph_in, parts_in, naive):
"""estimate graphs costs"""
def _print_cost(name, c):
print("%s\tdma_ratio=%f, saturation=%f, mix_saturation=%f, type=%s" %
(name, c.dma_ratio(), c.saturation(), c.mix_saturation(), c.cost_type()))
main_cost, _ = model.estimate(graph_in, naive)
split_cost, sub_costs = model.estimate(parts_in, naive) if parts_in else (None, None)
_print_cost("MainGraph:", main_cost)
if parts_in:
_print_cost("Subgraphs:", split_cost)
if opt.verbose:
for i, sub_cost in enumerate(sub_costs):
_print_cost(" |_%d:\t" % (i), sub_cost)
def split_graph(graph_in, config):
"""split graph"""
if config == 'auto':
return model.split(graph_in)
subgraphs = []
all_tensors = []
subgraph_idx = 0
config_parts = config.split('|')
for part in config_parts:
tensor_names = part.split(',')
graph_name = "%s_%d" % (graph_in.name, subgraph_idx)
g = graph_in.extract_subgraph(graph_name, tensor_names)
assert len(g.ops) == len(tensor_names)
subgraphs.append(g)
all_tensors += tensor_names
subgraph_idx += 1
if len(all_tensors) < len(graph_in.ops):
graph_name = "%s_%d" % (graph_in.name, subgraph_idx)
g = graph_in.extract_subgraph(graph_name, all_tensors, True)
subgraphs.append(g)
return subgraphs
def main():
opts, args = getopt.getopt(sys.argv[1:], 'heivo:s:')
opt.parse(opts)
if len(args) != 1 or opt.help:
print_usage()
sys.exit(0)
in_file = args[0]
with open(in_file, 'r') as f:
desc = json.loads(f.read())
comp = model.load_composite(desc)
graph = comp.graph
parts = []
# 1. split sub-graphs
if opt.split is not None:
parts = split_graph(graph, opt.split)
if opt.verbose:
print('----------- main graph --------------')
print(graph)
for i, _ in enumerate(parts):
print('---------------- sub graph %d ---------------' % (i))
print(parts[i])
# 2. estimate cost
if opt.estimate:
print('------------- cost --------------')
estimate(graph, parts, False)
if opt.estimate_naive:
print('------------- naive cost --------------')
estimate(graph, parts, True)
# 3. output parts
if opt.output is not None:
for graph_part in parts:
desc = comp.dump(graph_part)
s_desc = json.dumps(desc)
fname = "%s_%s.json" % (opt.output, graph_part.name)
with open(fname, 'w', encoding='utf-8') as of:
of.write(s_desc)
if __name__ == '__main__':
main()
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""test split"""
import model
def graph_1():
gb = model.GraphBuilder()
with gb.graph_scope("main"):
a = gb.tensor([1024, 16], "float32", name="a")
b = gb.emit("Abs", a, 'b')
c = gb.emit("Abs", b, 'c')
d = gb.emit("Abs", c, 'd')
gb.emit("TensorAdd", [b, d], "e")
return gb.get()[0]
def graph_2():
gb = model.GraphBuilder()
with gb.graph_scope("main"):
a = gb.tensor([1024, 16], "float32", name="a")
b = gb.emit("Abs", a, 'b')
c = gb.emit("Abs", b, 'c')
d = gb.emit("ReduceSum", c, 'd', attrs={'reduce_axis': (1,)})
gb.emit("Sqrt", d, 'e')
return gb.get()[0]
def test_split_by_pattern():
def _test(graph):
print("***************** main graph ***************")
print(graph)
subgraphs = model.split(graph)
for i, g in enumerate(subgraphs):
print('------------- subgraph {} --------------'.format(i))
print(g)
_test(graph_2())
if __name__ == '__main__':
test_split_by_pattern()
......
@@ -71,7 +71,8 @@ if(ENABLE_GPU)
"runtime/device/gpu/*.cu"
"backend/kernel_compiler/gpu/*.cu"
"backend/kernel_compiler/akg/gpu/*.cc"
"backend/kernel_compiler/akg/akg_kernel_build.cc"
"backend/kernel_compiler/akg/akg_kernel_json_generator.cc"
"backend/kernel_compiler/akg/akg_kernel_json_decoder.cc"
"backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
)
......
......
@@ -10,7 +10,8 @@ if (ENABLE_D)
"kernel_query.cc"
"kernel_fusion.cc"
"akg/ascend/*.cc"
"akg/akg_kernel_build.cc"
"akg/akg_kernel_json_generator.cc"
"akg/akg_kernel_json_decoder.cc"
"akg/akg_kernel_attrs_process.cc"
"akg/akg_kernel_metadata.cc"
"tbe/*.cc"
......
@@ -49,7 +50,8 @@ if (ENABLE_GPU)
file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"gpu/*.cu"
"akg/gpu/*.cc"
"akg/akg_kernel_build.cc"
"akg/akg_kernel_json_generator.cc"
"akg/akg_kernel_json_decoder.cc"
"akg/akg_kernel_attrs_process.cc"
)
......
......
@@ -24,7 +24,6 @@
#include <climits>
#include "runtime/device/kernel_runtime.h"
#include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h"
#include "backend/kernel_compiler/akg/akg_kernel_build.h"
#include "proto/tensor.pb.h"
#include "proto/tensor_shape.pb.h"
#include "proto/attr.pb.h"
......
@@ -33,6 +32,7 @@
#include "backend/kernel_compiler/aicpu/aicpu_util.h"
#include "backend/session/kernel_graph.h"
#include "backend/kernel_compiler/common_utils.h"
#include "backend/kernel_compiler/oplib/oplib.h"
namespace mindspore {
namespace kernel {
......
......
@@ -15,13 +15,20 @@
*/
#include "backend/kernel_compiler/akg/akg_kernel_attrs_process.h"
#include <vector>
#include <memory>
#include <string>
#include <unordered_map>
#include <algorithm>
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/common/helper.h"
#include "backend/kernel_compiler/common_utils.h"
#include "base/core_ops.h"
#include "utils/utils.h"
namespace mindspore {
namespace kernel {
namespace {
void SetAkgAttrsForFour2Five(const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
// The x and output are akg op input and output param.
......
@@ -169,5 +176,29 @@ void SetAkgAttrsForBN2Relu(const AnfNodePtr &anf_node) {
AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(bn2_input_names), anf_node);
AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(bn2_output_names), anf_node);
}
const std::unordered_map<std::string, std::function<void(const AnfNodePtr &anf_node)>> kAkgKernelAttrsProcessMap = {
{kFour2FiveOpName, SetAkgAttrsForFour2Five},
{kFive2FourOpName, SetAkgAttrsForFive2Four},
{kCastOpName, SetAkgAttrsForCast},
{kBNGrad1OpName, SetAkgAttrsForBNGrad1},
{kBNGrad2OpName, SetAkgAttrsForBNGrad2},
{kBNGrad3OpName, SetAkgAttrsForBNGrad3},
{kFusedBN1OpName, SetAkgAttrsForFusedBN1},
{kFusedBN2OpName, SetAkgAttrsForFusedBN2},
{kFusedBN3OpName, SetAkgAttrsForFusedBN3},
{kConvBN1OpName, SetAkgAttrsForConvBN1},
{kBN2AddReluOpName, SetAkgAttrsForBN2AddRelu},
{kBN2ReLUOpName, SetAkgAttrsForBN2Relu},
};
} // namespace
void SetAkgKernelAttrs(const AnfNodePtr &anf_node) {
auto it = kAkgKernelAttrsProcessMap.find(AnfAlgo::GetCNodeName(anf_node));
if (it != kAkgKernelAttrsProcessMap.end()) {
it->second(anf_node);
}
}
} // namespace kernel
} // namespace mindspore
......
@@ -16,43 +16,13 @@
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_ATTRS_PROCESS_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_ATTRS_PROCESS_H
#include <vector>
#include <memory>
#include <string>
#include <unordered_map>
#include "ir/anf.h"
#include "utils/utils.h"
#include "base/core_ops.h"
namespace mindspore {
namespace kernel {
void SetAkgAttrsForFour2Five(const AnfNodePtr &anf_node);
void SetAkgAttrsForFive2Four(const AnfNodePtr &anf_node);
void SetAkgAttrsForCast(const AnfNodePtr &anf_node);
void SetAkgAttrsForBNGrad1(const AnfNodePtr &anf_node);
void SetAkgAttrsForBNGrad2(const AnfNodePtr &anf_node);
void SetAkgAttrsForBNGrad3(const AnfNodePtr &anf_node);
void SetAkgAttrsForFusedBN1(const AnfNodePtr &anf_node);
void SetAkgAttrsForFusedBN2(const AnfNodePtr &anf_node);
void SetAkgAttrsForFusedBN3(const AnfNodePtr &anf_node);
void SetAkgAttrsForConvBN1(const AnfNodePtr &anf_node);
void SetAkgAttrsForBN2AddRelu(const AnfNodePtr &anf_node);
void SetAkgAttrsForBN2Relu(const AnfNodePtr &anf_node);
const std::unordered_map<std::string, std::function<void(const AnfNodePtr &anf_node)>> kAkgKernelAttrsProcessMap = {
{kFour2FiveOpName, SetAkgAttrsForFour2Five},
{kFive2FourOpName, SetAkgAttrsForFive2Four},
{"Cast", SetAkgAttrsForCast},
{kBNGrad1OpName, SetAkgAttrsForBNGrad1},
{kBNGrad2OpName, SetAkgAttrsForBNGrad2},
{kBNGrad3OpName, SetAkgAttrsForBNGrad3},
{kFusedBN1OpName, SetAkgAttrsForFusedBN1},
{kFusedBN2OpName, SetAkgAttrsForFusedBN2},
{kFusedBN3OpName, SetAkgAttrsForFusedBN3},
{kConvBN1OpName, SetAkgAttrsForConvBN1},
{kBN2AddReluOpName, SetAkgAttrsForBN2AddRelu},
{kBN2ReLUOpName, SetAkgAttrsForBN2Relu},
};
void SetAkgKernelAttrs(const AnfNodePtr &anf_node);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_ATTRS_PROCESS_H
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKGKERNELBUILD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKGKERNELBUILD_H_
#include <unordered_map>
#include <string>
#include <vector>
#include <memory>
#include <map>
#include <utility>
#include "backend/kernel_compiler/kernel.h"
#include "ir/dtype.h"
#include "ir/primitive.h"
#include <nlohmann/json.hpp>
#include "backend/kernel_compiler/common_utils.h"
#include "backend/kernel_compiler/oplib/oplib.h"
namespace mindspore {
namespace kernel {
class AkgKernelBuild {
public:
AkgKernelBuild() {
input_tensor_idx_ = {};
output_tensor_idx_ = 0;
}
~AkgKernelBuild() = default;
KernelPackPtr BuildByJson(const AnfNodePtr &anf_node, std::vector<size_t> *const input_size,
std::vector<size_t> *const output_size);
static std::string GetProcessor(const AnfNodePtr &anf_node);
protected:
bool CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const inputs_json);
bool CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const outputs_json);
bool CreateAttrDescJson(const AnfNodePtr &anf_node, const std::string &op_name,
const std::shared_ptr<OpInfo> &op_info, nlohmann::json *const attrs_json);
KernelPackPtr OpBuild(const std::string &node_json, const AnfNodePtr &anf_node);
int GetOpCntInc();
size_t GetInputTensorIdxInc(const AnfNodePtr &anf_node, size_t input_idx);
size_t GetOutputTensorIdxInc();
bool GenerateSingleKernelJson(const AnfNodePtr &anf_node, const std::string &op_name,
nlohmann::json *const node_json);
static int op_cnt_;
// lock for variable fusionOpCnt in singleton mode
static std::mutex op_cnt_mtx_;
std::string json_name_;
std::string json_info_;
std::unordered_map<AnfNodePtr, size_t> input_tensor_idx_;
size_t output_tensor_idx_;
};
bool GetIOSize(const nlohmann::json &node_json, std::vector<size_t> *const input_size,
std::vector<size_t> *const output_size);
void SetTensorName(const std::string &tag, const std::string &new_name, const std::pair<size_t, size_t> &position,
nlohmann::json *const node_json);
std::string GetTensorName(const nlohmann::json &node_json, const std::string &tag,
const std::pair<size_t, size_t> &position);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKGKERNELBUILD_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/akg/akg_kernel_json_decoder.h"
#include <string>
#include <memory>
#include <vector>
#include <sstream>
#include <algorithm>
#include <unordered_map>
#include <unordered_set>
#include "backend/kernel_compiler/akg/akg_kernel_json_generator.h"
#include "ir/anf.h"
#include "ir/func_graph.h"
#include "ir/meta_tensor.h"
#include "ir/manager.h"
#include "ir/dtype.h"
#include "frontend/operator/ops.h"
#include "utils/convert_utils.h"
#include "utils/convert_utils_py.h"
#include "utils/utils.h"
#include "ir/graph_utils.h"
#include "runtime/device/kernel_info.h"
#include "pipeline/jit/parse/data_converter.h"
#include "pipeline/jit/parse/python_adapter.h"
#include "backend/kernel_compiler/common_utils.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "debug/anf_ir_dump.h"
namespace mindspore {
namespace kernel {
namespace {
ValuePtr ParseValue(const nlohmann::json &attr_json, const std::string &type) {
if (type == "str") {
std::string value = attr_json[kJsonKeyValue];
return MakeValue(value);
} else if (type == "int") {
int value = attr_json[kJsonKeyValue];
return MakeValue(value);
} else if (type == "bool") {
bool value = attr_json[kJsonKeyValue];
return MakeValue(value);
} else if (type == "float") {
float value = attr_json[kJsonKeyValue];
return MakeValue(value);
} else if (type == "listInt") {
std::vector<int> value = attr_json[kJsonKeyValue];
return MakeValue(value);
} else if (type == "listStr") {
std::vector<std::string> value = attr_json[kJsonKeyValue];
return MakeValue(value);
} else {
MS_LOG(ERROR) << "Unknown type of attr: " << type << ", json: \n" << attr_json;
return nullptr;
}
}
bool DecodeAttrs(const nlohmann::json &attrs_json, std::map<std::string, ValuePtr> *attrs) {
MS_EXCEPTION_IF_NULL(attrs);
MS_LOG(DEBUG) << "start decode attrs, " << attrs_json;
// decode attrs.
if (attrs_json.find(kJsonKeyAttr) == attrs_json.end() || attrs_json[kJsonKeyAttr].is_null()) {
// attrs maybe empty
return true;
}
std::vector<nlohmann::json> attr_descs = attrs_json[kJsonKeyAttr];
for (const auto &attr_desc : attr_descs) {
std::string name = attr_desc[kJsonKeyName];
std::string type = attr_desc[kJsonKeyDataType];
auto value = ParseValue(attr_desc, type);
if (value == nullptr) {
return false;
}
(*attrs)[name] = value;
}
return true;
}
// python utils.
constexpr auto kGetPythonOpFunc = "_get_python_op";
constexpr auto kParallelUtilsModule = "mindspore.parallel._utils";
// almost all ops are defined in this path.
constexpr auto kOperationsModule = "mindspore.ops.operations";
const std::map<std::string, std::vector<std::string>> op_attrs_map = {
{kReduceSumOpName, std::vector<std::string>{kAttrKeepDims}},
{kReduceMaxOpName, std::vector<std::string>{kAttrKeepDims}},
{kReduceMinOpName, std::vector<std::string>{kAttrKeepDims}},
};
ValuePtr CreatOpInstance(const std::string &op_name, const std::vector<ValuePtr> &attrs) {
py::module mod = py::module::import(kOperationsModule);
if (!py::hasattr(mod, op_name.c_str())) {
    MS_LOG(ERROR) << kOperationsModule << " does not have attr: " << op_name;
return nullptr;
}
std::vector<py::object> arg_list;
(void)std::transform(attrs.begin(), attrs.end(), std::back_inserter(arg_list),
[](const ValuePtr &attr) { return ValuePtrToPyData(attr); });
py::object obj = parse::python_adapter::CallPyFn(kParallelUtilsModule, kGetPythonOpFunc, op_name, kOperationsModule,
op_name, arg_list);
ValuePtr op_instance = nullptr;
bool succ = parse::ConvertData(obj, &op_instance);
if (!succ) {
MS_LOG(ERROR) << "Get python op " << op_name << " from " << kOperationsModule << " failed.";
return nullptr;
}
return op_instance;
}
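// The call above resolves the operator class from mindspore.ops.operations by name
// and instantiates it with arg_list (via _get_python_op); parse::ConvertData then
// converts the resulting Python object back into a ValuePtr.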
PrimitivePtr GetPrimitive(const std::string &op_name, const std::map<std::string, ValuePtr> &attrs_val) {
PrimitivePtr primitive{nullptr};
if (op_attrs_map.count(op_name) == 0) {
// no attrs for op instance.
    // CreatOpInstance may return nullptr on failure; guard before casting.
    auto op_instance = CreatOpInstance(op_name, std::vector<ValuePtr>{});
    primitive = (op_instance == nullptr) ? nullptr : op_instance->cast<PrimitivePtr>();
} else {
// make attrs for op instance.
std::vector<ValuePtr> op_attrs;
const auto &attr_names = op_attrs_map.at(op_name);
for (const auto &attr_name : attr_names) {
if (attrs_val.count(attr_name) == 0) {
MS_LOG(ERROR) << "Attr: " << attr_name << " for: " << op_name << " not found.";
return nullptr;
}
op_attrs.push_back(attrs_val.at(attr_name));
}
    auto op_instance = CreatOpInstance(op_name, op_attrs);
    primitive = (op_instance == nullptr) ? nullptr : op_instance->cast<PrimitivePtr>();
}
if (primitive != nullptr) {
for (const auto &attr : attrs_val) {
primitive->AddAttr(attr.first, attr.second);
}
}
return primitive;
}
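// Worked example: for op_name "ReduceSum" (listed in op_attrs_map above), attrs_val
// must provide "keep_dims"; the op instance is created as ReduceSum(keep_dims) and
// every decoded attr is then attached to the primitive via AddAttr.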
} // namespace
constexpr auto kIsFeatureMapOutput = "IsFeatureMapOutput";
constexpr auto kIsFeatureMapInputList = "IsFeatureMapInputList";
ScalarPtr AkgKernelJsonDecoder::DecodeScalar(const nlohmann::json &scalar_json) {
auto type_id = DtypeToTypeId(scalar_json[kJsonKeyDataType]);
switch (type_id) {
case kNumberTypeFloat16:
case kNumberTypeFloat32:
return std::make_shared<FP32Imm>(scalar_json[kJsonKeyValue]);
case kNumberTypeInt32:
return std::make_shared<Int32Imm>(scalar_json[kJsonKeyValue]);
default:
MS_LOG(ERROR) << "Unknown type: " << scalar_json[kJsonKeyDataType];
break;
}
return nullptr;
}
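// Note: float16 shares the float32 branch above (there is no FP16Imm case), so a
// scalar such as {"data_type": "float16", "value": 1.0} decodes to FP32Imm(1.0).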
ValueNodePtr AkgKernelJsonDecoder::DecodeValueNode(const nlohmann::json &value_json, const FuncGraphPtr &func_graph) {
MS_LOG(DEBUG) << "start decode value node, " << value_json;
auto scalar = DecodeScalar(value_json);
auto tensor = ScalarToTensor(scalar);
auto value_node = std::make_shared<ValueNode>(tensor);
value_node->set_abstract(tensor->ToAbstract());
  // create kernel_info for the new value node.
auto kernel_info = std::make_shared<device::KernelInfo>();
value_node->set_kernel_info(kernel_info);
// create kernel_build_info for new value node.
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
// layout info.
builder->SetOutputsFormat(std::vector<std::string>{value_json[kJsonKeyFormat]});
builder->SetOutputsDeviceType(std::vector<TypeId>{DtypeToTypeId(value_json[kJsonKeyDataType])});
AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), value_node.get());
func_graph->AddValueNode(value_node);
MS_LOG(DEBUG) << "decode value node success, " << value_node->DebugString(2);
return value_node;
}
ParameterPtr AkgKernelJsonDecoder::DecodeParameter(const nlohmann::json &parameter_json,
const FuncGraphPtr &func_graph) {
MS_LOG(DEBUG) << "start decode parameter, " << parameter_json;
ParameterPtr new_parameter = func_graph->add_parameter();
std::string name = parameter_json[kJsonKeyTensorName];
new_parameter->set_name(name);
auto kernel_info = std::make_shared<device::KernelInfo>();
new_parameter->set_kernel_info(kernel_info);
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
builder->SetOutputsFormat(std::vector<std::string>{parameter_json[kJsonKeyFormat]});
builder->SetOutputsDeviceType(std::vector<TypeId>{DtypeToTypeId(parameter_json[kJsonKeyDataType])});
AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), new_parameter.get());
nodes_map_[name] = new_parameter;
return new_parameter;
}
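// Illustrative input (hypothetical values): a parameter entry like
//   {"tensor_name": "input_0", "format": "DefaultFormat", "data_type": "float32"}
// yields a Parameter named "input_0", registered in nodes_map_ so that later
// cnodes can reference it by tensor name.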
CNodePtr AkgKernelJsonDecoder::DecodeCNode(const nlohmann::json &cnode_json, const FuncGraphPtr &func_graph,
const std::string &processor) {
Processor p = kernel::GetProcessor(processor);
MS_LOG(DEBUG) << "start decode cnode, " << cnode_json;
// decode attrs.
std::map<std::string, ValuePtr> cnode_attrs;
if (!DecodeAttrs(cnode_json, &cnode_attrs)) {
MS_LOG(ERROR) << "Error decode attrs.";
return nullptr;
}
std::string op_name = cnode_json[kJsonKeyName];
// new primitive.
auto primitive = GetPrimitive(op_name, cnode_attrs);
if (primitive == nullptr) {
MS_LOG(ERROR) << "Create primitive failed.";
return nullptr;
}
// data layout info.
std::vector<std::string> input_formats;
std::vector<TypeId> input_types;
std::vector<std::string> output_formats;
std::vector<TypeId> output_types;
// collect inputs.
auto primitive_v = NewValueNode(primitive);
func_graph->AddValueNode(primitive_v);
std::vector<AnfNodePtr> inputs{primitive_v};
std::vector<nlohmann::json> input_descs = cnode_json[kJsonKeyInputDesc];
for (size_t i = 0; i < input_descs.size(); ++i) {
nlohmann::json input_desc = input_descs[i][0];
std::string name = input_desc[kJsonKeyTensorName];
if (input_desc.find(kJsonKeyValue) != input_desc.end()) {
inputs.push_back(DecodeValueNode(input_desc, func_graph));
} else if (nodes_map_.count(name) == 0) {
MS_LOG(ERROR) << "Input: " << name << " of: " << op_name << " not found.";
return nullptr;
} else {
inputs.push_back(nodes_map_[name]);
}
input_formats.push_back(input_desc[kJsonKeyFormat]);
input_types.push_back(DtypeToTypeId(input_desc[kJsonKeyDataType]));
}
MS_LOG(DEBUG) << "decode inputs success.";
// new cnode.
auto cnode = func_graph->NewCNode(inputs);
func_graph->AddNode(cnode);
// decode outputs.
std::vector<nlohmann::json> output_descs = cnode_json[kJsonKeyOutputDesc];
AbstractBasePtr abstract(nullptr);
if (output_descs.empty()) {
MS_LOG(ERROR) << "No outputs found.";
return nullptr;
} else if (output_descs.size() == 1) {
// single output.
nlohmann::json output_desc = output_descs[0];
output_formats.push_back(output_desc[kJsonKeyFormat]);
output_types.push_back(DtypeToTypeId(output_desc[kJsonKeyDataType]));
nodes_map_[output_desc[kJsonKeyTensorName]] = cnode;
} else {
// multi outputs.
for (size_t j = 0; j < output_descs.size(); ++j) {
nlohmann::json output_desc = output_descs[j];
output_formats.push_back(output_desc[kJsonKeyFormat]);
output_types.push_back(DtypeToTypeId(output_desc[kJsonKeyDataType]));
auto get_item = func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), cnode, NewValueNode(SizeToInt(j))});
func_graph->AddNode(get_item);
nodes_map_[output_desc[kJsonKeyTensorName]] = get_item;
}
}
MS_LOG(DEBUG) << "decode outputs success.";
// create kernel_info.
auto kernel_info = std::make_shared<device::KernelInfo>();
std::vector<size_t> feature_map_input_indexs;
  // If the node only has the primitive (such as GetNext), or one of its inputs is a
  // feature map, then the node's output is a feature map output.
for (size_t index = 1; index < inputs.size(); ++index) {
auto node = AnfAlgo::VisitKernel(inputs[index], 0);
if (AnfAlgo::IsFeatureMapOutput(node.first)) {
feature_map_input_indexs.push_back(index);
}
}
if (AnfAlgo::GetCNodeName(cnode) == prim::kPrimCast->name()) {
AnfAlgo::SetNodeAttr(kIsBackendCast, MakeValue(false), cnode);
}
if (inputs.size() == 1 || !feature_map_input_indexs.empty()) {
kernel_info->SetFeatureMapFlag(true);
}
if (AnfAlgo::IsRealCNodeKernel(cnode)) {
AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode);
AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode);
}
cnode->set_kernel_info(kernel_info);
// create kernel_build_info.
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
builder->SetInputsFormat(input_formats);
builder->SetInputsDeviceType(input_types);
builder->SetOutputsFormat(output_formats);
builder->SetOutputsDeviceType(output_types);
builder->SetProcessor(p);
builder->SetKernelType(KernelType::AKG_KERNEL);
builder->SetFusionType(kernel::FusionType::OPAQUE);
AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), cnode.get());
return cnode;
}
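// A minimal op entry sketch (hypothetical names, following the keys used above):
//   {"name": "Mul", "attr": null,
//    "input_desc": [[{"tensor_name": "input_0", "format": "DefaultFormat", "data_type": "float32"}],
//                   [{"tensor_name": "input_1", "format": "DefaultFormat", "data_type": "float32"}]],
//    "output_desc": [{"tensor_name": "output_0_0", "format": "DefaultFormat", "data_type": "float32"}]}
// With more than one output_desc entry, a TupleGetItem is created per output, as above.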
FuncGraphPtr AkgKernelJsonDecoder::DecodeFusedNodes(const nlohmann::json &kernel_json) {
MS_LOG(DEBUG) << "start decode, " << kernel_json;
// clear cache.
nodes_map_.clear();
// create a graph.
auto graph = std::make_shared<FuncGraph>();
// decode parameters.
std::vector<nlohmann::json> input_descs = kernel_json[kJsonKeyInputDesc];
if (input_descs.empty()) {
MS_LOG(ERROR) << "Error decode parameter, no inputs for graph.";
return nullptr;
}
for (size_t i = 0; i < input_descs.size(); ++i) {
std::vector<nlohmann::json> input_desc = input_descs[i];
auto parameter = DecodeParameter(input_desc[0], graph);
if (parameter == nullptr) {
MS_LOG(ERROR) << "Error decode parameter.";
return nullptr;
}
}
MS_LOG(DEBUG) << "decode parameters success.";
// decode cnodes in graph.
std::vector<nlohmann::json> op_node_descs = kernel_json[kJsonKeyOpDesc];
if (op_node_descs.empty()) {
MS_LOG(ERROR) << "Error decode cnodes, no cnodes for graph.";
return nullptr;
}
for (const auto &op_desc : op_node_descs) {
auto op_node = DecodeCNode(op_desc, graph, kernel_json[kJsonKeyProcess]);
if (op_node == nullptr) {
MS_LOG(ERROR) << "Error decode cnode.";
return nullptr;
}
}
MS_LOG(DEBUG) << "decode cnodes success.";
// decode outputs of graph.
std::vector<nlohmann::json> output_descs = kernel_json[kJsonKeyOutputDesc];
if (output_descs.empty()) {
MS_LOG(ERROR) << "Error decode outputs, no outputs for graph.";
return nullptr;
}
std::vector<AnfNodePtr> outputs{NewValueNode(prim::kPrimMakeTuple)};
for (const auto &output_desc : output_descs) {
std::string name = output_desc[kJsonKeyTensorName];
if (nodes_map_.count(name) == 0) {
MS_LOG(ERROR) << "Output: " << name << " of graph not found.";
return nullptr;
}
outputs.push_back(nodes_map_[name]);
}
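  // outputs[0] is the MakeTuple primitive, so size 2 means a single real output.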
if (outputs.size() == 2) {
graph->set_output(outputs[1]);
} else {
auto output = graph->NewCNode(outputs);
graph->AddNode(output);
graph->set_output(output);
}
MS_LOG(DEBUG) << "decode success, " << kernel_json;
return graph;
}
FuncGraphPtr AkgKernelJsonDecoder::DecodeFusedNodes(const std::string &kernel_json_str) {
auto kernel_json = nlohmann::json::parse(kernel_json_str);
return DecodeFusedNodes(kernel_json);
}
bool AkgKernelJsonDecoder::DecodeSplitNodes(const nlohmann::json &kernel_json,
const std::map<std::string, AnfNodePtr> &address_node_map,
AnfNodePtrList *res_graphs) {
MS_EXCEPTION_IF_NULL(res_graphs);
MS_LOG(DEBUG) << "start decode, " << kernel_json;
// decode cnodes in graph.
std::vector<nlohmann::json> op_node_descs = kernel_json[kJsonKeyOpDesc];
if (op_node_descs.empty()) {
MS_LOG(ERROR) << "Error decode, no cnodes for graph." << kernel_json;
return false;
}
for (const auto &op_desc : op_node_descs) {
if (op_desc.find(kJsonKeyPtrAddress) == op_desc.end() || op_desc[kJsonKeyPtrAddress].is_null()) {
MS_LOG(ERROR) << "Decode failed, key: " << kJsonKeyPtrAddress << " not found in: " << op_desc;
return false;
}
std::string ptr_address = op_desc[kJsonKeyPtrAddress];
if (address_node_map.count(ptr_address) == 0) {
MS_LOG(ERROR) << "Decode failed, ptr_address not found in map.";
return false;
}
res_graphs->push_back(address_node_map.at(ptr_address));
}
MS_LOG(DEBUG) << "decode cnodes success, size: " << res_graphs->size();
return true;
}
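// Usage sketch: this pairs with an AkgKernelJsonGenerator configured with
// save_ptr_address = true, which records each node's address under "ptr_address";
// address_node_map (obtained from the generator) maps those strings back to AnfNodes.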
} // namespace kernel
} // namespace mindspore
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_JSON_DECODER_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_JSON_DECODER_H_
#include <string>
#include <vector>
#include <map>
#include <nlohmann/json.hpp>
#include "ir/scalar.h"
#include "ir/anf.h"
#include "ir/func_graph.h"
namespace mindspore {
namespace kernel {
class AkgKernelJsonDecoder {
public:
AkgKernelJsonDecoder() { nodes_map_.clear(); }
~AkgKernelJsonDecoder() = default;
FuncGraphPtr DecodeFusedNodes(const nlohmann::json &kernel_json);
FuncGraphPtr DecodeFusedNodes(const std::string &kernel_json_str);
bool DecodeSplitNodes(const nlohmann::json &kernel_json, const std::map<std::string, AnfNodePtr> &address_node_map,
AnfNodePtrList *res_graphs);
private:
ScalarPtr DecodeScalar(const nlohmann::json &scalar_json);
ValueNodePtr DecodeValueNode(const nlohmann::json &value_json, const FuncGraphPtr &func_graph);
ParameterPtr DecodeParameter(const nlohmann::json &parameter_json, const FuncGraphPtr &func_graph);
CNodePtr DecodeCNode(const nlohmann::json &cnode_json, const FuncGraphPtr &func_graph, const std::string &processor);
std::map<std::string, AnfNodePtr> nodes_map_{};
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_JSON_DECODER_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_JSON_GENERATOR_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_JSON_GENERATOR_H_
#include <unordered_map>
#include <string>
#include <memory>
#include <map>
#include <utility>
#include <mutex>
#include <vector>
#include <nlohmann/json.hpp>
#include "backend/kernel_compiler/oplib/oplib.h"
namespace mindspore {
namespace kernel {
// json key
constexpr auto kJsonKeyOpDesc = "op_desc";
constexpr auto kJsonKeyAttr = "attr";
constexpr auto kJsonKeyInputDesc = "input_desc";
constexpr auto kJsonKeyFormat = "format";
constexpr auto kJsonKeyInferDataType = "infer_data_type";
constexpr auto kJsonKeyInferShape = "infer_shape";
constexpr auto kJsonKeyShape = "shape";
constexpr auto kJsonKeyDataType = "data_type";
constexpr auto kJsonKeyOutputDesc = "output_desc";
constexpr auto kJsonKeyName = "name";
constexpr auto kJsonKeyTensorName = "tensor_name";
constexpr auto kJsonKeyValue = "value";
constexpr auto kJsonKeyImplPath = "impl_path";
constexpr auto kJsonKeyProcess = "process";
constexpr auto kJsonKeyComposite = "composite";
constexpr auto kJsonKeyId = "id";
constexpr auto kJsonKeyOp = "op";
constexpr auto kJsonKeyPtrAddress = "ptr_address";
constexpr auto kJsonKeyCompositeGraph = "composite_graph";
constexpr auto kJsonKeyPlatform = "platform";
constexpr auto kAttrInputNames = "input_names";
// dump option
struct DumpOption {
bool is_before_select_kernel = false;
bool save_ptr_address = false;
};
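// Usage sketch: to record node addresses in the dumped json (consumed by
// AkgKernelJsonDecoder::DecodeSplitNodes), construct the generator as:
//   DumpOption option;
//   option.save_ptr_address = true;
//   AkgKernelJsonGenerator generator(option);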
class AkgKernelJsonGenerator {
public:
AkgKernelJsonGenerator() { Clear(); }
explicit AkgKernelJsonGenerator(DumpOption dump_option) : dump_option_(dump_option) { Clear(); }
~AkgKernelJsonGenerator() = default;
bool CollectJson(const AnfNodePtr &anf_node, nlohmann::json *const kernel_json);
bool CollectFusedJson(const std::vector<AnfNodePtr> &anf_nodes, const std::vector<AnfNodePtr> &input_list,
const std::vector<AnfNodePtr> &output_list, nlohmann::json *const kernel_json);
bool CollectJson(const AnfNodePtr &anf_node);
bool CollectFusedJson(const std::vector<AnfNodePtr> &anf_nodes, const std::vector<AnfNodePtr> &input_list,
const std::vector<AnfNodePtr> &output_list);
bool GenerateSingleKernelJson(const AnfNodePtr &anf_node, nlohmann::json *const node_json);
std::string kernel_name() const { return kernel_name_; }
nlohmann::json kernel_json() const { return kernel_json_; }
std::string kernel_json_str() const { return kernel_json_.dump(); }
const std::vector<size_t> &input_size_list() const { return input_size_list_; }
const std::vector<size_t> &output_size_list() const { return output_size_list_; }
void Clear() {
input_tensor_idx_.clear();
address_node_map_.clear();
output_tensor_idx_ = 0;
}
void set_dump_option(DumpOption dump_option) { dump_option_ = dump_option; }
std::map<std::string, AnfNodePtr> address_node_map() { return address_node_map_; }
private:
bool CreateInputDescJson(const AnfNodePtr &anf_node, const std::shared_ptr<OpInfo> &op_info,
nlohmann::json *const inputs_json);
bool CreateOutputDescJson(const AnfNodePtr &anf_node, const std::shared_ptr<OpInfo> &op_info,
nlohmann::json *const outputs_json);
void GetJson(const AnfNodePtr &anf_node, const std::vector<int> &dyn_input_sizes,
const std::shared_ptr<OpAttr> &op_attr, nlohmann::json *const attr_json, const ValuePtr &attr_value);
bool CreateAttrDescJson(const AnfNodePtr &anf_node, const std::shared_ptr<OpInfo> &op_info,
nlohmann::json *const attrs_json);
bool GetIOSize(const nlohmann::json &node_json, std::vector<size_t> *const input_size,
std::vector<size_t> *const output_size);
int GetOpCntInc();
size_t GetInputTensorIdxInc(const AnfNodePtr &anf_node, size_t input_idx);
size_t GetOutputTensorIdxInc();
void SetTensorName(const std::string &tag, const std::string &new_name, const std::pair<size_t, size_t> &position,
nlohmann::json *const node_json);
std::string GetTensorName(const nlohmann::json &node_json, const std::string &tag,
const std::pair<size_t, size_t> &position);
TypeId GetInputDataType(const AnfNodePtr &anf_node, size_t real_index);
std::vector<size_t> GetInputShape(const AnfNodePtr &anf_node, size_t real_index);
std::string GetInputFormat(const AnfNodePtr &anf_node, size_t real_index);
TypeId GetOutputDataType(const AnfNodePtr &anf_node, size_t index);
std::vector<size_t> GetOutputShape(const AnfNodePtr &anf_node, size_t index);
std::string GetOutputFormat(const AnfNodePtr &anf_node, size_t index);
DumpOption dump_option_;
static int op_cnt_;
  // lock guarding the shared op counter op_cnt_ above
static std::mutex op_cnt_mtx_;
std::string kernel_name_;
std::unordered_map<AnfNodePtr, size_t> input_tensor_idx_;
size_t output_tensor_idx_;
nlohmann::json kernel_json_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::map<std::string, AnfNodePtr> address_node_map_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_JSON_GENERATOR_H_
......@@ -29,6 +29,7 @@
#include "backend/kernel_compiler/common_utils.h"
#include "backend/kernel_compiler/tbe/tbe_utils.h"
#include "backend/kernel_compiler/akg/ascend/akg_ascend_kernel_mod.h"
#include "backend/kernel_compiler/akg/akg_kernel_json_generator.h"
#include "backend/kernel_compiler/akg/akg_kernel_attrs_process.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/session/kernel_build_client.h"
......@@ -38,287 +39,37 @@ namespace kernel {
constexpr int32_t PROCESS_NUM = 16;
constexpr int32_t TIME_OUT = 300;
constexpr auto kOpDesc = "op_desc";
constexpr auto kShape = "shape";
constexpr auto kDataType = "data_type";
constexpr auto kInputDesc = "input_desc";
constexpr auto kOutputDesc = "output_desc";
constexpr auto kTensorName = "tensor_name";
namespace {
void UpdateTensorNameInJson(const std::vector<AnfNodePtr> &anf_nodes,
std::map<AnfNodePtr, nlohmann::json> *node_json_map) {
for (auto const &anf_node : anf_nodes) {
std::vector<int> dyn_input_sizes;
auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
MS_EXCEPTION_IF_NULL(primitive);
if (primitive->GetAttr(kAttrDynInputSizes) != nullptr) {
dyn_input_sizes = GetValue<const std::vector<int>>(primitive->GetAttr(kAttrDynInputSizes));
}
bool is_dynamic_input = !dyn_input_sizes.empty();
size_t input_num = is_dynamic_input ? dyn_input_sizes.size() : AnfAlgo::GetInputTensorNum(anf_node);
size_t real_input_index = 0;
for (size_t i = 0; i < input_num; ++i) {
size_t input_tensor_num = is_dynamic_input ? IntToSize(dyn_input_sizes[i]) : 1;
for (size_t j = 0; j < input_tensor_num; ++j) {
auto tmp_input = GetKernelInput(anf_node, real_input_index);
std::string tensor_name = GetTensorName((*node_json_map)[anf_node], kInputDesc, std::make_pair(i, j));
if (node_json_map->find(tmp_input.first) != node_json_map->end()) {
std::string new_tensor_name =
GetTensorName((*node_json_map)[tmp_input.first], kOutputDesc, std::make_pair(0, tmp_input.second));
SetTensorName(kInputDesc, new_tensor_name, std::make_pair(i, j), &((*node_json_map)[anf_node]));
MS_LOG(DEBUG) << "Update [" << real_input_index << "] input [" << tensor_name << "] of ["
<< anf_node->fullname_with_scope() << "] to [" << tmp_input.second << "] output ["
<< new_tensor_name << "] of [" << tmp_input.first->fullname_with_scope() << "].";
} else {
MS_LOG(DEBUG) << "[" << real_input_index << "] input " << tensor_name << "] of ["
<< anf_node->fullname_with_scope() << "] is out input.";
}
real_input_index++;
}
}
}
}
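// Example: if node B's j-th input is produced by node A's k-th output, B's input
// tensor name (e.g. "input_2", hypothetical) is rewritten to A's output tensor name
// (e.g. "output_0_k"), stitching the per-op jsons into one fused graph.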
nlohmann::json GetInputsJson(const std::vector<AnfNodePtr> &anf_nodes, const std::vector<AnfNodePtr> &input_list,
std::map<AnfNodePtr, nlohmann::json> *node_json_map) {
nlohmann::json inputs_json;
auto input_index = GetInputIndex(anf_nodes, input_list);
for (size_t i = 0; i < input_index.size(); ++i) {
auto tmp_input = input_index[i];
auto type_id = AnfAlgo::GetInputDeviceDataType(tmp_input.first, tmp_input.second.first);
std::string dtype = TypeId2String(type_id);
nlohmann::json input_desc_json;
input_desc_json[kTensorName] = GetTensorName((*node_json_map)[tmp_input.first], kInputDesc, tmp_input.second);
input_desc_json[kDataType] = dtype;
input_desc_json[kShape] = AnfAlgo::GetInputDeviceShape(tmp_input.first, tmp_input.second.first);
inputs_json.emplace_back(std::vector<nlohmann::json>{input_desc_json});
}
return inputs_json;
}
nlohmann::json GetOutputsJson(const std::vector<AnfNodePtr> &anf_nodes, const std::vector<AnfNodePtr> &input_list,
const std::vector<AnfNodePtr> &output_list, const nlohmann::json &inputs_json,
std::map<AnfNodePtr, nlohmann::json> *node_json_map) {
nlohmann::json outputs_json;
auto output_index = GetOutputIndex(anf_nodes, input_list, output_list);
for (size_t i = 0; i < output_index.size(); ++i) {
auto tmp_output = output_index[i];
bool found = false;
nlohmann::json output_desc_json;
for (size_t input_i = 0; input_i < input_list.size(); ++input_i) {
if (tmp_output.first == input_list[input_i]) {
output_desc_json = inputs_json[input_i][0];
found = true;
break;
}
}
if (!found) {
auto type_id = AnfAlgo::GetOutputDeviceDataType(tmp_output.first, tmp_output.second);
std::string dtype = TypeId2String(type_id);
output_desc_json[kTensorName] =
GetTensorName((*node_json_map)[tmp_output.first], kOutputDesc, std::make_pair(0, tmp_output.second));
output_desc_json[kDataType] = dtype;
auto output_shape = AnfAlgo::GetOutputDeviceShape(tmp_output.first, tmp_output.second);
if (output_shape.empty()) {
output_shape.push_back(1);
}
output_desc_json[kShape] = output_shape;
}
outputs_json.emplace_back(output_desc_json);
}
return outputs_json;
}
std::pair<std::vector<std::string>, std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>>> PreProcessJsonForBuild(
const std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> &build_args) {
bool AkgAscendKernelBuilder::AkgOpParallelBuild(
const std::vector<std::pair<AkgKernelJsonGenerator, AnfNodePtr>> &build_args) {
  // Remove cached nodes, gather unique nodes, and collect repeated nodes which need post-processing.
std::vector<std::string> jsons;
std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> repeat_nodes;
std::unordered_set<std::string> json_name_set;
for (const auto &[builder, anf_node] : build_args) {
std::unordered_set<std::string> kernel_name_set;
std::vector<std::pair<AkgKernelJsonGenerator, AnfNodePtr>> repeat_nodes;
for (const auto &[json_generator, anf_node] : build_args) {
MS_EXCEPTION_IF_NULL(anf_node);
auto json_name = builder.json_name();
MS_LOG(DEBUG) << "Akg start compile op: " << json_name;
auto cached_kernel_pack = tbe::TbeUtils::SearchCache(json_name, AkgKernelBuild::GetProcessor(anf_node));
auto kernel_name = json_generator.kernel_name();
MS_LOG(DEBUG) << "Akg start compile op: " << kernel_name;
auto cached_kernel_pack = tbe::TbeUtils::SearchCache(kernel_name, GetProcessorStr(anf_node));
if (cached_kernel_pack != nullptr) {
MS_LOG(DEBUG) << "Use cached kernel, json_name_[" << json_name << "], fullname_with_scope["
MS_LOG(DEBUG) << "Use cached kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
<< anf_node->fullname_with_scope() << "].";
auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(cached_kernel_pack);
kernel_mod_ptr->SetInputSizeList(builder.input_size_list());
kernel_mod_ptr->SetOutputSizeList(builder.output_size_list());
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
continue;
}
if (json_name_set.count(json_name) != 0) {
repeat_nodes.push_back({builder, anf_node});
if (kernel_name_set.count(kernel_name) != 0) {
repeat_nodes.push_back({json_generator, anf_node});
continue;
}
json_name_set.insert(json_name);
auto node_json = builder.kernel_json();
kernel::SaveJsonInfo(json_name, node_json);
jsons.push_back(node_json);
}
return std::make_pair(jsons, repeat_nodes);
}
bool PostProcessAfterCompile(const std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> &build_args,
const std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> &repeat_nodes) {
for (const auto &[builder, anf_node] : build_args) {
auto json_name = builder.json_name();
auto new_kernel_pack = tbe::TbeUtils::InsertCache(json_name, AkgKernelBuild::GetProcessor(anf_node));
if (new_kernel_pack == nullptr) {
MS_LOG(ERROR) << "Insert to cache failed, json_name_[" << json_name << "], fullname_with_scope["
<< anf_node->fullname_with_scope() << "].";
return false;
}
auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(new_kernel_pack);
kernel_mod_ptr->SetInputSizeList(builder.input_size_list());
kernel_mod_ptr->SetOutputSizeList(builder.output_size_list());
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
MS_LOG(DEBUG) << "Akg compile " << json_name << " kernel and insert cache successfully!";
}
for (const auto &[builder, anf_node] : repeat_nodes) {
auto node_json = builder.kernel_json();
auto json_name = builder.json_name();
auto cached_kernel_pack = tbe::TbeUtils::SearchCache(json_name, AkgKernelBuild::GetProcessor(anf_node));
if (cached_kernel_pack == nullptr) {
return false;
}
MS_LOG(INFO) << "Use just compiled kernel, json_name_[" << json_name << "], fullname_with_scope["
<< anf_node->fullname_with_scope() << "].";
auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(cached_kernel_pack);
kernel_mod_ptr->SetInputSizeList(builder.input_size_list());
kernel_mod_ptr->SetOutputSizeList(builder.output_size_list());
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
}
return true;
}
} // namespace
bool AkgAscendKernelBuilder::CollectJson(const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
std::string op_name = AnfAlgo::GetCNodeName(anf_node);
MS_LOG(INFO) << "AKG start compile, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) << "]";
auto it = kAkgKernelAttrsProcessMap.find(op_name);
if (it != kAkgKernelAttrsProcessMap.end()) {
it->second(anf_node);
}
MS_LOG(INFO) << "Akg start compile, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) << "]";
nlohmann::json node_json;
if (!GenerateSingleKernelJson(anf_node, op_name, &node_json)) {
MS_LOG(ERROR) << "Op[" << op_name << "] create single kernel json failed.";
}
kernel_json_ = node_json.dump();
if (!GetIOSize(node_json, &input_size_list_, &output_size_list_)) {
MS_LOG(ERROR) << "Cal mem size failed.";
return false;
kernel_name_set.insert(kernel_name);
auto kernel_json = json_generator.kernel_json_str();
kernel::SaveJsonInfo(kernel_name, kernel_json);
jsons.push_back(kernel_json);
}
return true;
}
bool AkgAscendKernelBuilder::GenJsonAndPreprocess4Fused(const std::vector<AnfNodePtr> &anf_nodes,
std::map<AnfNodePtr, nlohmann::json> *node_json_map) {
for (auto const &anf_node : anf_nodes) {
MS_EXCEPTION_IF_NULL(anf_node);
std::string op_name = AnfAlgo::GetCNodeName(anf_node);
if (!AnfAlgo::IsRealKernel(anf_node)) {
MS_LOG(ERROR) << "Invalid anf node to build [" << anf_node->fullname_with_scope() << "].";
return false;
}
auto it = kAkgKernelAttrsProcessMap.find(op_name);
if (it != kAkgKernelAttrsProcessMap.end()) {
it->second(anf_node);
}
nlohmann::json node_json;
if (!GenerateSingleKernelJson(anf_node, op_name, &node_json)) {
MS_LOG(ERROR) << "Op [" << op_name << "] create single kernel json failed.";
return false;
}
// No need for composite op.
node_json.erase("id");
node_json.erase("op");
node_json.erase("composite");
auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
MS_EXCEPTION_IF_NULL(primitive);
if (primitive->GetAttr("fusion") != nullptr) {
node_json["fusion"] = primitive->GetAttr("fusion")->ToString();
}
(*node_json_map)[anf_node] = node_json;
}
return true;
}
bool AkgAscendKernelBuilder::CollectFusedJson(const std::vector<AnfNodePtr> &anf_nodes,
const std::vector<AnfNodePtr> &input_list,
const std::vector<AnfNodePtr> &output_list) {
if (anf_nodes.empty() || input_list.empty()) {
MS_LOG(ERROR) << "Invalid input size, anf_nodes [" << anf_nodes.size() << "], input_list [" << input_list.size()
<< "].";
return false;
}
MS_LOG(INFO) << "anf_nodes [" << output_list.size() << "], input_list [" << anf_nodes.size() << "], output_list ["
<< input_list.size() << "].";
std::map<AnfNodePtr, nlohmann::json> node_json_map;
if (!GenJsonAndPreprocess4Fused(anf_nodes, &node_json_map)) {
return false;
}
UpdateTensorNameInJson(anf_nodes, &node_json_map);
nlohmann::json fused_node_json;
std::vector<nlohmann::json> node_json_desc;
std::transform(anf_nodes.begin(), anf_nodes.end(), std::back_inserter(node_json_desc),
[&node_json_map](const AnfNodePtr &anf_node) { return node_json_map[anf_node]; });
fused_node_json[kOpDesc] = node_json_desc;
fused_node_json[kInputDesc] = GetInputsJson(anf_nodes, input_list, &node_json_map);
fused_node_json[kOutputDesc] =
GetOutputsJson(anf_nodes, input_list, output_list, fused_node_json[kInputDesc], &node_json_map);
size_t hash_id = std::hash<std::string>()(fused_node_json.dump());
json_name_ = "Fused_";
auto fg = anf_nodes[0]->func_graph();
MS_EXCEPTION_IF_NULL(fg);
auto attr_val = fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL);
if (attr_val != nullptr) {
auto fg_attr = GetValue<std::string>(attr_val);
(void)json_name_.append(fg_attr).append("_");
}
(void)json_name_.append(std::to_string(hash_id));
fused_node_json["composite_graph"] = fg->ToString();
fused_node_json["op"] = json_name_;
fused_node_json["platform"] = "AKG";
fused_node_json["process"] = "aicore";
fused_node_json["composite"] = true;
kernel_json_ = fused_node_json.dump();
if (!GetIOSize(fused_node_json, &input_size_list_, &output_size_list_)) {
MS_LOG(ERROR) << "Cal mem size failed.";
return false;
}
return true;
}
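// Example of the json_name_ composed above (attr and hash suffix hypothetical):
//   json_name_ == "Fused_Cast_Mul_1412738491646133400"
// where "Cast_Mul" comes from the graph kernel attr and the suffix is the
// std::hash of the dumped fused json.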
bool AkgOpParallelBuild(const std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> &build_args) {
auto [jsons, repeat_nodes] = PreProcessJsonForBuild(build_args);
if (jsons.empty()) {
return true;
}
......@@ -337,18 +88,43 @@ bool AkgOpParallelBuild(const std::vector<std::pair<AkgAscendKernelBuilder, AnfN
return false;
}
if (!PostProcessAfterCompile(build_args, repeat_nodes)) {
return false;
  // All unique kernels are compiled at this point; insert them into the cache and set the kernel mod.
for (const auto &[json_generator, anf_node] : build_args) {
auto kernel_name = json_generator.kernel_name();
auto new_kernel_pack = tbe::TbeUtils::InsertCache(kernel_name, GetProcessorStr(anf_node));
if (new_kernel_pack == nullptr) {
MS_LOG(ERROR) << "Insert to cache failed, kernel_name[" << kernel_name << "], fullname_with_scope["
<< anf_node->fullname_with_scope() << "].";
return false;
}
auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(new_kernel_pack);
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
MS_LOG(DEBUG) << "Akg compile " << kernel_name << " kernel and insert cache successfully!";
}
// Handle repeated nodes.
for (const auto &[json_generator, anf_node] : repeat_nodes) {
auto kernel_name = json_generator.kernel_name();
auto cached_kernel_pack = tbe::TbeUtils::SearchCache(kernel_name, GetProcessorStr(anf_node));
if (cached_kernel_pack == nullptr) return false;
MS_LOG(INFO) << "Use just compiled kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
<< anf_node->fullname_with_scope() << "].";
auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(cached_kernel_pack);
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
}
return true;
}
bool AkgAscendKernelParallelBuild(const std::vector<AnfNodePtr> &anf_nodes) {
std::vector<std::pair<AkgKernelJsonGenerator, AnfNodePtr>> json_and_node;
for (const auto &anf_node : anf_nodes) {
MS_EXCEPTION_IF_NULL(anf_node);
AkgKernelJsonGenerator akg_kernel_json_generator;
KernelPackPtr kernel_pack = nullptr;
auto cnode = anf_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
......@@ -363,18 +139,17 @@ bool AkgAscendKernelParallelBuild(const std::vector<AnfNodePtr> &anf_nodes) {
std::vector<AnfNodePtr> node_list;
std::vector<AnfNodePtr> input_list;
std::vector<AnfNodePtr> output_list;
MS_LOG(INFO) << "Akg start compile composite op[" << anf_node->fullname_with_scope() << "]";
GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list);
if (!akg_kernel_json_generator.CollectFusedJson(node_list, input_list, output_list)) {
MS_EXCEPTION(UnknownError) << "Akg build failed composite op[" << anf_node->fullname_with_scope() << "].";
}
} else {
if (!akg_kernel_json_generator.CollectJson(anf_node)) {
MS_EXCEPTION(UnknownError) << "Akg build failed op[" << anf_node->fullname_with_scope() << "].";
}
}
json_and_node.push_back({akg_kernel_json_generator, anf_node});
}
if (json_and_node.empty()) {
......@@ -382,7 +157,8 @@ bool AkgAscendKernelParallelBuild(const std::vector<AnfNodePtr> &anf_nodes) {
return true;
}
AkgAscendKernelBuilder akg_ascend_kernel_builder;
return akg_ascend_kernel_builder.AkgOpParallelBuild(json_and_node);
}
} // namespace kernel
} // namespace mindspore
......@@ -18,35 +18,21 @@
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_
#include <string>
#include <memory>
#include <utility>
#include <vector>
#include <map>
#include "ir/anf.h"
#include "backend/kernel_compiler/kernel.h"
#include "backend/kernel_compiler/akg/akg_kernel_build.h"
#include "backend/kernel_compiler/akg/akg_kernel_json_generator.h"
namespace mindspore {
namespace kernel {
class AkgAscendKernelBuilder {
public:
AkgAscendKernelBuilder() = default;
~AkgAscendKernelBuilder() = default;
bool AkgOpParallelBuild(const std::vector<std::pair<AkgKernelJsonGenerator, AnfNodePtr>> &build_args);
};
bool AkgAscendKernelParallelBuild(const std::vector<AnfNodePtr> &anf_nodes);
......
......@@ -15,29 +15,116 @@
*/
#include "backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h"
#include <Python.h>
#include <vector>
#include <memory>
#include <string>
#include "backend/kernel_compiler/kernel.h"
#include "backend/kernel_compiler/akg/akg_kernel_build.h"
#include "backend/kernel_compiler/common_utils.h"
#include "backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.h"
#include "utils/ms_utils.h"
#include "backend/kernel_compiler/akg/akg_kernel_json_generator.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/session/kernel_build_client.h"
namespace mindspore {
namespace kernel {
constexpr int32_t ARGS_SIZE = 1;
constexpr auto kCompileWithJsonFunc = "compilewithjson";
KernelPackPtr AkgGpuKernelBuilder::OpBuild(const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
auto processor = GetProcessorStr(anf_node);
auto kernel_name = json_generator.kernel_name();
auto cached_kernel_pack = SearchCache(kernel_name, processor);
if (cached_kernel_pack != nullptr) {
MS_LOG(INFO) << "Use cached kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
<< anf_node->fullname_with_scope() << "].";
return cached_kernel_pack;
}
(void)alarm(AUTODIFF_COMPILE_OVERTIME);
auto kernel_json = json_generator.kernel_json_str();
auto res = GpuKernelBuildClient::Instance().AkgCompileSingle(kernel_json);
(void)alarm(0);
if (!res) {
MS_LOG(ERROR) << "Akg compile failed, json: " << kernel_json;
return nullptr;
}
auto new_kernel_pack = InsertCache(kernel_name, processor);
kernel::SaveJsonInfo(kernel_name, kernel_json, kernel::KernelMeta::GetInstance()->kernel_meta_path());
if (new_kernel_pack == nullptr) {
MS_LOG(ERROR) << "Insert to cache failed, kernel_name[" << kernel_name << "], fullname_with_scope["
<< anf_node->fullname_with_scope() << "].";
return nullptr;
}
return new_kernel_pack;
}
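// Build flow sketch: on a cache miss the kernel json is handed to the compile
// server (GpuKernelBuildClient::AkgCompileSingle) under an alarm() watchdog of
// AUTODIFF_COMPILE_OVERTIME seconds, and the compiled pack is then inserted into
// the kernel_meta cache next to the saved .info file.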
KernelModPtr AkgGpuKernelBuilder::BuildByJson(const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
MS_LOG(INFO) << "Akg start compile, op[" << anf_node->fullname_with_scope() << "]";
AkgKernelJsonGenerator json_generator;
if (!json_generator.CollectJson(anf_node)) {
MS_LOG(ERROR) << "Op[" << anf_node->fullname_with_scope() << "] create single kernel json failed.";
}
auto kernel_pack = OpBuild(json_generator, anf_node);
if (kernel_pack == nullptr) {
MS_LOG(ERROR) << "Akg build failed op[" << anf_node->fullname_with_scope() << "].";
return nullptr;
}
auto kernel_mod_ptr = std::make_shared<GpuKernelMod>(kernel_pack);
MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
MS_LOG(INFO) << "Akg compile success, op[" << anf_node->fullname_with_scope() << "]";
return kernel_mod_ptr;
}
KernelModPtr AkgGpuKernelBuilder::FuseByJson(const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
MS_LOG(INFO) << "Akg start compile, graph_kernel[" << anf_node->fullname_with_scope() << "]";
auto fg = AnfAlgo::GetCNodeFuncGraphPtr(anf_node);
MS_EXCEPTION_IF_NULL(fg);
auto mng = fg->manager();
if (mng == nullptr) {
mng = Manage(fg, true);
fg->set_manager(mng);
}
AnfNodePtrList node_list;
AnfNodePtrList input_list;
AnfNodePtrList output_list;
GetValidKernelNodes(fg, &node_list, &input_list, &output_list);
AkgKernelJsonGenerator json_generator;
if (!json_generator.CollectFusedJson(node_list, input_list, output_list)) {
MS_LOG(ERROR) << "Op[" << anf_node->fullname_with_scope() << "] create single kernel json failed.";
}
auto kernel_pack = OpBuild(json_generator, anf_node);
if (kernel_pack == nullptr) {
MS_LOG(ERROR) << "Akg build failed, graph_kernel[" << anf_node->fullname_with_scope() << "].";
return nullptr;
}
auto kernel_mod_ptr = std::make_shared<GpuKernelMod>(kernel_pack);
MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
MS_LOG(INFO) << "Akg compile success, graph_kernel[" << anf_node->fullname_with_scope() << "]";
return kernel_mod_ptr;
}
KernelModPtr AkgGpuKernelBuild(const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
AkgGpuKernelBuilder akg_gpu_kernel_builder;
if (AnfAlgo::IsGraphKernel(anf_node)) {
return akg_gpu_kernel_builder.FuseByJson(anf_node);
}
return akg_gpu_kernel_builder.BuildByJson(anf_node);
}
} // namespace kernel
} // namespace mindspore
......@@ -16,11 +16,25 @@
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_GPU_AKG_GPU_KERNEL_BUILD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_GPU_AKG_GPU_KERNEL_BUILD_H_
#include <string>
#include "backend/kernel_compiler/kernel.h"
#include "backend/kernel_compiler/akg/akg_kernel_json_generator.h"
#include "base/base.h"
namespace mindspore {
namespace kernel {
class AkgGpuKernelBuilder {
public:
AkgGpuKernelBuilder() = default;
~AkgGpuKernelBuilder() = default;
KernelModPtr BuildByJson(const AnfNodePtr &anf_node);
KernelModPtr FuseByJson(const AnfNodePtr &anf_node);
private:
KernelPackPtr OpBuild(const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node);
};
KernelModPtr AkgGpuKernelBuild(const AnfNodePtr &anf_node);
} // namespace kernel
} // namespace mindspore
......
......@@ -205,10 +205,13 @@ TypeId DtypeToTypeId(const std::string &dtypes) {
}
}
std::string TypeId2String(TypeId type_id, bool unknown_as_default) {
auto iter = type_id_str_map.find(type_id);
if (iter == type_id_str_map.end()) {
if (!unknown_as_default) {
MS_EXCEPTION(ArgumentError) << "Illegal input dtype." << TypeIdLabel(type_id);
}
return "float32";
}
return iter->second;
}
......@@ -427,9 +430,9 @@ bool ParseMetadata(const CNodePtr &kernel_node, const std::shared_ptr<const OpIn
return true;
}
void SaveJsonInfo(const std::string &json_name, const std::string &info, const std::string &base_path) {
char real_path[PATH_MAX] = {0};
std::string path = base_path + json_name + kInfoSuffix;
if (path.size() > PATH_MAX) {
MS_LOG(DEBUG) << "file path " << path << " is too long.";
return;
......@@ -458,6 +461,14 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info) {
}
}
Processor GetProcessor(const string &processor) {
if (processor == kProcessorAiCore) return Processor::AICORE;
if (processor == kProcessorAiCpu) return Processor::AICPU;
if (processor == kProcessorCuda) return Processor::CUDA;
MS_LOG(DEBUG) << "Unknown processor type.";
return Processor::UNKNOWN;
}
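// Example: GetProcessor(kProcessorCuda) yields Processor::CUDA; any unrecognized
// string falls through to Processor::UNKNOWN.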
std::string GetProcessor(const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
std::string device;
......@@ -628,16 +639,21 @@ void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector<AnfNodePtr>
void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector<AnfNodePtr> *node_list,
std::vector<AnfNodePtr> *input_list, std::vector<AnfNodePtr> *output_list) {
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(node_list);
MS_EXCEPTION_IF_NULL(input_list);
MS_EXCEPTION_IF_NULL(output_list);
GetValidKernelNodes(func_graph, node_list);
auto parameters = func_graph->parameters();
input_list->insert(input_list->begin(), parameters.begin(), parameters.end());
GetFuncGraphOutputNodes(func_graph, output_list);
}
void GetFuncGraphOutputNodes(const FuncGraphPtr &func_graph, std::vector<AnfNodePtr> *output_list) {
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(output_list);
auto func_output = func_graph->output();
MS_EXCEPTION_IF_NULL(func_output);
if (func_output->isa<CNode>()) {
......@@ -780,5 +796,36 @@ std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode) {
AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue(axis), cnode);
return axis;
}
std::string GetProcessorStr(const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
std::string processor = kProcessorUnknown;
auto kernel_info = dynamic_cast<device::KernelInfo *>(anf_node->kernel_info());
MS_EXCEPTION_IF_NULL(kernel_info);
auto build_info = kernel_info->select_kernel_build_info();
// we may call this before kernel select.
if (build_info == nullptr) {
return processor;
}
switch (build_info->processor()) {
case Processor::AICORE:
processor = kProcessorAiCore;
break;
case Processor::AICPU:
processor = kProcessorAiCpu;
break;
case Processor::CUDA:
processor = kProcessorCuda;
break;
default:
MS_LOG(ERROR) << "Unknown processor type.";
break;
}
return processor;
}
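// GetProcessorStr is the string-valued inverse of GetProcessor(const string &)
// above: e.g. a node whose kernel build info selects Processor::CUDA yields
// kProcessorCuda ("cuda"), the processor tag used by SearchCache/InsertCache.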
} // namespace kernel
} // namespace mindspore
......@@ -23,6 +23,7 @@
#include <unordered_set>
#include <map>
#include <string>
#include <algorithm>
#include <iterator>  // std::ostream_iterator, used by Vector2Str below
#include <sstream>   // std::ostringstream, used by Vector2Str below
#include <vector>
#include <utility>
#include <nlohmann/json.hpp>
......@@ -37,6 +38,7 @@ constexpr auto kGpuKernelMeta = "./cuda_meta";
constexpr auto kProcessorAiCore = "aicore";
constexpr auto kProcessorAiCpu = "aicpu";
constexpr auto kProcessorCuda = "cuda";
constexpr auto kProcessorUnknown = "unknown";
constexpr auto kJsonSuffix = ".json";
constexpr auto kInfoSuffix = ".info";
constexpr unsigned int AUTODIFF_COMPILE_OVERTIME = 600;
......@@ -76,12 +78,13 @@ KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &pro
KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &processor);
TypeId DtypeToTypeId(const std::string &dtypes);
std::string Dtype2ShortType(const std::string &dtypes);
std::string TypeId2String(TypeId type_id, bool unknown_as_default = false);
size_t GetDtypeNbyte(const std::string &dtypes);
bool ParseMetadata(const CNodePtr &kernel_node, const std::shared_ptr<const OpInfo> &op_info_ptr, Processor processor,
std::vector<std::shared_ptr<KernelBuildInfo>> *const kernel_info_list);
void SaveJsonInfo(const std::string &json_name, const std::string &info, const std::string &base_path = kCceKernelMeta);
std::string GetProcessor(const AnfNodePtr &anf_node);
Processor GetProcessor(const string &processor);
bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
int Sign(float x);
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
......@@ -90,13 +93,26 @@ std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(cons
std::vector<std::pair<AnfNodePtr, size_t>> GetOutputIndex(const std::vector<AnfNodePtr> &node_list,
const std::vector<AnfNodePtr> &input_list,
const std::vector<AnfNodePtr> &output_list);
void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector<AnfNodePtr> *node_list);
void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector<AnfNodePtr> *node_list,
std::vector<AnfNodePtr> *input_list, std::vector<AnfNodePtr> *output_list);
void GetFuncGraphOutputNodes(const FuncGraphPtr &func_graph, std::vector<AnfNodePtr> *output_list);
bool GetInputTensorValue(const AnfNodePtr &anf_node, size_t input_idx, nlohmann::json *const node_json);
void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<AnfNodePtr, size_t>> *node_list);
bool IsWeightBoundary(const AnfNodePtr &node);
std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
std::string GetProcessorStr(const AnfNodePtr &anf_node);
template <typename T>
inline std::string Vector2Str(const std::vector<T> &inputs) {
if (!inputs.empty()) {
std::ostringstream oss;
(void)std::copy(inputs.begin(), inputs.end() - 1, std::ostream_iterator<T>(oss, ", "));
oss << inputs.back();
return oss.str();
}
return "";
}
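// Example: Vector2Str(std::vector<int>{2, 3, 5}) returns "2, 3, 5"; an empty
// vector returns "".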
} // namespace kernel
} // namespace mindspore
......
......@@ -16,14 +16,12 @@
#include <unistd.h>
#include <fstream>
#include "backend/kernel_compiler/kernel.h"
#include "backend/kernel_compiler/akg/akg_kernel_build.h"
#include "nlohmann/json.hpp"
#include "securec/include/securec.h"
#include "utils/log_adapter.h"
#include "utils/convert_utils.h"
#include "utils/system/sha256.h"
#include "backend/kernel_compiler/common_utils.h"
namespace mindspore {
namespace kernel {
namespace {
......
......@@ -49,6 +49,7 @@ enum OpPattern {
// Backend processor
enum Processor {
UNKNOWN = -1,
AICORE = 0,
AICPU,
CUDA,
......
......@@ -5,13 +5,19 @@ file(GLOB_RECURSE _PREACTIVATE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
)
if (ENABLE_D)
file(GLOB_RECURSE _D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ascend/*.cc")
list(APPEND _PREACTIVATE_SRC_LIST ${_D_SRC_LIST})
file(GLOB_RECURSE _D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"ascend/*.cc"
"graph_kernel/*.cc"
)
list(APPEND _PREACTIVATE_SRC_LIST ${_D_SRC_LIST})
endif ()
if (ENABLE_GPU)
file(GLOB_RECURSE _GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc")
list(APPEND _PREACTIVATE_SRC_LIST ${_GPU_SRC_LIST})
file(GLOB_RECURSE _GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"gpu/*.cc"
"graph_kernel/*.cc"
)
list(APPEND _PREACTIVATE_SRC_LIST ${_GPU_SRC_LIST})
endif ()
set_property(SOURCE ${_PREACTIVATE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PRE_ACT)
......
......@@ -14,6 +14,8 @@
* limitations under the License.
*/
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
#include <algorithm>
#include <list>
#include <memory>
#include <string>
#include "backend/optimizer/common/optimizer.h"
......@@ -68,8 +70,6 @@
#include "backend/optimizer/ascend/format_type/convert_unsupported_transnode_to_aicpu.h"
#include "backend/optimizer/pass/eliminate_redundant_op.h"
#include "backend/optimizer/pass/common_subexpression_elimination.h"
#include "backend/optimizer/pass/fuse_graph_kernel.h"
#include "backend/optimizer/pass/fuse_basic.h"
#include "backend/optimizer/pass/add_atomic_clean.h"
#include "backend/optimizer/ascend/format_type/merge_cast_to_op.h"
#include "backend/optimizer/ascend/format_type/check_consistency.h"
......@@ -106,6 +106,8 @@
#include "backend/optimizer/ascend/ir_fission/pack_fission.h"
#include "backend/optimizer/ascend/enhancer/concat_outputs_for_all_gather.h"
#include "utils/ms_context.h"
#include "backend/optimizer/graph_kernel/composite_ops_fusion.h"
#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
#include "utils/config_manager.h"
#include "debug/anf_ir_dump.h"
#include "debug/dump_proto.h"
......@@ -406,7 +408,7 @@ void AscendBackendGraphKernelOpt(const std::shared_ptr<session::KernelGraph> &ke
}
// Fuse graph kernels with basic ops
static_cast<void>(FuseCompositeOps(kernel_graph, is_before_kernel_select));
if (save_graphs) {
std::string file_path = save_graphs_path + "/" + "hwopt_d_graph_kernel_opt_end_graph_" +
......@@ -429,17 +431,17 @@ void AscendBackendFuseBasicOpt(const std::shared_ptr<session::KernelGraph> &kern
save_graphs_path = ".";
}
if (save_graphs) {
std::string file_path = save_graphs_path + "/" + "hwopt_d_fuse_basic_opt_before_graph_" +
std::string file_path = save_graphs_path + "/" + "hwopt_fuse_basic_opt_before_graph_" +
std::to_string(!is_before_kernel_select) + "_" + std::to_string(kernel_graph->graph_id()) +
".ir";
DumpIR(file_path, kernel_graph, true);
}
// Fuse basic ops with basic ops
static_cast<void>(FuseBasicOps(kernel_graph, is_before_kernel_select));
if (save_graphs) {
std::string file_path = save_graphs_path + "/" + "hwopt_d_fuse_basic_opt_end_graph_" +
std::string file_path = save_graphs_path + "/" + "hwopt_fuse_basic_opt_end_graph_" +
std::to_string(!is_before_kernel_select) + "_" + std::to_string(kernel_graph->graph_id()) +
".ir";
DumpIR(file_path, kernel_graph, true);
......
......@@ -601,6 +601,7 @@ void ConstInputToAttr(const CNodePtr &cnode, const std::unordered_set<size_t> &i
std::vector<std::string> new_input_names;
auto primitive = AnfAlgo::GetCNodePrimitive(cnode);
MS_EXCEPTION_IF_NULL(primitive);
primitive = primitive->Clone();
auto input_names = primitive->GetAttr(kAttrInputNames);
if (input_names == nullptr) {
MS_LOG(DEBUG) << "input_names are nullptr in cnode[" + cnode->DebugString() + "]";
......@@ -631,6 +632,7 @@ void ConstInputToAttr(const CNodePtr &cnode, const std::unordered_set<size_t> &i
}
if (need_update) {
// Update cnode's inputs
new_inputs[0] = NewValueNode(primitive);
cnode->set_inputs(new_inputs);
// Update cnode's input_names attr
primitive->set_attr(kAttrInputNames, MakeValue(new_input_names));
......
......@@ -73,7 +73,7 @@ bool PassManager::Run(const FuncGraphPtr &func_graph, const std::vector<PassPtr>
if (save_graphs) {
auto dump_file_path =
save_graphs_path + "/" + "hwopt_" + name() + "_" + std::to_string(num) + "_" + pass->name() + ".ir";
DumpIR(dump_file_path, func_graph, true);
}
num++;
}
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
......@@ -14,8 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/pass/fuse_basic.h"
#include "backend/optimizer/pass/fuse_graph_kernel.h"
#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
#include <memory>
#include <algorithm>
......@@ -31,17 +29,30 @@
#include "vm/segment_runner.h"
#include "debug/anf_ir_dump.h"
#include "ir/func_graph_cloner.h"
#include "backend/optimizer/graph_kernel/composite_ops_fusion.h"
#include "backend/optimizer/graph_kernel/graph_kernel_helper.h"
namespace mindspore {
namespace opt {
namespace {
bool IsBasicOp(const AnfNodePtr &node, bool is_before_kernel_select) {
#if ENABLE_D
std::vector<PrimitivePtr> fusable_basic_ops = {prim::kPrimTensorAdd, prim::kPrimMul, prim::kPrimSub,
prim::kPrimExpandDims};
if (!is_before_kernel_select) {
fusable_basic_ops.push_back(prim::kPrimCast);
}
#elif ENABLE_GPU
std::vector<PrimitivePtr> fusable_basic_ops = {
prim::kPrimAbs, prim::kPrimRound, prim::kPrimNeg, prim::kPrimExp, prim::kPrimTensorAdd,
prim::kPrimRealDiv, prim::kPrimMul, prim::kPrimMinimum, prim::kPrimMaximum, prim::kPrimLog,
prim::kPrimPow, prim::kPrimSub, prim::kPrimRsqrt, prim::kPrimSqrt, prim::kPrimCast,
prim::kPrimAddN, prim::kPrimEqual, prim::kPrimReciprocal, prim::KPrimTransData};
#else
std::vector<PrimitivePtr> fusable_basic_ops;
#endif
return std::any_of(fusable_basic_ops.begin(), fusable_basic_ops.end(),
[&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); });
}
IncludeType IncludeFusedBasicOpForward(const AnfNodePtr &cur_node, const GraphKernelInfo &info,
......@@ -53,16 +64,14 @@ IncludeType IncludeFusedBasicOpForward(const AnfNodePtr &cur_node, const GraphKe
return EXCLUDE;
}
bool is_fusable = IsBasicOp(node, info.is_before_kernel_select);
return is_fusable ? FOLLOW : EXCLUDE;
}
std::vector<AnfNodePtr> FindFuseCNodes(const CNodePtr &cnode, bool is_before_kernel_select) {
GraphKernelInfo info;
info.is_before_kernel_select = is_before_kernel_select;
  // Search fusable nodes according to the input direction.
auto include_func_forward = std::bind(IncludeFusedBasicOpForward, cnode, info, std::placeholders::_1);
auto used_nodes = DeepLinkedGraphSearch(cnode, include_func_forward);
......@@ -170,8 +179,9 @@ void RemoveControlDependOut(const FuncGraphPtr &fg, AnfNodePtrList *outputs, con
fg->set_output(fg_new_output, true);
}
void FuseBasic(const std::shared_ptr<session::KernelGraph> &kernel_graph, const std::vector<AnfNodePtr> &todos,
std::unordered_set<AnfNodePtr> *fused_ops, bool is_before_kernel_select) {
bool FuseBasicOps(const FuncGraphPtr &kernel_graph, const std::vector<AnfNodePtr> &todos,
std::unordered_set<AnfNodePtr> *fused_ops, bool is_before_kernel_select) {
bool changed = false;
auto mng = kernel_graph->manager();
for (auto iter = todos.cbegin(); iter != todos.cend(); ++iter) {
auto node = (*iter)->cast<CNodePtr>();
@@ -181,9 +191,7 @@ void FuseBasic(const std::shared_ptr<session::KernelGraph> &kernel_graph, const
if (fused_ops->count(node)) {
continue;
}
auto fusable_basic_ops = get_fusable_basic_ops(is_before_kernel_select);
bool is_basic_op = std::any_of(fusable_basic_ops.begin(), fusable_basic_ops.end(),
[&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); });
bool is_basic_op = IsBasicOp(node, is_before_kernel_select);
if (!is_basic_op || !kernel_graph->nodes().contains(node)) {
continue;
}
@@ -193,12 +201,16 @@ void FuseBasic(const std::shared_ptr<session::KernelGraph> &kernel_graph, const
continue;
}
changed = true;
FuncGraphPtr fg;
AnfNodePtrList inputs;
AnfNodePtrList outputs;
std::tie(fg, inputs, outputs) = compile::TransformSegmentToAnfGraph(fuse_nodes);
RemoveControlDependOut(fg, &outputs, mng);
auto fuse_new_node = CreateNewFuseCNode(kernel_graph, fg, inputs, outputs, is_before_kernel_select);
if (!is_before_kernel_select) {
SetNewKernelInfo(fuse_new_node, fg, inputs, outputs, AnfAlgo::GetProcessor(fuse_nodes[0]));
}
ReplaceNewFuseCNode(kernel_graph, fuse_new_node, outputs);
@@ -210,10 +222,12 @@ void FuseBasic(const std::shared_ptr<session::KernelGraph> &kernel_graph, const
fused_ops->insert(fuse_nodes.begin(), fuse_nodes.end());
fg->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(fuse_op_name));
}
std::dynamic_pointer_cast<session::KernelGraph>(kernel_graph)->SetExecOrderByDefault();
return changed;
}
} // namespace
void FuseBasic(const std::shared_ptr<session::KernelGraph> &kernel_graph, bool is_before_kernel_select) {
bool FuseBasicOps(const FuncGraphPtr &kernel_graph, bool is_before_kernel_select) {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto mng = kernel_graph->manager();
if (mng == nullptr) {
@@ -223,7 +237,9 @@ void FuseBasic(const std::shared_ptr<session::KernelGraph> &kernel_graph, bool i
std::unordered_set<AnfNodePtr> fused_ops;
auto todos = TopoSort(kernel_graph->get_return());
std::reverse(todos.begin(), todos.end());
FuseBasic(kernel_graph, todos, &fused_ops, is_before_kernel_select);
return FuseBasicOps(kernel_graph, todos, &fused_ops, is_before_kernel_select);
}
bool BasicOpsFusion::Run(const FuncGraphPtr &func_graph) { return FuseBasicOps(func_graph, false); }
} // namespace opt
} // namespace mindspore
@@ -14,8 +14,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_FUSE_BASIC_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_FUSE_BASIC_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_BASIC_OPS_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_BASIC_OPS_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
......@@ -23,7 +23,16 @@
namespace mindspore {
namespace opt {
void FuseBasic(const std::shared_ptr<session::KernelGraph> &kernel_graph, bool is_before_kernel_select);
bool FuseBasicOps(const FuncGraphPtr &kernel_graph, bool is_before_kernel_select);
class BasicOpsFusion : public Pass {
public:
BasicOpsFusion() : Pass("basic_ops_fusion") {}
~BasicOpsFusion() override = default;
bool Run(const FuncGraphPtr &func_graph) override;
};
using FuseBasicPtr = std::shared_ptr<BasicOpsFusion>;
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_FUSE_BASIC_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_BASIC_OPS_FUSION_H_
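FuseBasicOps is now exposed both as a free function and through the BasicOpsFusion pass wrapper. A minimal usage sketch, assuming a kernel_graph of type FuncGraphPtr is in scope (illustrative only, not part of this diff):

// Fuse basic ops before kernel selection; returns true if the graph changed.
bool changed = opt::FuseBasicOps(kernel_graph, /*is_before_kernel_select=*/true);

// After kernel selection, the same fusion runs through the pass interface;
// BasicOpsFusion::Run forwards to FuseBasicOps(func_graph, false).
opt::BasicOpsFusion fusion;
bool changed_by_pass = fusion.Run(kernel_graph);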
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
@@ -14,13 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_FUSE_GRAPH_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_FUSE_GRAPH_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_COMPOSITE_OPS_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_COMPOSITE_OPS_FUSION_H_
#include <set>
#include <string>
#include <vector>
#include <memory>
#include <limits>
#include "backend/optimizer/common/optimizer.h"
#include "backend/session/kernel_graph.h"
@@ -31,18 +31,20 @@ enum GraphKernelType {
REDUCE, // contain reduce ops
CUBE, // contain cube ops
};
struct GraphKernelInfo {
GraphKernelType op_type = ELEWISE;
bool is_before_kernel_select = false;
int reduce_op_num = 0;
int cal_step = 0;
std::string origin_composite_name = "";
};
// when reduce graph kernel's cal step is greater than this number, not fuse
// when fusing composite ops, if the cal step is greater than this number, do not fuse
#if ENABLE_D
const int MAX_REDUCE_OP_FUSION_CAL_STEP = 5;
// when the number of reduce ops contained in a reduce graph kernel is greater than this number, do not fuse
const int MAX_REDUCE_OP_FUSION_REDUCE_NUM = 2;
#endif
const std::set<std::string> graph_kernel_black_list = {"BNTrainingUpdateSum", "ApplyMomentum", "LayerNormForward",
"LambNextMV", "LambUpdateWithLR"};
@@ -50,14 +52,15 @@ std::vector<AnfNodePtr> RemoveCircle(const std::vector<AnfNodePtr> &fused_op, bo
void TopoSortForNodeList(std::vector<AnfNodePtr> *lst);
AnfNodePtr CreateNewFuseCNode(const std::shared_ptr<session::KernelGraph> &kernel_graph, const FuncGraphPtr &fg,
const AnfNodePtrList &inputs, const AnfNodePtrList &outputs,
bool is_before_kernel_select);
bool FuseCompositeOps(const std::shared_ptr<session::KernelGraph> &kernel_graph, bool is_before_kernel_select = false);
void ReplaceNewFuseCNode(const std::shared_ptr<session::KernelGraph> &kernel_graph, const AnfNodePtr &new_fuse_cnode,
const AnfNodePtrList &outputs);
void FuseGraphKernel(const std::shared_ptr<session::KernelGraph> &kernel_graph, bool is_before_kernel_select = false);
class CompositeOpsFusion : public Pass {
public:
CompositeOpsFusion() : Pass("composite_ops_fusion") {}
~CompositeOpsFusion() override = default;
bool Run(const FuncGraphPtr &func_graph) override;
};
using FuseGraphKernelPassPtr = std::shared_ptr<CompositeOpsFusion>;
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_FUSE_GRAPH_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_COMPOSITE_OPS_FUSION_H_
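To make the fusion thresholds above concrete, a hypothetical guard could be written as follows; the helper name CanFuseReduceKernel is invented for illustration and does not appear in this diff:

#if ENABLE_D
// Reject fusion once a reduce graph kernel exceeds either limit defined above.
bool CanFuseReduceKernel(const GraphKernelInfo &info) {
  return info.cal_step <= MAX_REDUCE_OP_FUSION_CAL_STEP &&
         info.reduce_op_num <= MAX_REDUCE_OP_FUSION_REDUCE_NUM;
}
#endif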
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_EXPANDER_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_EXPANDER_H_
#include <memory>
#include <unordered_set>
#include "ir/func_graph.h"
#include "backend/optimizer/common/pass.h"
namespace mindspore {
namespace opt {
class GraphKernelExpander : public Pass {
public:
GraphKernelExpander() : Pass("graph_kernel_expander") {}
~GraphKernelExpander() override = default;
bool Run(const FuncGraphPtr &func_graph) override;
private:
FuncGraphPtr CreateExpandFuncGraph(const CNodePtr &node);
bool DoExpand(const FuncGraphPtr &func_graph);
AnfNodePtr CreateExpandGraphKernel(const FuncGraphPtr &func_graph, const FuncGraphPtr &new_func_graph,
const CNodePtr &node);
bool CanExpand(const CNodePtr &node) {
return std::any_of(expand_ops_.begin(), expand_ops_.end(),
[&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); });
}
private:
std::unordered_set<PrimitivePtr> expand_ops_;
};
using GraphKernelExpanderPtr = std::shared_ptr<GraphKernelExpander>;
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_EXPANDER_H_
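From the declarations above, the pass's top-level flow can be read as roughly the following; this is a sketch inferred from the interface, not the PR's actual implementation:

bool GraphKernelExpander::Run(const FuncGraphPtr &func_graph) {
  // Collect the primitives that have Python expanders; GetExpandOps() is
  // declared in graph_kernel_helper.h below.
  expand_ops_ = GetExpandOps();
  // Replace each node for which CanExpand() holds with an expanded graph kernel.
  return DoExpand(func_graph);
}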
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_HELPER_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_HELPER_H_
#include <string>
#include <vector>
#include <memory>
#include <map>
#include <unordered_set>
#include <nlohmann/json.hpp>
#include "ir/anf.h"
#include "ir/func_graph.h"
#include "backend/session/kernel_graph.h"
#include "backend/kernel_compiler/akg/akg_kernel_json_generator.h"
namespace mindspore {
namespace opt {
using kernel::DumpOption;
constexpr auto kGraphKernelModule = "mindspore._extends.graph_kernel";
constexpr auto kGraphKernelSplitFunc = "split_with_json";
constexpr auto kGetGraphKernelOpExpander = "get_op_expander";
constexpr auto kJsonKeyMultiGraph = "multi_graph";
constexpr auto kJsonKeyGraphDesc = "graph_desc";
void SetNewKernelInfo(const AnfNodePtr &new_node, const FuncGraphPtr &fg, const AnfNodePtrList &inputs,
const AnfNodePtrList &outputs, kernel::Processor processor);
AnfNodePtrList GetExpandOuts(const AnfNodePtrList &outs);
AnfNodePtr CreateNewFuseCNode(const FuncGraphPtr &kernel_graph, const FuncGraphPtr &fg, const AnfNodePtrList &inputs,
const AnfNodePtrList &outputs, bool is_before_kernel_select);
void ReplaceNewFuseCNode(const FuncGraphPtr &kernel_graph, const AnfNodePtr &new_fuse_cnode,
const AnfNodePtrList &outputs);
void FuseNodesToSubGraph(const std::vector<AnfNodePtr> &fuse_nodes,
const std::shared_ptr<session::KernelGraph> &kernel_graph, const std::string &postfix,
bool is_before_kernel_select);
bool AnfToJsonDesc(const AnfNodePtrList &nodes, DumpOption dump_option, nlohmann::json *op_desc,
std::map<std::string, AnfNodePtr> *address_node_map = nullptr);
bool AnfToJsonDesc(const std::vector<AnfNodePtrList> &graphs, DumpOption dump_option, nlohmann::json *op_desc);
FuncGraphPtr JsonDescToAnf(const std::string &json_desc, const std::vector<AnfNodePtr> &inputs);
bool JsonDescToAnf(const std::string &json_desc, const std::map<std::string, AnfNodePtr> &address_node_map,
std::vector<AnfNodePtrList> *res_graphs);
std::unordered_set<PrimitivePtr> GetExpandOps();
std::string ExtractGraphKernelName(const AnfNodePtrList &cnodes, const string &prefix = "", const string &postfix = "");
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_HELPER_H_
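The kGraphKernel* constants name the Python entry points of the mindspore._extends.graph_kernel package. A hedged sketch of the callback, assuming the parse::python_adapter::CallPyFn helper used elsewhere in the codebase is available here:

// Ask the Python expander for a graph description of one node; node_json is a
// hypothetical json string describing the op, named here for illustration.
py::object ret = parse::python_adapter::CallPyFn(kGraphKernelModule, kGetGraphKernelOpExpander, node_json);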
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_SPLITTER_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_SPLITTER_H_
#include <memory>
#include "ir/func_graph.h"
#include "backend/optimizer/common/pass.h"
namespace mindspore {
namespace opt {
class GraphKernelSplitter : public Pass {
public:
GraphKernelSplitter() : Pass("graph_kernel_splitter") {}
~GraphKernelSplitter() override = default;
bool Run(const FuncGraphPtr &func_graph) override;
};
using GraphKernelSplitterPtr = std::shared_ptr<GraphKernelSplitter>;
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_SPLITTER_H_
@@ -41,6 +41,7 @@
#include "utils/config_manager.h"
#include "utils/base_ref_extends.h"
#include "debug/tensor_load.h"
#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
namespace mindspore {
namespace session {
......
@@ -51,6 +51,8 @@ class GPUSession : public SessionBasic {
void HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_graph);
void GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph);
void AssignStream(const std::shared_ptr<KernelGraph> &kernel_graph);
void BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const;
......
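GraphKernelOptimize above is the hook where the new graph-kernel passes plug into the GPU session. One plausible wiring, assuming the common opt::GraphOptimizer / opt::PassManager API; the actual body lives in the collapsed diffs below:

void GPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>("graph_kernel_pm");
  pm->AddPass(std::make_shared<opt::GraphKernelExpander>());  // expand ops via the Python expanders
  pm->AddPass(std::make_shared<opt::BasicOpsFusion>());       // fuse elementwise basic ops
  pm->AddPass(std::make_shared<opt::GraphKernelSplitter>());  // split fusions the codegen cannot handle
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();  // refresh execution order after rewriting
}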
(The diffs of 26 more files are collapsed and not shown.)