From 63f6ce7bd9494699373ac22267a3234fc9818e5b Mon Sep 17 00:00:00 2001
From: ronnywang <524019753@qq.com>
Date: Fri, 23 Jul 2021 18:52:21 +0800
Subject: [PATCH] [NPU] add index_sample_op_npu and tests (#34239)

* add index_sample_op_npu and tests

* update
---
 paddle/fluid/operators/index_sample_op_npu.cc | 130 ++++++++++++
 .../unittests/npu/test_index_sample_op_npu.py | 193 ++++++++++++++++++
 2 files changed, 323 insertions(+)
 create mode 100644 paddle/fluid/operators/index_sample_op_npu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_index_sample_op_npu.py

diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc
new file mode 100644
index 00000000000..f5a4100c635
--- /dev/null
+++ b/paddle/fluid/operators/index_sample_op_npu.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/index_sample_op.h"
+
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+template <typename T>
+class IndexSampleNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* index = ctx.Input<Tensor>("Index");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    Tensor transformed_index;
+    const auto& index_type = index->type();
+    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
+                            index_type == framework::proto::VarType::INT64;
+    PADDLE_ENFORCE_EQ(index_type_match, true,
+                      platform::errors::InvalidArgument(
+                          "Input(Index) holds the wrong type, it holds %s, but "
+                          "desires to be %s or %s",
+                          paddle::framework::DataTypeToString(index_type),
+                          paddle::framework::DataTypeToString(
+                              framework::proto::VarType::INT32),
+                          paddle::framework::DataTypeToString(
+                              framework::proto::VarType::INT64)));
+    // Unify the index type to int64 before running GatherElements.
+    if (index_type == framework::proto::VarType::INT32) {
+      transformed_index.mutable_data<int64_t>(index->dims(),
+                                              dev_ctx.GetPlace());
+      const auto& cast_runner = NpuOpRunner(
+          "Cast", {*index}, {transformed_index}, {{"dst_type", ACL_INT64}});
+      cast_runner.Run(dev_ctx.stream());
+    } else {
+      transformed_index.ShareDataWith(*index);
+    }
+
+    const auto& runner = NpuOpRunner(
+        "GatherElements", {*input, transformed_index}, {*out}, {{"dim", 1}});
+    runner.Run(dev_ctx.stream());
+  }
+};
+
+template <typename IndexT>
+void IndexSampleGradScatter(const paddle::platform::NPUDeviceContext& dev_ctx,
+                            const Tensor* index, const Tensor* out_grad,
+                            Tensor* x_grad) {
+  auto index_dims = index->dims();
+  auto input_dims = x_grad->dims();
+  auto batch_size = input_dims[0];
+  auto index_length = index_dims[1];
+
+  std::vector<IndexT> scatter_index_vec;
+  std::vector<IndexT> index_vec;
+  framework::TensorToVector(*index, dev_ctx, &index_vec);
+  for (auto i = 0; i < batch_size; ++i) {
+    for (auto j = 0; j < index_length; j++) {
+      scatter_index_vec.push_back(i);
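+      // Each ScatterNd coordinate is a (row, col) pair: the row i pushed
+      // above, then the sampled column pushed below.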
+      scatter_index_vec.push_back(index_vec[i * index_length + j]);
+    }
+  }
+  Tensor scatter_index;
+  framework::TensorFromVector(scatter_index_vec, dev_ctx, &scatter_index);
+  scatter_index.Resize({batch_size, index_length, 2});
+
+  // Scatter out_grad back into x_grad at the (row, col) coordinates
+  // assembled above.
+  NpuOpRunner runner;
+  runner.SetType("ScatterNd")
+      .AddInput(scatter_index)
+      .AddInput(*out_grad)
+      .AddInput(framework::vectorize(x_grad->dims()))
+      .AddOutput(*x_grad);
+  runner.Run(dev_ctx.stream());
+}
+
+template <typename T>
+class IndexSampleGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>();
+    auto* index = ctx.Input<Tensor>("Index");
+    auto* out_grad =
+        ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        ctx.Output<Tensor>(framework::GradVarName("X"));
+    x_grad->mutable_data<T>(ctx.GetPlace());
+
+    const auto& index_type = index->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      IndexSampleGradScatter<int32_t>(dev_ctx, index, out_grad, x_grad);
+    } else {
+      IndexSampleGradScatter<int64_t>(dev_ctx, index, out_grad, x_grad);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(index_sample,
+                       ops::IndexSampleNPUKernel<plat::float16>,
+                       ops::IndexSampleNPUKernel<float>,
+                       ops::IndexSampleNPUKernel<int32_t>,
+                       ops::IndexSampleNPUKernel<int64_t>);
+REGISTER_OP_NPU_KERNEL(index_sample_grad,
+                       ops::IndexSampleGradNPUKernel<plat::float16>,
+                       ops::IndexSampleGradNPUKernel<float>,
+                       ops::IndexSampleGradNPUKernel<int32_t>,
+                       ops::IndexSampleGradNPUKernel<int64_t>);
diff --git a/python/paddle/fluid/tests/unittests/npu/test_index_sample_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_index_sample_op_npu.py
new file mode 100644
index 00000000000..9b890d22ada
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_index_sample_op_npu.py
@@ -0,0 +1,193 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+import unittest
+import numpy as np
+sys.path.append("..")
+
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+
+paddle.enable_static()
+
+
+class TestIndexSampleOp(OpTest):
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "index_sample"
+        self.config()
+        xnp = np.random.random(self.x_shape).astype(self.dtype)
+        indexnp = np.random.randint(
+            low=0, high=self.x_shape[1],
+            size=self.index_shape).astype(self.index_type)
+        self.inputs = {'X': xnp, 'Index': indexnp}
+        index_array = []
+        for i in range(self.index_shape[0]):
+            for j in indexnp[i]:
+                index_array.append(xnp[i, j])
+        index_array = np.array(index_array).astype(self.dtype)
+        out = np.reshape(index_array, self.index_shape)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output_with_place(paddle.NPUPlace(0))
+
+    def test_check_grad(self):
+        self.check_grad_with_place(paddle.NPUPlace(0), ['X'], 'Out')
+
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = (10, 20)
+        self.dtype = "float32"
+        self.index_shape = (10, 10)
+        self.index_type = "int32"
+
+
+class TestCase1(TestIndexSampleOp):
+    def config(self):
+        """
+        For one dimension input
+        """
+        self.x_shape = (100, 1)
+        self.dtype = "float32"
+        self.index_shape = (100, 1)
+        self.index_type = "int32"
+
+
+class TestCase2(TestIndexSampleOp):
+    def config(self):
+        """
+        For int64_t index type
+        """
+        self.x_shape = (10, 100)
+        self.dtype = "float32"
+        self.index_shape = (10, 10)
+        self.index_type = "int64"
+
+
+class TestCase3(TestIndexSampleOp):
+    def config(self):
+        """
+        For int index type
+        """
+        self.x_shape = (10, 100)
+        self.dtype = "float32"
+        self.index_shape = (10, 10)
+        self.index_type = "int32"
+
+
+class TestCase4(TestIndexSampleOp):
+    def config(self):
+        """
+        For int64 index type
+        """
+        self.x_shape = (10, 128)
+        self.dtype = "float32"
+        self.index_shape = (10, 64)
+        self.index_type = "int64"
+
+
+class TestCase5(TestIndexSampleOp):
+    def config(self):
+        """
+        For float16 x type
+        """
+        self.__class__.no_need_check_grad = True
+        self.x_shape = (10, 128)
+        self.dtype = "float16"
+        self.index_shape = (10, 64)
+        self.index_type = "int64"
+
+    def test_check_grad(self):
+        pass
+
+
+class TestCase6(TestCase5):
+    def config(self):
+        """
+        For int32 x type
+        """
+        self.__class__.no_need_check_grad = True
+        self.x_shape = (10, 128)
+        self.dtype = "int32"
+        self.index_shape = (10, 64)
+        self.index_type = "int64"
+
+
+class TestCase7(TestCase5):
+    def config(self):
+        """
+        For int64 x type
+        """
+        self.__class__.no_need_check_grad = True
+        self.x_shape = (10, 128)
+        self.dtype = "int64"
+        self.index_shape = (10, 64)
+        self.index_type = "int64"
+
+
+class TestIndexSampleShape(unittest.TestCase):
+    def test_shape(self):
+        paddle.enable_static()
+        # create x value
+        x_shape = (2, 5)
+        x_type = "float32"
+        x_np = np.random.random(x_shape).astype(x_type)
+
+        # create index value
+        index_shape = (2, 3)
+        index_type = "int32"
+        index_np = np.random.randint(
+            low=0, high=x_shape[1], size=index_shape).astype(index_type)
+
+        x = fluid.data(name='x', shape=[-1, 5], dtype='float32')
+        index = fluid.data(name='index', shape=[-1, 3], dtype='int32')
+        output = paddle.index_sample(x=x, index=index)
+
+        place = fluid.NPUPlace(0)
+        exe = fluid.Executor(place=place)
+        exe.run(fluid.default_startup_program())
+
+        feed = {'x': x_np, 'index': index_np}
+        res = exe.run(feed=feed, fetch_list=[output])
+
+
+class TestIndexSampleDynamic(unittest.TestCase):
+    def test_result(self):
+        with fluid.dygraph.guard(paddle.NPUPlace(0)):
+            x = paddle.to_tensor(
+                [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
+                 [9.0, 10.0, 11.0, 12.0]],
+                dtype='float32')
+            index = paddle.to_tensor(
+                [[0, 1, 2], [1, 2, 3], [0, 0, 0]], dtype='int32')
+            out_z1 = paddle.index_sample(x, index)
+
+            expected_output = np.array(
+                [[1.0, 2.0, 3.0], [6.0, 7.0, 8.0], [9.0, 9.0, 9.0]])
+            np.testing.assert_allclose(out_z1.numpy(), expected_output)
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
-- 
GitLab
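For reference, here is a minimal NumPy sketch (not part of the patch itself) of the semantics the two kernels implement: the forward pass is the row-wise gather that GatherElements with dim=1 performs, and the backward pass mirrors IndexSampleGradScatter, which flattens each gradient target into a (row, col) coordinate pair for ScatterNd. The helper names below are illustrative, and the additive handling of duplicate coordinates is an assumption about the CANN ScatterNd op that matches TensorFlow's scatter_nd.

import numpy as np

def index_sample_forward(x, index):
    # out[i][j] = x[i][index[i][j]], i.e. GatherElements along dim=1.
    out = np.empty(index.shape, dtype=x.dtype)
    for i in range(index.shape[0]):
        for j in range(index.shape[1]):
            out[i, j] = x[i, index[i, j]]
    return out

def index_sample_grad(index, out_grad, x_shape):
    # Mirrors IndexSampleGradScatter: each out_grad[i][j] is scattered
    # back to x_grad[i][index[i][j]]; duplicate (row, col) coordinates
    # accumulate, as assumed of ScatterNd above.
    x_grad = np.zeros(x_shape, dtype=out_grad.dtype)
    for i in range(index.shape[0]):
        for j in range(index.shape[1]):
            x_grad[i, index[i, j]] += out_grad[i, j]
    return x_grad

x = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]])
index = np.array([[0, 0, 3], [1, 2, 3]])
out = index_sample_forward(x, index)    # [[1. 1. 4.] [6. 7. 8.]]
grad = index_sample_grad(index, np.ones_like(out), x.shape)
# [[2. 0. 0. 1.] [0. 1. 1. 1.]] -- the repeated column 0 accumulates to 2.

Encoding the scatter targets as explicit (row, col) pairs is what lets repeated indices within a row, such as the [0, 0, 0] row in TestIndexSampleDynamic, sum their gradients rather than overwrite one another.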