[cherry-pick] Fix gru as small frame_size has error. (#20922) (#21440)

It seems that shuffle_sync cannot handle small frame sizes.
test=develop
Signed-off-by: N zhaoyuchen <zhaoyuchen01@baidu.com>

[cherry-pick] Fix gru as small frame_size has error. (#20922) (#21440)
It seems that shuffle_sync cannot handle small frame sizes.
test=develop
Signed-off-by: N zhaoyuchen <zhaoyuchen01@baidu.com>
873b32de · zhaoyuchen2018 · GitHub · 0473cdb8 · 873b32de
隐藏空白更改
内联并排

Showing 1 changed file with 35 additions and 17 deletions

paddle/fluid/operators/math/gru_compute.cu paddle/fluid/operators/math/gru_compute.cu +35 -17

未找到文件。
--- a/paddle/fluid/operators/math/gru_compute.cu
+++ b/paddle/fluid/operators/math/gru_compute.cu
@@ -31,23 +31,41 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
    dim3 grid;
    if (batch_size == 1) {
      if (context.GetComputeCapability() >= 70) {
-        constexpr int tiled_size = 16;
-        int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
-        threads = dim3(tiled_size, 1);
-        grid = dim3(frame_blocks, 1);
-        detail::KeFastCollectiveGruGate<
-            T, tiled_size><<<grid, threads, 0, stream>>>(
-            value.gate_value, value.prev_out_value, value.gate_weight,
-            value.reset_output_value, frame_size, active_gate);
-
-        frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
-        grid = dim3(frame_blocks, 1);
-        detail::KeFastCollectiveGruOut<
-            T, tiled_size><<<grid, threads, 0, stream>>>(
-            value.state_weight, value.prev_out_value, value.output_value,
-            value.gate_value, value.reset_output_value, frame_size, active_node,
-            origin_mode);
-
+        if (frame_size < 16) {
+          constexpr int tiled_size = 8;
+          int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
+          threads = dim3(tiled_size, 1);
+          grid = dim3(frame_blocks, 1);
+          detail::KeFastCollectiveGruGate<
+              T, tiled_size><<<grid, threads, 0, stream>>>(
+              value.gate_value, value.prev_out_value, value.gate_weight,
+              value.reset_output_value, frame_size, active_gate);
+
+          frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
+          grid = dim3(frame_blocks, 1);
+          detail::KeFastCollectiveGruOut<
+              T, tiled_size><<<grid, threads, 0, stream>>>(
+              value.state_weight, value.prev_out_value, value.output_value,
+              value.gate_value, value.reset_output_value, frame_size,
+              active_node, origin_mode);
+        } else {
+          constexpr int tiled_size = 16;
+          int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
+          threads = dim3(tiled_size, 1);
+          grid = dim3(frame_blocks, 1);
+          detail::KeFastCollectiveGruGate<
+              T, tiled_size><<<grid, threads, 0, stream>>>(
+              value.gate_value, value.prev_out_value, value.gate_weight,
+              value.reset_output_value, frame_size, active_gate);
+
+          frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
+          grid = dim3(frame_blocks, 1);
+          detail::KeFastCollectiveGruOut<
+              T, tiled_size><<<grid, threads, 0, stream>>>(
+              value.state_weight, value.prev_out_value, value.output_value,
+              value.gate_value, value.reset_output_value, frame_size,
+              active_node, origin_mode);
+        }
        return;
      } else {
        int frame_per_block = frame_size <= 1024 ? frame_size : 1024;