Commit 9b616058 authored by Anlun Xu, committed by TensorFlower Gardener

[XLA:GPU][NFC] Make PickBestAlgorithmNoCacheCuda suitable for runtime autotuning

We add parameters to the function interface so that they no longer have to be computed from the HLO instruction, since HLO instructions are not available at runtime.
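For illustration, a minimal sketch of how the reworked entry points might be invoked. The surrounding variables (`instr`, `allocator`, `stream`, `stream_exec`, `rz_allocator`) and the runtime-path call with a null instruction are assumptions for the example, not part of this change; only the signatures below come from the diff.

// Hedged sketch only; assumes the caller already holds `instr`, `allocator`,
// `stream`, `stream_exec`, and `rz_allocator`. Not part of this commit.

// Compile-time path: the HLO instruction exists, so both the debug info and
// the runtime arguments can be derived from it.
TF_ASSIGN_OR_RETURN(AutotuneRuntimeArguments runtime_arguments,
                    AutotuneRuntimeArguments::FromInstruction(
                        instr, allocator, stream_exec, &rz_allocator));
TF_ASSIGN_OR_RETURN(
    tensorflow::AutotuneResult result,
    PickBestAlgorithmNoCacheCuda(instr, allocator, stream,
                                 AutotuneInstructionInfo(instr),
                                 runtime_arguments));

// Runtime path (assumed): no HLO instruction is available, so the caller
// builds AutotuneRuntimeArguments from live buffers itself and passes
// std::nullopt for the debug-only instruction info.
TF_ASSIGN_OR_RETURN(
    tensorflow::AutotuneResult runtime_result,
    PickBestAlgorithmNoCacheCuda(/*instr=*/nullptr, allocator, stream,
                                 /*instruction_info=*/std::nullopt,
                                 runtime_arguments));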

PiperOrigin-RevId: 507925824
Parent 4e700dff
@@ -18,7 +18,9 @@ limitations under the License.
 #include <optional>
 #include <string>
+#include <string_view>
 #include <variant>
+#include <vector>
 
 #include "absl/time/time.h"
 #include "tensorflow/compiler/xla/autotune_results.pb.h"
@@ -82,18 +84,48 @@ class GpuConvAlgorithmPicker : public GpuSerializableAutotuner {
     stream_executor::DeviceMemoryBase buffer;
   };
 
+  // Debug information about the instruction we are autotuning.
+  struct AutotuneInstructionInfo {
+    std::string instr_str;
+    std::string module_str;
+
+    explicit AutotuneInstructionInfo(const HloCustomCallInstruction* instr)
+        : instr_str(instr->ToString()),
+          module_str(instr->GetModule()->ToString()) {}
+  };
+
+  // Execution environment for autotuning. Runtime autotuning requires runtime
+  // information such as input/output buffers in order to run. It can be
+  // constructed from the autotuned instruction by FromInstruction.
+  struct AutotuneRuntimeArguments {
+    const Shape result_shape;
+    const HloModuleConfig hlo_module_config;
+    std::vector<se::DeviceMemoryBase> operand_buffers;
+    se::DeviceMemoryBase result_buffer;
+    se::RedzoneAllocator* input_output_allocator;
+    const GpuConvConfig gpu_conv_config;
+    std::string canonical_hlo;
+
+    static StatusOr<AutotuneRuntimeArguments> FromInstruction(
+        const HloCustomCallInstruction* instr,
+        se::DeviceMemoryAllocator* allocator, se::StreamExecutor* stream,
+        se::RedzoneAllocator* input_output_allocator);
+  };
+
   StatusOr<tensorflow::AutotuneResult> AutotuneOneConvRunner(
-      const GpuConvConfig& config, const HloCustomCallInstruction* instr,
-      se::DeviceMemoryAllocator* allocator,
-      se::RedzoneAllocator* input_output_allocator, se::Stream* stream,
+      se::DeviceMemoryAllocator* allocator, se::Stream* stream,
       MaybeFusedConvRunner* const runner,
-      absl::Span<const stream_executor::DeviceMemoryBase> operand_buffers,
-      stream_executor::DeviceMemoryBase result_buffer,
       std::optional<ReferenceResult>* reference_result,
-      absl::Span<const stream_executor::dnn::AlgorithmDesc> disabled_algos);
+      absl::Span<const stream_executor::dnn::AlgorithmDesc> disabled_algos,
+      std::optional<AutotuneInstructionInfo> instruction_info,
+      const AutotuneRuntimeArguments& runtime_arguments);
 
+  // Pick the best algorithm for CUDA platform.
   StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheCuda(
       const HloCustomCallInstruction* instr,
-      se::DeviceMemoryAllocator* allocator, se::Stream* stream);
+      se::DeviceMemoryAllocator* allocator, se::Stream* stream,
+      std::optional<AutotuneInstructionInfo> instruction_info,
+      const AutotuneRuntimeArguments& runtime_arguments);
 #endif
 
   StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheRocm(
...