Commit 9b616058 authored by Anlun Xu, committed by TensorFlower Gardener

[XLA:GPU][NFC] Make PickBestAlgorithmNoCacheCuda suitable for runtime autotuning

We add parameters to the function interface so that they no longer have to be computed from the HLO instruction, since HLO instructions are not available at runtime.
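For illustration, a minimal sketch of how the reworked entry points might be invoked. The surrounding variables (`instr`, `allocator`, `stream`, `stream_exec`, `rz_allocator`) and the runtime-path call with a null instruction are assumptions for the example, not part of this change; only the signatures below come from the diff.

// Hedged sketch only; assumes the caller already holds `instr`, `allocator`,
// `stream`, `stream_exec`, and `rz_allocator`. Not part of this commit.

// Compile-time path: the HLO instruction exists, so both the debug info and
// the runtime arguments can be derived from it.
TF_ASSIGN_OR_RETURN(AutotuneRuntimeArguments runtime_arguments,
                    AutotuneRuntimeArguments::FromInstruction(
                        instr, allocator, stream_exec, &rz_allocator));
TF_ASSIGN_OR_RETURN(
    tensorflow::AutotuneResult result,
    PickBestAlgorithmNoCacheCuda(instr, allocator, stream,
                                 AutotuneInstructionInfo(instr),
                                 runtime_arguments));

// Runtime path (assumed): no HLO instruction is available, so the caller
// builds AutotuneRuntimeArguments from live buffers itself and passes
// std::nullopt for the debug-only instruction info.
TF_ASSIGN_OR_RETURN(
    tensorflow::AutotuneResult runtime_result,
    PickBestAlgorithmNoCacheCuda(/*instr=*/nullptr, allocator, stream,
                                 /*instruction_info=*/std::nullopt,
                                 runtime_arguments));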

PiperOrigin-RevId: 507925824
Parent 4e700dff
@@ -18,7 +18,9 @@ limitations under the License.
 #include <optional>
 #include <string>
+#include <string_view>
 #include <variant>
+#include <vector>
 
 #include "absl/time/time.h"
 #include "tensorflow/compiler/xla/autotune_results.pb.h"
@@ -82,18 +84,48 @@ class GpuConvAlgorithmPicker : public GpuSerializableAutotuner {
     stream_executor::DeviceMemoryBase buffer;
   };
 
+  // Debug information about the instruction we are autotuning.
+  struct AutotuneInstructionInfo {
+    std::string instr_str;
+    std::string module_str;
+
+    explicit AutotuneInstructionInfo(const HloCustomCallInstruction* instr)
+        : instr_str(instr->ToString()),
+          module_str(instr->GetModule()->ToString()) {}
+  };
+
+  // Execution environment for autotuning. Runtime autotuning requires runtime
+  // information such as input/output buffers in order to run. It can be
+  // constructed from the autotuned instruction by FromInstruction.
+  struct AutotuneRuntimeArguments {
+    const Shape result_shape;
+    const HloModuleConfig hlo_module_config;
+    std::vector<se::DeviceMemoryBase> operand_buffers;
+    se::DeviceMemoryBase result_buffer;
+    se::RedzoneAllocator* input_output_allocator;
+    const GpuConvConfig gpu_conv_config;
+    std::string canonical_hlo;
+
+    static StatusOr<AutotuneRuntimeArguments> FromInstruction(
+        const HloCustomCallInstruction* instr,
+        se::DeviceMemoryAllocator* allocator, se::StreamExecutor* stream,
+        se::RedzoneAllocator* input_output_allocator);
+  };
+
   StatusOr<tensorflow::AutotuneResult> AutotuneOneConvRunner(
-      const GpuConvConfig& config, const HloCustomCallInstruction* instr,
-      se::DeviceMemoryAllocator* allocator,
-      se::RedzoneAllocator* input_output_allocator, se::Stream* stream,
+      se::DeviceMemoryAllocator* allocator, se::Stream* stream,
       MaybeFusedConvRunner* const runner,
-      absl::Span<const stream_executor::DeviceMemoryBase> operand_buffers,
-      stream_executor::DeviceMemoryBase result_buffer,
       std::optional<ReferenceResult>* reference_result,
-      absl::Span<const stream_executor::dnn::AlgorithmDesc> disabled_algos);
+      absl::Span<const stream_executor::dnn::AlgorithmDesc> disabled_algos,
+      std::optional<AutotuneInstructionInfo> instruction_info,
+      const AutotuneRuntimeArguments& runtime_arguments);
 
+  // Pick the best algorithm for CUDA platform.
   StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheCuda(
       const HloCustomCallInstruction* instr,
-      se::DeviceMemoryAllocator* allocator, se::Stream* stream);
+      se::DeviceMemoryAllocator* allocator, se::Stream* stream,
+      std::optional<AutotuneInstructionInfo> instruction_info,
+      const AutotuneRuntimeArguments& runtime_arguments);
 #endif
 
   StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheRocm(
...