Calculate GpuProfiler::GetScratchBytes using BufferUses in Autotuner

EusebioDM · tensorflower-gardener · commit b810251ea281 · 2026-06-29T06:10:23.000-07:00
- Old logic: Scans physical `BufferAllocation`s and only counts those where `IsPreallocatedTempBuffer()` is true. This misses scratch buffers that the compiler overlays onto live-out (output) allocations to save memory.

- New logic: Walks the executed `Thunk` sequence and sums the sizes of all logical `BufferUse::Scratch` slices, accurately capturing scratch usage regardless of physical overlay optimizations.

Example (32MB matmul scratch overlaid on a 157MB live-out output buffer):
- Old logic: Returns 0 bytes (skips the live-out allocation).
- New logic: Returns 32MB (correctly extracts the scratch thunk use).

Also had to add a const way to walk Thunks.

#### Why this is needed

The current logic relies on the `BufferAssignment` being alive, since the `buffer` variable, which holds an `HloValue`, is owned by the `BufferAssignment` class.

I'm in the process of removing the `BufferAssignment` from the executable, since it cannot be re-created when loading an AOT binary and it's not really needed. So we need to get rid of this implicit dependency before doing so.

PiperOrigin-RevId: 939785631
diff --git a/third_party/xla/xla/backends/gpu/autotuner/BUILD b/third_party/xla/xla/backends/gpu/autotuner/BUILD
@@ -400,23 +400,25 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/backends/autotuner:profiler",
         "//xla/backends/gpu/runtime:buffer_comparator",
+        "//xla/backends/gpu/runtime:thunk",
+        "//xla/backends/gpu/runtime:thunk_executor",
         "//xla/hlo/ir:hlo",
+        "//xla/runtime:buffer_use",
+        "//xla/service:buffer_assignment",
         "//xla/service:executable",
         "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service/gpu:backend_configs_cc",
+        "//xla/service/gpu:gpu_executable",
         "//xla/service/gpu:gpu_executable_run_options",
-        "//xla/service/gpu:matmul_utils",
         "//xla/service/gpu:stream_executor_util",
         "//xla/service/gpu/autotuning:redzone_buffers",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:stream_executor_address_allocator",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/gpu:redzone_allocator",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:status_macros",
-        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
@@ -427,7 +429,6 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
-        "@tsl//tsl/platform:casts",
     ],
 )
 
@@ -708,12 +709,10 @@ xla_test(
         "//xla/service/gpu:nvptx_compiler_impl",
         "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
+        "//xla/stream_executor:stream_executor_address_allocator",
         "//xla/stream_executor:stream_executor_h",
-        "//xla/stream_executor:stream_executor_memory_allocator",
         "//xla/tsl/lib/core:status_test_util",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:status_macros",
-        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:status_matchers",
diff --git a/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler.cc b/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler.cc
@@ -34,16 +34,18 @@ limitations under the License.
 #include "xla/tsl/platform/status_macros.h"
 #include "xla/backends/autotuner/profiler.h"
 #include "xla/backends/gpu/runtime/buffer_comparator.h"
+#include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk_executor.h"
 #include "xla/executable_run_options.h"
-#include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/runtime/buffer_use.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/service/executable.h"
 #include "xla/service/gpu/autotuning/redzone_buffers.h"
 #include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
-#include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/gpu/stream_executor_util.h"
 #include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
@@ -55,10 +57,7 @@ limitations under the License.
 #include "xla/stream_executor/gpu/redzone_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_address_allocator.h"
-#include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/statusor.h"
 #include "xla/xla_data.pb.h"
-#include "tsl/platform/casts.h"
 
 namespace xla {
 
@@ -81,23 +80,24 @@ std::vector<ExecutionInput> CreateExecutionInputsFromBuffers(
   return inputs;
 }
 
-int GetScratchBytes(const Executable* executable) {
-  int scratch_bytes = 0;
-  for (const auto* allocation : executable->GetAllocations()) {
-    if (allocation->IsPreallocatedTempBuffer()) {
-      for (const auto& [buffer, offset] : allocation->assigned_buffers()) {
-        // Scratch space is allocated as the second element in the output tuple
-        // of the instruction.
-        const auto& shape_index = buffer->positions().front().index;
-        bool is_second_element_in_output_tuple =
-            !shape_index.empty() && shape_index[0] == 1;
-        if (is_second_element_in_output_tuple) {
-          scratch_bytes += offset.size;
+int GetScratchBytes(const GpuExecutable& executable) {
+  int32_t scratch_bytes = 0;
+  CHECK_OK(executable.thunk_executor().thunks().WalkNested(
+      [&scratch_bytes](const Thunk* thunk) {
+        std::vector<BufferAllocation::Slice> scratch_slices;
+        for (const auto& buffer_use : thunk->buffer_uses()) {
+          // ContentValidity::kUndefined means the buffer is a scratch buffer.
+          if (buffer_use.content_validity() ==
+              BufferUse::ContentValidity::kUndefined) {
+            // TODO(b/517426568): De-duplicate overlapping slices.
+            scratch_bytes += buffer_use.slice().size();
+          }
         }
-      }
-    }
-  }
-  return scratch_bytes;
+
+        return absl::OkStatus();
+      }));
+
+  return static_cast<int>(scratch_bytes);
 }
 
 // Initialize a specific input buffer with custom values.
@@ -267,8 +267,12 @@ absl::StatusOr<ProfileResult> GpuProfiler::Profile(
   const GpuInputBuffers& gpu_buffers =
       absl::down_cast<const GpuInputBuffers&>(buffers);
   const RedzoneBuffers& rz_buffers = gpu_buffers.redzone_buffers;
+
   ProfileResult result;
-  result.scratch_bytes = GetScratchBytes(executable);
+  if (auto* gpu_executable = dynamic_cast<const GpuExecutable*>(executable);
+      gpu_executable != nullptr) {
+    result.scratch_bytes = GetScratchBytes(*gpu_executable);
+  }
   {
     // Warm up run.
     std::vector<ExecutionInput> execution_inputs =
diff --git a/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler_test.cc b/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler_test.cc
@@ -47,10 +47,8 @@ limitations under the License.
 #include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
-#include "xla/stream_executor/stream_executor_memory_allocator.h"
+#include "xla/stream_executor/stream_executor_address_allocator.h"
 #include "xla/tsl/lib/core/status_test_util.h"
-#include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/statusor.h"
 #include "xla/xla_data.pb.h"
 
 namespace xla {
@@ -145,15 +143,15 @@ TEST_F(GpuProfilerTest, CreateInputBuffersAndProfile) {
       ROOT c = s32[] constant(1)
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kHloModule));
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(kHloModule));
   MockExecutable mock_executable(module, 1000);
   auto profiler =
       GpuProfiler::Create(stream_exec_, ProfileOptions(), allocator_.get());
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
-                          profiler->CreateInputBuffers(&mock_executable));
-  TF_ASSERT_OK_AND_ASSIGN(ProfileResult profile,
-                          profiler->Profile(&mock_executable, *buffers));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
+                       profiler->CreateInputBuffers(&mock_executable));
+  ASSERT_OK_AND_ASSIGN(ProfileResult profile,
+                       profiler->Profile(&mock_executable, *buffers));
   EXPECT_EQ(profile.duration, absl::Nanoseconds(1000));
   EXPECT_EQ(profile.output_buffer->on_device_shape(),
             ShapeUtil::MakeShape(S32, {}));
@@ -167,15 +165,15 @@ TEST_F(GpuProfilerTest, FailingExecutablesReturnStatus) {
       ROOT c = s32[] constant(1)
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kHloModule));
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(kHloModule));
   MockExecutable mock_executable(module, /*duration_ns=*/0,
                                  /*should_fail=*/true);
 
   auto profiler =
       GpuProfiler::Create(stream_exec_, ProfileOptions(), allocator_.get());
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
-                          profiler->CreateInputBuffers(&mock_executable));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
+                       profiler->CreateInputBuffers(&mock_executable));
   EXPECT_THAT(profiler->Profile(&mock_executable, *buffers),
               StatusIs(absl::StatusCode::kInternal));
 }
@@ -191,14 +189,14 @@ TEST_P(GpuProfilerTestWithRedzonePadding, CheckInputBuffers) {
       ROOT c = s32[] constant(1)
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kHloModule));
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(kHloModule));
   MockExecutable mock_executable(module, 1000);
   ProfileOptions options;
   options.redzone_padding_bytes = GetParam();
   auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
-                          profiler->CreateInputBuffers(&mock_executable));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
+                       profiler->CreateInputBuffers(&mock_executable));
   TF_EXPECT_OK(profiler->CheckInputBuffers(*buffers));
 }
 
@@ -210,33 +208,33 @@ TEST_F(GpuProfilerTest, CheckOutputBufferWhenBuffersAreSame) {
   ProfileOptions options;
   auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
 
-  TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
+  ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
   auto allocator =
       std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
           stream_exec_);
-  TF_ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer output,
-                          CreateTestBuffer(allocator.get(), stream_exec_,
-                                           stream.get(), /*value=*/1));
-  TF_ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer reference,
-                          CreateTestBuffer(allocator.get(), stream_exec_,
-                                           stream.get(), /*value=*/1));
+  ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer output,
+                       CreateTestBuffer(allocator.get(), stream_exec_,
+                                        stream.get(), /*value=*/1));
+  ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer reference,
+                       CreateTestBuffer(allocator.get(), stream_exec_,
+                                        stream.get(), /*value=*/1));
   EXPECT_THAT(profiler->CheckOutputBuffer(output, reference, /*rtol=*/0.0),
               StatusIs(absl::StatusCode::kOk));
 }
 
 TEST_F(GpuProfilerTest, CheckOutputBufferWhenBuffersAreDifferent) {
   ProfileOptions options;
   auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
-  TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
+  ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
   auto allocator =
       std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
           stream_exec_);
-  TF_ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer output,
-                          CreateTestBuffer(allocator.get(), stream_exec_,
-                                           stream.get(), /*value=*/1));
-  TF_ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer reference,
-                          CreateTestBuffer(allocator.get(), stream_exec_,
-                                           stream.get(), /*value=*/2));
+  ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer output,
+                       CreateTestBuffer(allocator.get(), stream_exec_,
+                                        stream.get(), /*value=*/1));
+  ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer reference,
+                       CreateTestBuffer(allocator.get(), stream_exec_,
+                                        stream.get(), /*value=*/2));
   EXPECT_THAT(profiler->CheckOutputBuffer(output, reference, /*rtol=*/0.0),
               StatusIs(absl::StatusCode::kInternal));
 }
@@ -245,15 +243,15 @@ TEST_F(GpuProfilerTest, CheckOutputBufferWithTupleShapeAreSame) {
   ProfileOptions options;
   auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
 
-  TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
+  ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
   auto allocator =
       std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
           stream_exec_);
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       ScopedShapedBuffer output,
       CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
                             /*value1=*/1, /*value2=*/2));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       ScopedShapedBuffer reference,
       CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
                             /*value1=*/1, /*value2=*/2));
@@ -265,19 +263,19 @@ TEST_F(GpuProfilerTest, CheckOutputBufferWithTupleShapeAreDifferent) {
   ProfileOptions options;
   auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
 
-  TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
+  ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
   auto allocator =
       std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
           stream_exec_);
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       ScopedShapedBuffer reference,
       CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
                             /*value1=*/1, /*value2=*/2));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       ScopedShapedBuffer output_error_in_first_element,
       CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
                             /*value1=*/0, /*value2=*/2));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       ScopedShapedBuffer output_error_in_second_element,
       CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
                             /*value1=*/1, /*value2=*/3));
@@ -289,8 +287,8 @@ TEST_F(GpuProfilerTest, CheckOutputBufferWithTupleShapeAreDifferent) {
               StatusIs(absl::StatusCode::kInternal));
 }
 
-TEST_F(GpuProfilerTest, CheckScratchBytesArePopulatedUsingBufferAssignment) {
-  constexpr absl::string_view kHloModule = R"(
+TEST_F(GpuProfilerTest, CheckScratchBytesArePopulated) {
+  constexpr absl::string_view kHloModule = R"hlo(
 HloModule gemm_fusion_dot.1, is_scheduled=true, entry_computation_layout={(bf16[32,120,6,512]{3,2,1,0}, f32[3072,512]{1,0})->bf16[3840,512]{1,0}}, frontend_attributes={fingerprint_before_lhs="40f912baf5b53a4f75b1ba9b3442042f"}
 
 %wrapped_convert_computation (param_0: f32[3072,512]) -> bf16[3072,512] {
@@ -307,19 +305,19 @@ ENTRY %entry_computation (transpose.562: bf16[32,120,6,512], Arg_1.2: f32[3072,5
   %custom-call.1 = (bf16[512,3840]{0,1}, s8[26738688]{0}) custom-call(%wrapped_convert, %bitcast.1), custom_call_target="__cublas$lt$matmul", backend_config={"operation_queue_id":"0","gemm_backend_config":{"alpha_real":1,"beta":0,"dot_dimension_numbers":{"lhs_contracting_dimensions":["0"],"rhs_contracting_dimensions":["1"],"lhs_batch_dimensions":[],"rhs_batch_dimensions":[]},"alpha_imag":0,"precision_config":{"operand_precision":["DEFAULT","DEFAULT"],"algorithm":"ALG_UNSET"},"epilogue":"DEFAULT","lhs_stride":"1572864","rhs_stride":"11796480","grad_x":false,"grad_y":false,"damax_output":false},"force_earliest_schedule":false,"reification_cost":[]}
   %get-tuple-element = bf16[512,3840]{0,1} get-tuple-element(%custom-call.1), index=0
   ROOT %bitcast.2 = bf16[3840,512]{1,0} bitcast(%get-tuple-element)
-})";
+})hlo";
   NVPTXCompiler compiler;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kHloModule));
-  TF_ASSERT_OK_AND_ASSIGN(auto gpu_executable,
-                          compiler.RunBackend(std::move(module), stream_exec_,
-                                              GpuCompiler::CompileOptions()));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(kHloModule));
+  ASSERT_OK_AND_ASSIGN(auto gpu_executable,
+                       compiler.RunBackend(std::move(module), stream_exec_,
+                                           GpuCompiler::CompileOptions()));
   auto profiler =
       GpuProfiler::Create(stream_exec_, ProfileOptions(), allocator_.get());
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
-                          profiler->CreateInputBuffers(gpu_executable.get()));
-  TF_ASSERT_OK_AND_ASSIGN(ProfileResult profile,
-                          profiler->Profile(gpu_executable.get(), *buffers));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
+                       profiler->CreateInputBuffers(gpu_executable.get()));
+  ASSERT_OK_AND_ASSIGN(ProfileResult profile,
+                       profiler->Profile(gpu_executable.get(), *buffers));
   EXPECT_EQ(profile.scratch_bytes, 26738688);
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.cc b/third_party/xla/xla/backends/gpu/runtime/thunk.cc
@@ -508,6 +508,13 @@ absl::Status ThunkSequence::WalkNested(Thunk::Walker callback) {
   return absl::OkStatus();
 }
 
+absl::Status ThunkSequence::WalkNested(Thunk::ConstWalker callback) const {
+  for (const auto& thunk : *this) {
+    RETURN_IF_ERROR(thunk->Walk(callback));
+  }
+  return absl::OkStatus();
+}
+
 absl::Status ThunkSequence::TransformNested(Thunk::Transformer callback) {
   for (std::unique_ptr<Thunk>& thunk : *this) {
     RETURN_IF_ERROR(thunk->TransformNested(callback));
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.h b/third_party/xla/xla/backends/gpu/runtime/thunk.h