Skip to content

Commit b810251

Browse files
EusebioDM tensorflower-gardener
authored and committed
Calculate GpuProfiler::GetScratchBytes using BufferUses in Autotuner
- Old logic: Scans physical `BufferAllocation`s and only counts those where `IsPreallocatedTempBuffer()` is true. This misses scratch buffers that the compiler overlays onto live-out (output) allocations to save memory. - New logic: Walks the executed `Thunk` sequence and sums the sizes of all logical `BufferUse::Scratch` slices, accurately capturing scratch usage regardless of physical overlay optimizations. Example (32MB matmul scratch overlaid on a 157MB live-out output buffer): - Old logic: Returns 0 bytes (skips the live-out allocation). - New logic: Returns 32MB (correctly extracts the scratch thunk use). Also had to add a const way to walk Thunks. #### Why this is needed The current logic relies on the `BufferAssignment` being alive, since the `buffer` variable, which holds an `HloValue`, is owned by the `BufferAssignment` class. I'm in the process of removing the `BufferAssignment` from the executable since it cannot be re-created when loading an AOT binary, and it's not really needed. So we need to get rid of this implicit dependency before doing so. PiperOrigin-RevId: 939785631
1 parent a5b68ee commit b810251

5 files changed

Lines changed: 102 additions & 83 deletions

File tree

third_party/xla/xla/backends/gpu/autotuner/BUILD

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -400,23 +400,25 @@ cc_library(
400400
"//xla:xla_data_proto_cc",
401401
"//xla/backends/autotuner:profiler",
402402
"//xla/backends/gpu/runtime:buffer_comparator",
403+
"//xla/backends/gpu/runtime:thunk",
404+
"//xla/backends/gpu/runtime:thunk_executor",
403405
"//xla/hlo/ir:hlo",
406+
"//xla/runtime:buffer_use",
407+
"//xla/service:buffer_assignment",
404408
"//xla/service:executable",
405409
"//xla/service:maybe_owning_device_address",
406410
"//xla/service:shaped_buffer",
407411
"//xla/service/gpu:backend_configs_cc",
412+
"//xla/service/gpu:gpu_executable",
408413
"//xla/service/gpu:gpu_executable_run_options",
409-
"//xla/service/gpu:matmul_utils",
410414
"//xla/service/gpu:stream_executor_util",
411415
"//xla/service/gpu/autotuning:redzone_buffers",
412416
"//xla/stream_executor:device_address",
413417
"//xla/stream_executor:device_address_allocator",
414418
"//xla/stream_executor:stream_executor_address_allocator",
415419
"//xla/stream_executor:stream_executor_h",
416420
"//xla/stream_executor/gpu:redzone_allocator",
417-
"//xla/tsl/platform:errors",
418421
"//xla/tsl/platform:status_macros",
419-
"//xla/tsl/platform:statusor",
420422
"@com_google_absl//absl/base",
421423
"@com_google_absl//absl/log",
422424
"@com_google_absl//absl/log:check",
@@ -427,7 +429,6 @@ cc_library(
427429
"@com_google_absl//absl/synchronization",
428430
"@com_google_absl//absl/time",
429431
"@com_google_absl//absl/types:span",
430-
"@tsl//tsl/platform:casts",
431432
],
432433
)
433434

@@ -708,12 +709,10 @@ xla_test(
708709
"//xla/service/gpu:nvptx_compiler_impl",
709710
"//xla/stream_executor:device_address_allocator",
710711
"//xla/stream_executor:platform",
712+
"//xla/stream_executor:stream_executor_address_allocator",
711713
"//xla/stream_executor:stream_executor_h",
712-
"//xla/stream_executor:stream_executor_memory_allocator",
713714
"//xla/tsl/lib/core:status_test_util",
714-
"//xla/tsl/platform:errors",
715715
"//xla/tsl/platform:status_macros",
716-
"//xla/tsl/platform:statusor",
717716
"@com_google_absl//absl/log",
718717
"@com_google_absl//absl/status",
719718
"@com_google_absl//absl/status:status_matchers",

third_party/xla/xla/backends/gpu/autotuner/gpu_profiler.cc

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -34,16 +34,18 @@ limitations under the License.
3434
#include "xla/tsl/platform/status_macros.h"
3535
#include "xla/backends/autotuner/profiler.h"
3636
#include "xla/backends/gpu/runtime/buffer_comparator.h"
37+
#include "xla/backends/gpu/runtime/thunk.h"
38+
#include "xla/backends/gpu/runtime/thunk_executor.h"
3739
#include "xla/executable_run_options.h"
38-
#include "xla/hlo/ir/hlo_computation.h"
3940
#include "xla/hlo/ir/hlo_instruction.h"
40-
#include "xla/hlo/ir/hlo_module.h"
4141
#include "xla/hlo/ir/hlo_opcode.h"
42+
#include "xla/runtime/buffer_use.h"
43+
#include "xla/service/buffer_assignment.h"
4244
#include "xla/service/executable.h"
4345
#include "xla/service/gpu/autotuning/redzone_buffers.h"
4446
#include "xla/service/gpu/backend_configs.pb.h"
47+
#include "xla/service/gpu/gpu_executable.h"
4548
#include "xla/service/gpu/gpu_executable_run_options.h"
46-
#include "xla/service/gpu/matmul_utils.h"
4749
#include "xla/service/gpu/stream_executor_util.h"
4850
#include "xla/service/maybe_owning_device_address.h"
4951
#include "xla/service/service_executable_run_options.h"
@@ -55,10 +57,7 @@ limitations under the License.
5557
#include "xla/stream_executor/gpu/redzone_allocator.h"
5658
#include "xla/stream_executor/stream_executor.h"
5759
#include "xla/stream_executor/stream_executor_address_allocator.h"
58-
#include "xla/tsl/platform/errors.h"
59-
#include "xla/tsl/platform/statusor.h"
6060
#include "xla/xla_data.pb.h"
61-
#include "tsl/platform/casts.h"
6261

6362
namespace xla {
6463

@@ -81,23 +80,24 @@ std::vector<ExecutionInput> CreateExecutionInputsFromBuffers(
8180
return inputs;
8281
}
8382

84-
int GetScratchBytes(const Executable* executable) {
85-
int scratch_bytes = 0;
86-
for (const auto* allocation : executable->GetAllocations()) {
87-
if (allocation->IsPreallocatedTempBuffer()) {
88-
for (const auto& [buffer, offset] : allocation->assigned_buffers()) {
89-
// Scratch space is allocated as the second element in the output tuple
90-
// of the instruction.
91-
const auto& shape_index = buffer->positions().front().index;
92-
bool is_second_element_in_output_tuple =
93-
!shape_index.empty() && shape_index[0] == 1;
94-
if (is_second_element_in_output_tuple) {
95-
scratch_bytes += offset.size;
83+
int GetScratchBytes(const GpuExecutable& executable) {
84+
int32_t scratch_bytes = 0;
85+
CHECK_OK(executable.thunk_executor().thunks().WalkNested(
86+
[&scratch_bytes](const Thunk* thunk) {
87+
std::vector<BufferAllocation::Slice> scratch_slices;
88+
for (const auto& buffer_use : thunk->buffer_uses()) {
89+
// ContentValidity::kUndefined means the buffer is a scratch buffer.
90+
if (buffer_use.content_validity() ==
91+
BufferUse::ContentValidity::kUndefined) {
92+
// TODO(b/517426568): De-duplicate overlapping slices.
93+
scratch_bytes += buffer_use.slice().size();
94+
}
9695
}
97-
}
98-
}
99-
}
100-
return scratch_bytes;
96+
97+
return absl::OkStatus();
98+
}));
99+
100+
return static_cast<int>(scratch_bytes);
101101
}
102102

103103
// Initialize a specific input buffer with custom values.
@@ -267,8 +267,12 @@ absl::StatusOr<ProfileResult> GpuProfiler::Profile(
267267
const GpuInputBuffers& gpu_buffers =
268268
absl::down_cast<const GpuInputBuffers&>(buffers);
269269
const RedzoneBuffers& rz_buffers = gpu_buffers.redzone_buffers;
270+
270271
ProfileResult result;
271-
result.scratch_bytes = GetScratchBytes(executable);
272+
if (auto* gpu_executable = dynamic_cast<const GpuExecutable*>(executable);
273+
gpu_executable != nullptr) {
274+
result.scratch_bytes = GetScratchBytes(*gpu_executable);
275+
}
272276
{
273277
// Warm up run.
274278
std::vector<ExecutionInput> execution_inputs =

third_party/xla/xla/backends/gpu/autotuner/gpu_profiler_test.cc

Lines changed: 48 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,8 @@ limitations under the License.
4747
#include "xla/stream_executor/device_address_allocator.h"
4848
#include "xla/stream_executor/platform.h"
4949
#include "xla/stream_executor/stream_executor.h"
50-
#include "xla/stream_executor/stream_executor_memory_allocator.h"
50+
#include "xla/stream_executor/stream_executor_address_allocator.h"
5151
#include "xla/tsl/lib/core/status_test_util.h"
52-
#include "xla/tsl/platform/errors.h"
53-
#include "xla/tsl/platform/statusor.h"
5452
#include "xla/xla_data.pb.h"
5553

5654
namespace xla {
@@ -145,15 +143,15 @@ TEST_F(GpuProfilerTest, CreateInputBuffersAndProfile) {
145143
ROOT c = s32[] constant(1)
146144
}
147145
)";
148-
TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
149-
ParseAndReturnVerifiedModule(kHloModule));
146+
ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
147+
ParseAndReturnVerifiedModule(kHloModule));
150148
MockExecutable mock_executable(module, 1000);
151149
auto profiler =
152150
GpuProfiler::Create(stream_exec_, ProfileOptions(), allocator_.get());
153-
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
154-
profiler->CreateInputBuffers(&mock_executable));
155-
TF_ASSERT_OK_AND_ASSIGN(ProfileResult profile,
156-
profiler->Profile(&mock_executable, *buffers));
151+
ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
152+
profiler->CreateInputBuffers(&mock_executable));
153+
ASSERT_OK_AND_ASSIGN(ProfileResult profile,
154+
profiler->Profile(&mock_executable, *buffers));
157155
EXPECT_EQ(profile.duration, absl::Nanoseconds(1000));
158156
EXPECT_EQ(profile.output_buffer->on_device_shape(),
159157
ShapeUtil::MakeShape(S32, {}));
@@ -167,15 +165,15 @@ TEST_F(GpuProfilerTest, FailingExecutablesReturnStatus) {
167165
ROOT c = s32[] constant(1)
168166
}
169167
)";
170-
TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
171-
ParseAndReturnVerifiedModule(kHloModule));
168+
ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
169+
ParseAndReturnVerifiedModule(kHloModule));
172170
MockExecutable mock_executable(module, /*duration_ns=*/0,
173171
/*should_fail=*/true);
174172

175173
auto profiler =
176174
GpuProfiler::Create(stream_exec_, ProfileOptions(), allocator_.get());
177-
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
178-
profiler->CreateInputBuffers(&mock_executable));
175+
ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
176+
profiler->CreateInputBuffers(&mock_executable));
179177
EXPECT_THAT(profiler->Profile(&mock_executable, *buffers),
180178
StatusIs(absl::StatusCode::kInternal));
181179
}
@@ -191,14 +189,14 @@ TEST_P(GpuProfilerTestWithRedzonePadding, CheckInputBuffers) {
191189
ROOT c = s32[] constant(1)
192190
}
193191
)";
194-
TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
195-
ParseAndReturnVerifiedModule(kHloModule));
192+
ASSERT_OK_AND_ASSIGN(std::shared_ptr<HloModule> module,
193+
ParseAndReturnVerifiedModule(kHloModule));
196194
MockExecutable mock_executable(module, 1000);
197195
ProfileOptions options;
198196
options.redzone_padding_bytes = GetParam();
199197
auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
200-
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
201-
profiler->CreateInputBuffers(&mock_executable));
198+
ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
199+
profiler->CreateInputBuffers(&mock_executable));
202200
TF_EXPECT_OK(profiler->CheckInputBuffers(*buffers));
203201
}
204202

@@ -210,33 +208,33 @@ TEST_F(GpuProfilerTest, CheckOutputBufferWhenBuffersAreSame) {
210208
ProfileOptions options;
211209
auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
212210

213-
TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
211+
ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
214212
auto allocator =
215213
std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
216214
stream_exec_);
217-
TF_ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer output,
218-
CreateTestBuffer(allocator.get(), stream_exec_,
219-
stream.get(), /*value=*/1));
220-
TF_ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer reference,
221-
CreateTestBuffer(allocator.get(), stream_exec_,
222-
stream.get(), /*value=*/1));
215+
ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer output,
216+
CreateTestBuffer(allocator.get(), stream_exec_,
217+
stream.get(), /*value=*/1));
218+
ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer reference,
219+
CreateTestBuffer(allocator.get(), stream_exec_,
220+
stream.get(), /*value=*/1));
223221
EXPECT_THAT(profiler->CheckOutputBuffer(output, reference, /*rtol=*/0.0),
224222
StatusIs(absl::StatusCode::kOk));
225223
}
226224

227225
TEST_F(GpuProfilerTest, CheckOutputBufferWhenBuffersAreDifferent) {
228226
ProfileOptions options;
229227
auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
230-
TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
228+
ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
231229
auto allocator =
232230
std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
233231
stream_exec_);
234-
TF_ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer output,
235-
CreateTestBuffer(allocator.get(), stream_exec_,
236-
stream.get(), /*value=*/1));
237-
TF_ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer reference,
238-
CreateTestBuffer(allocator.get(), stream_exec_,
239-
stream.get(), /*value=*/2));
232+
ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer output,
233+
CreateTestBuffer(allocator.get(), stream_exec_,
234+
stream.get(), /*value=*/1));
235+
ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer reference,
236+
CreateTestBuffer(allocator.get(), stream_exec_,
237+
stream.get(), /*value=*/2));
240238
EXPECT_THAT(profiler->CheckOutputBuffer(output, reference, /*rtol=*/0.0),
241239
StatusIs(absl::StatusCode::kInternal));
242240
}
@@ -245,15 +243,15 @@ TEST_F(GpuProfilerTest, CheckOutputBufferWithTupleShapeAreSame) {
245243
ProfileOptions options;
246244
auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
247245

248-
TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
246+
ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
249247
auto allocator =
250248
std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
251249
stream_exec_);
252-
TF_ASSERT_OK_AND_ASSIGN(
250+
ASSERT_OK_AND_ASSIGN(
253251
ScopedShapedBuffer output,
254252
CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
255253
/*value1=*/1, /*value2=*/2));
256-
TF_ASSERT_OK_AND_ASSIGN(
254+
ASSERT_OK_AND_ASSIGN(
257255
ScopedShapedBuffer reference,
258256
CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
259257
/*value1=*/1, /*value2=*/2));
@@ -265,19 +263,19 @@ TEST_F(GpuProfilerTest, CheckOutputBufferWithTupleShapeAreDifferent) {
265263
ProfileOptions options;
266264
auto profiler = GpuProfiler::Create(stream_exec_, options, allocator_.get());
267265

268-
TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
266+
ASSERT_OK_AND_ASSIGN(auto stream, stream_exec_->CreateStream());
269267
auto allocator =
270268
std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
271269
stream_exec_);
272-
TF_ASSERT_OK_AND_ASSIGN(
270+
ASSERT_OK_AND_ASSIGN(
273271
ScopedShapedBuffer reference,
274272
CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
275273
/*value1=*/1, /*value2=*/2));
276-
TF_ASSERT_OK_AND_ASSIGN(
274+
ASSERT_OK_AND_ASSIGN(
277275
ScopedShapedBuffer output_error_in_first_element,
278276
CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
279277
/*value1=*/0, /*value2=*/2));
280-
TF_ASSERT_OK_AND_ASSIGN(
278+
ASSERT_OK_AND_ASSIGN(
281279
ScopedShapedBuffer output_error_in_second_element,
282280
CreateTupleTestBuffer(allocator.get(), stream_exec_, stream.get(),
283281
/*value1=*/1, /*value2=*/3));
@@ -289,8 +287,8 @@ TEST_F(GpuProfilerTest, CheckOutputBufferWithTupleShapeAreDifferent) {
289287
StatusIs(absl::StatusCode::kInternal));
290288
}
291289

292-
TEST_F(GpuProfilerTest, CheckScratchBytesArePopulatedUsingBufferAssignment) {
293-
constexpr absl::string_view kHloModule = R"(
290+
TEST_F(GpuProfilerTest, CheckScratchBytesArePopulated) {
291+
constexpr absl::string_view kHloModule = R"hlo(
294292
HloModule gemm_fusion_dot.1, is_scheduled=true, entry_computation_layout={(bf16[32,120,6,512]{3,2,1,0}, f32[3072,512]{1,0})->bf16[3840,512]{1,0}}, frontend_attributes={fingerprint_before_lhs="40f912baf5b53a4f75b1ba9b3442042f"}
295293
296294
%wrapped_convert_computation (param_0: f32[3072,512]) -> bf16[3072,512] {
@@ -307,19 +305,19 @@ ENTRY %entry_computation (transpose.562: bf16[32,120,6,512], Arg_1.2: f32[3072,5
307305
%custom-call.1 = (bf16[512,3840]{0,1}, s8[26738688]{0}) custom-call(%wrapped_convert, %bitcast.1), custom_call_target="__cublas$lt$matmul", backend_config={"operation_queue_id":"0","gemm_backend_config":{"alpha_real":1,"beta":0,"dot_dimension_numbers":{"lhs_contracting_dimensions":["0"],"rhs_contracting_dimensions":["1"],"lhs_batch_dimensions":[],"rhs_batch_dimensions":[]},"alpha_imag":0,"precision_config":{"operand_precision":["DEFAULT","DEFAULT"],"algorithm":"ALG_UNSET"},"epilogue":"DEFAULT","lhs_stride":"1572864","rhs_stride":"11796480","grad_x":false,"grad_y":false,"damax_output":false},"force_earliest_schedule":false,"reification_cost":[]}
308306
%get-tuple-element = bf16[512,3840]{0,1} get-tuple-element(%custom-call.1), index=0
309307
ROOT %bitcast.2 = bf16[3840,512]{1,0} bitcast(%get-tuple-element)
310-
})";
308+
})hlo";
311309
NVPTXCompiler compiler;
312-
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
313-
ParseAndReturnVerifiedModule(kHloModule));
314-
TF_ASSERT_OK_AND_ASSIGN(auto gpu_executable,
315-
compiler.RunBackend(std::move(module), stream_exec_,
316-
GpuCompiler::CompileOptions()));
310+
ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
311+
ParseAndReturnVerifiedModule(kHloModule));
312+
ASSERT_OK_AND_ASSIGN(auto gpu_executable,
313+
compiler.RunBackend(std::move(module), stream_exec_,
314+
GpuCompiler::CompileOptions()));
317315
auto profiler =
318316
GpuProfiler::Create(stream_exec_, ProfileOptions(), allocator_.get());
319-
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
320-
profiler->CreateInputBuffers(gpu_executable.get()));
321-
TF_ASSERT_OK_AND_ASSIGN(ProfileResult profile,
322-
profiler->Profile(gpu_executable.get(), *buffers));
317+
ASSERT_OK_AND_ASSIGN(std::unique_ptr<InputBuffers> buffers,
318+
profiler->CreateInputBuffers(gpu_executable.get()));
319+
ASSERT_OK_AND_ASSIGN(ProfileResult profile,
320+
profiler->Profile(gpu_executable.get(), *buffers));
323321
EXPECT_EQ(profile.scratch_bytes, 26738688);
324322
}
325323

third_party/xla/xla/backends/gpu/runtime/thunk.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,13 @@ absl::Status ThunkSequence::WalkNested(Thunk::Walker callback) {
508508
return absl::OkStatus();
509509
}
510510

511+
absl::Status ThunkSequence::WalkNested(Thunk::ConstWalker callback) const {
512+
for (const auto& thunk : *this) {
513+
RETURN_IF_ERROR(thunk->Walk(callback));
514+
}
515+
return absl::OkStatus();
516+
}
517+
511518
absl::Status ThunkSequence::TransformNested(Thunk::Transformer callback) {
512519
for (std::unique_ptr<Thunk>& thunk : *this) {
513520
RETURN_IF_ERROR(thunk->TransformNested(callback));

0 commit comments

Comments
 (0)