PR tensorflow#43897: [ROCm] Don't use async deallocation in MIOpen autotuner backend

draganmladjenovic · tensorflower-gardener · commit bb81f08af80d · 2026-06-08T13:57:46.000-07:00
Imported from GitHub PR openxla/xla#43897 📝 Summary of Changes Introduce a file-local OwningScratchAllocator implementation that does deallocation on destruction. 🎯 Justification The shared OwningScratchAllocator implementation moved to an async deallocation model, which doesn't work for the MIOpen backend. 🚀 Kind of Contribution 🐛 Bug Fix Copybara import of the project: -- 945b5c2767fc40f51f9c045ba8766da3d728785c by Dragan Mladjenovic <Dragan.Mladjenovic@amd.com>: [ROCm] Don't use async deallocation in MIOpen autotuner backend Merging this change closes tensorflow#43897 PiperOrigin-RevId: 928753564
diff --git a/third_party/xla/xla/backends/gpu/autotuner/miopen.cc b/third_party/xla/xla/backends/gpu/autotuner/miopen.cc
@@ -64,6 +64,32 @@ using MIOpenBackendConfig = stream_executor::dnn::AlgorithmProto;
 
 namespace {
 
+struct OwningScratchAllocator : public se::ScratchAllocator {  // File-local; keeps every buffer it allocates in buffers_.
+  OwningScratchAllocator(int device_ordinal,
+                         se::DeviceAddressAllocator* allocator)
+      : device_ordinal_(device_ordinal), allocator_(allocator) {}
+  // NOTE(review): -1 presumably means "no limit" -- confirm against se::ScratchAllocator.
+  int64_t GetMemoryLimitInBytes() override { return -1; }
+  // Allocates byte_size bytes on device_ordinal_; returns InvalidArgument for a negative size.
+  absl::StatusOr<se::DeviceAddress<uint8_t>> AllocateBytes(
+      int64_t byte_size) override {
+    if (byte_size < 0) {
+      return absl::InvalidArgumentError(
+          absl::StrCat("byte_size must be non-negative, but got ", byte_size));
+    }
+    ASSIGN_OR_RETURN(se::ScopedDeviceAddress<uint8_t> buffer,
+                     allocator_->Allocate(device_ordinal_, byte_size,
+                                          /*retry_on_failure=*/false));
+    buffers_.push_back(std::move(buffer));  // Store the scoped handle so the buffer lives as long as this object.
+    return *buffers_.back();  // Hand back the address held by the handle just stored.
+  }
+  // Per the commit description, deallocation happens when this object is destroyed.
+ private:
+  int device_ordinal_;
+  se::DeviceAddressAllocator* allocator_;  // Stored as passed in; NOTE(review): presumably must outlive this object -- confirm.
+  absl::InlinedVector<se::ScopedDeviceAddress<uint8_t>, 4> buffers_;
+};
+
 bool IsCustomCallToDnnFusedConvolution(const HloInstruction& hlo) {
   if (hlo.opcode() != HloOpcode::kCustomCall) {
     return false;
@@ -287,8 +313,8 @@ GetConvolutionCustomCallConfigs(const HloCustomCallInstruction* instr,
                                          allow_tf32,
                                          /*require_command_buffer=*/false};
 
-  se::OwningScratchAllocator<4> scratch_allocator(
-      stream_executor->device_ordinal(), allocator);
+  OwningScratchAllocator scratch_allocator(stream_executor->device_ordinal(),
+                                           allocator);
 
   const auto initialize_buffer = [stream](se::DeviceAddressBase buffer) {
     // Although we don't have evidence this matters, zero out the buffers