pytorch
diff --git a/‎backends/aoti/slim/core/storage.h‎
Lines changed: 14 additions & 30 deletions b/‎backends/aoti/slim/core/storage.h‎
Lines changed: 14 additions & 30 deletions
diff --git a/‎backends/aoti/slim/core/targets.bzl‎
Lines changed: 1 addition & 0 deletions b/‎backends/aoti/slim/core/targets.bzl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/cuda/runtime/TARGETS‎
Lines changed: 29 additions & 0 deletions b/‎backends/cuda/runtime/TARGETS‎
Lines changed: 29 additions & 0 deletions
@@ -13,6 +13,7 @@
 #ifdef CUDA_AVAILABLE
 #include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
 #include <executorch/backends/aoti/slim/cuda/guard.h>
+#include <executorch/backends/cuda/runtime/cuda_allocator.h>
 #endif
 
 #include <executorch/backends/aoti/slim/c10/core/Device.h>
@@ -107,9 +108,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
   /// @param device The target CUDA device (used to get the stream).
   /// @return Pointer to allocated device memory.
   static void* allocate(size_t nbytes, const c10::Device& device) {
-    // Get the current stream for this device (set by CUDAStreamGuard if any)
-    // This follows PyTorch's pattern where the allocator assumes the caller
-    // has already set the correct device via CUDAStreamGuard.
     auto stream_result =
         executorch::backends::cuda::getCurrentCUDAStream(device.index());
     ET_CHECK_MSG(
@@ -118,31 +116,23 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
         static_cast<int>(device.index()));
 
     cudaStream_t stream = stream_result.get();
-    void* data = nullptr;
-    ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream));
-    return data;
+    auto result = executorch::backends::cuda::CudaAllocator::allocate_async(
+        nbytes, device.index(), stream);
+    ET_CHECK_MSG(
+        result.ok(),
+        "CudaAllocator::allocate_async failed for %zu bytes on device %d",
+        nbytes,
+        static_cast<int>(device.index()));
+    return result.get();
   }
 
-  /// Frees CUDA device memory on the current stream.
-  /// @param ptr Pointer to device memory to free.
   static void free(void* ptr) {
-    // Get the current stream for the current device
-    // Currently all cuda slimtensors should be on the same device same stream,
-    // so we can just use the stream on current device.
-    // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
-    // support multiple devices.
     auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
     ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
-    ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
+    executorch::backends::cuda::CudaAllocator::deallocate_async(
+        ptr, -1, stream_result.get());
   }
 
-  /// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
-  /// @param dst Destination pointer.
-  /// @param src Source pointer.
-  /// @param nbytes Number of bytes to copy.
-  /// @param dst_device Destination device.
-  /// @param src_device Source device.
-  /// @param stream CUDA stream for async copy.
   static void memcpy_async(
       void* dst,
       const void* src,
@@ -151,7 +141,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
       const c10::Device& src_device,
       cudaStream_t stream) {
     cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
-
     if (src_device.is_cpu()) {
       direction = cudaMemcpyHostToDevice;
     } else if (dst_device.is_cpu()) {
@@ -164,23 +153,18 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
           static_cast<int>(dst_device.index()));
     }
 
-    ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
+    auto err = executorch::backends::cuda::CudaAllocator::memcpy_async(
+        dst, src, nbytes, direction, stream);
+    ET_CHECK_MSG(err == executorch::runtime::Error::Ok, "memcpy_async failed");
   }
 
-  /// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
-  /// @param dst Destination pointer.
-  /// @param src Source pointer.
-  /// @param nbytes Number of bytes to copy.
-  /// @param dst_device Destination device.
-  /// @param src_device Source device.
   static void memcpy(
       void* dst,
       const void* src,
       size_t nbytes,
       const c10::Device& dst_device,
       const c10::Device& src_device) {
     cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
-
     if (src_device.is_cpu()) {
       direction = cudaMemcpyHostToDevice;
     } else if (dst_device.is_cpu()) {
 
@@ -19,6 +19,7 @@ def define_common_targets():
             "//executorch/runtime/platform:platform",
             "//executorch/backends/aoti/slim/c10/cuda:exception",
             "//executorch/backends/aoti/slim/cuda:guard",
+            "//executorch/backends/cuda/runtime:cuda_allocator",
         ],
     )
 
 
@@ -74,6 +74,33 @@ runtime.cxx_library(
     ],
 )
 
+runtime.cxx_library(
+    name = "cuda_allocator",
+    srcs = [
+        "cuda_allocator.cpp",
+    ],
+    headers = [
+        "cuda_allocator.h",
+    ],
+    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+    link_whole = True,
+    supports_python_dlopen = True,
+    visibility = ["PUBLIC"],
+    exported_deps = [
+        "//executorch/runtime/core:device_allocator",
+    ],
+    deps = [
+        "//executorch/runtime/platform:platform",
+    ],
+    nvcc_flags = get_nvcc_arch_args() + [
+        "-_NVCC_HOST_COMPILER_FLAG_",
+        "gcc",
+    ],
+    external_deps = [
+        ("cuda", None, "cuda-lazy"),
+    ],
+)
+
 runtime.cxx_library(
     name = "cuda_backend",
     srcs = [
@@ -92,6 +119,8 @@ runtime.cxx_library(
     deps = [
         ":cuda_platform",
         ":runtime_shims",
+        ":cuda_allocator",
+        ":cuda_platform",
         "//executorch/backends/aoti:aoti_common_slim",
         "//executorch/backends/aoti/slim/core:slimtensor",
         "//executorch/backends/aoti/slim/factory:empty",
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ def define_common_targets():`
`19`	`19`	`"//executorch/runtime/platform:platform",`
`20`	`20`	`"//executorch/backends/aoti/slim/c10/cuda:exception",`
`21`	`21`	`"//executorch/backends/aoti/slim/cuda:guard",`
	`22`	`+ "//executorch/backends/cuda/runtime:cuda_allocator",`
`22`	`23`	`],`
`23`	`24`	`)`
`24`	`25`