QuantumKitHub · lkdvos · Jun 8, 2026 · May 13, 2026 · Jun 2, 2026 · Jun 3, 2026
diff --git a/Project.toml b/Project.toml
@@ -9,16 +9,21 @@ StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
 TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
 [weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+cuBLAS = "182d3088-87b7-4494-8cad-fc6afaa545bc"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 
 [extensions]
+StridedcuBLASExt = "cuBLAS"
 StridedGPUArraysExt = "GPUArrays"
+StridedAMDGPUExt = "AMDGPU"
 
 [compat]
 AMDGPU = "2"
 Aqua = "0.8"
 Adapt = "4"
 CUDACore = "6"
+cuBLAS = "6"
 cuRAND = "6"
 GPUArrays = "11.4.1"
 JLArrays = "0.3.1"
@@ -35,6 +40,7 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
+cuBLAS = "182d3088-87b7-4494-8cad-fc6afaa545bc"
 cuRAND = "20fd9a0b-12d5-4c2f-a8af-7c34e9e60431"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
@@ -43,4 +49,4 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Aqua", "AMDGPU", "CUDACore", "cuRAND", "GPUArrays", "JLArrays", "Metal", "Adapt"]
+test = ["Test", "Random", "Aqua", "AMDGPU", "CUDACore", "cuBLAS", "cuRAND", "GPUArrays", "JLArrays", "Metal", "Adapt"]
diff --git a/ext/StridedAMDGPUExt.jl b/ext/StridedAMDGPUExt.jl
@@ -0,0 +1,42 @@
+module StridedAMDGPUExt
+
+using Strided, StridedViews, AMDGPU, AMDGPU.rocBLAS, LinearAlgebra
+import Strided: blas_mul!
+
+# `StridedView` whose parent array is a `ROCArray`.
+const ROCStridedView{T, N, A <: ROCArray{T}} = StridedViews.StridedView{T, N, A}
+
+"""
+    Strided.blas_mul!(C::ROCStridedView, A::ROCStridedView, B::ROCStridedView, α, β)
+
+Compute `C = α * A * B + β * C` through rocBLAS `gemm!` and return `C`.
+
+Every operand is re-wrapped as a dense `ROCMatrix` starting at its data pointer, which is
+only valid for densely packed data. `A` and `B` are expected to satisfy
+`Strided.isblasmatrix`; `C` is checked here and handed to the generic `Strided.__mul!`
+kernel when it is not densely packed.
+"""
+function Strided.blas_mul!(
+        C::ROCStridedView{T, 2}, A::ROCStridedView{T, 2}, B::ROCStridedView{T, 2},
+        α::Number, β::Number
+    ) where {T <: LinearAlgebra.BlasFloat}
+    # The caller only checks `stride(C, 1) == 1`; a padded or column-skipping `C` cannot
+    # be described by the dense wrapper below, so use the generic kernel instead.
+    if stride(C, 1) != 1 || stride(C, 2) != size(C, 1)
+        Strided.__mul!(C, A, B, α, β)
+        return C
+    end
+    A2, CA = Strided.getblasmatrix(A)
+    B2, CB = Strided.getblasmatrix(B)
+    C2 = first(Strided.getblasmatrix(C))
+    # The wrappers alias memory owned by the original views: keep those rooted while in use.
+    GC.@preserve A B C begin
+        A2a = Base.unsafe_wrap(ROCMatrix{T}, pointer(A2), size(A2))
+        B2a = Base.unsafe_wrap(ROCMatrix{T}, pointer(B2), size(B2))
+        C2a = Base.unsafe_wrap(ROCMatrix{T}, pointer(C2), size(C2))
+        AMDGPU.rocBLAS.gemm!(CA, CB, convert(T, α), A2a, B2a, convert(T, β), C2a)
+    end
+    return C
+end
+
+end
diff --git a/ext/StridedCUDACoreExt.jl b/ext/StridedCUDACoreExt.jl
diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
@@ -5,6 +5,8 @@ using GPUArrays: Adapt, KernelAbstractions
 using GPUArrays.KernelAbstractions: @kernel, @index
 using StridedViews: ParentIndex
 
+import Strided: isblasmatrix
+
 ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
 # StridedView backed by any GPU array type, with element type linked to the parent.
@@ -129,4 +131,17 @@ function Strided._mapreduce_block!(
     return nothing
 end
 
+# GPU `blas_mul!` re-wraps the raw pointer as a dense matrix via `unsafe_wrap`, so the
+# data must be densely packed: unit inner stride and outer stride equal to the inner size.
+function Strided.isblasmatrix(A::GPUStridedView{T, 2}) where {T <: LinearAlgebra.BlasFloat}
+    if A.op == identity
+        return stride(A, 1) == 1 && stride(A, 2) == size(A, 1)
+    elseif A.op == conj
+        # handled as the adjoint, whose dimensions (and strides) are swapped
+        return stride(A, 2) == 1 && stride(A, 1) == size(A, 2)
+    else # should never happen
+        return false
+    end
+end
+
 end
diff --git a/ext/StridedcuBLASExt.jl b/ext/StridedcuBLASExt.jl
@@ -0,0 +1,42 @@
+module StridedcuBLASExt
+
+using Strided, StridedViews, cuBLAS, cuBLAS.CUDACore, LinearAlgebra
+import Strided: blas_mul!
+
+# `StridedView` whose parent array is a `CuArray`.
+const CuStridedView{T, N, A <: CuArray{T}} = StridedViews.StridedView{T, N, A}
+
+"""
+    Strided.blas_mul!(C::CuStridedView, A::CuStridedView, B::CuStridedView, α, β)
+
+Compute `C = α * A * B + β * C` through cuBLAS `gemm!` and return `C`.
+
+Every operand is re-wrapped as a dense `CuMatrix` starting at its data pointer, which is
+only valid for densely packed data. `A` and `B` are expected to satisfy
+`Strided.isblasmatrix`; `C` is checked here and handed to the generic `Strided.__mul!`
+kernel when it is not densely packed.
+"""
+function Strided.blas_mul!(
+        C::CuStridedView{T, 2}, A::CuStridedView{T, 2}, B::CuStridedView{T, 2},
+        α::Number, β::Number
+    ) where {T <: LinearAlgebra.BlasFloat}
+    # The caller only checks `stride(C, 1) == 1`; a padded or column-skipping `C` cannot
+    # be described by the dense wrapper below, so use the generic kernel instead.
+    if stride(C, 1) != 1 || stride(C, 2) != size(C, 1)
+        Strided.__mul!(C, A, B, α, β)
+        return C
+    end
+    A2, CA = Strided.getblasmatrix(A)
+    B2, CB = Strided.getblasmatrix(B)
+    C2 = first(Strided.getblasmatrix(C))
+    # The wrappers alias memory owned by the original views: keep those rooted while in use.
+    GC.@preserve A B C begin
+        A2a = Base.unsafe_wrap(CuMatrix{T}, pointer(A2), size(A2))
+        B2a = Base.unsafe_wrap(CuMatrix{T}, pointer(B2), size(B2))
+        C2a = Base.unsafe_wrap(CuMatrix{T}, pointer(C2), size(C2))
+        cuBLAS.gemm!(CA, CB, convert(T, α), A2a, B2a, convert(T, β), C2a)
+    end
+    return C
+end
+
+end
diff --git a/src/linalg.jl b/src/linalg.jl
@@ -100,13 +100,21 @@ function _mul!(
         α::Number, β::Number
     ) where {T <: LinearAlgebra.BlasFloat}
     if stride(C, 1) == 1 && isblasmatrix(A) && isblasmatrix(B)
-        nthreads = use_threaded_mul() ? get_num_threads() : 1
-        _threaded_blas_mul!(C, A, B, α, β, nthreads)
+        return blas_mul!(C, A, B, α, β)
     else
         return __mul!(C, A, B, α, β)
     end
 end
 
+# Default BLAS-backed product: picks the thread count and defers to `_threaded_blas_mul!`.
+function blas_mul!(
+        C::StridedView{T, 2}, A::StridedView{T, 2}, B::StridedView{T, 2},
+        α::Number, β::Number
+    ) where {T <: LinearAlgebra.BlasFloat}
+    threaded = use_threaded_mul()
+    return _threaded_blas_mul!(C, A, B, α, β, threaded ? get_num_threads() : 1)
+end
+
 function _threaded_blas_mul!(
         C::StridedView{T, 2}, A::StridedView{T, 2}, B::StridedView{T, 2},
         α::Number, β::Number,

diff --git a/test/cuda.jl b/test/cuda.jl
diff --git a/test/gpu.jl b/test/gpu.jl
@@ -14,10 +14,21 @@ end
 # types to test for
 ATs = []
 !is_buildkite && push!(ATs, JLArray)
-CUDACore.functional() && push!(ATs, CuArray)
+CUDACore.functional() && cuBLAS.functional() && push!(ATs, CuArray)
 AMDGPU.functional() && push!(ATs, ROCArray)
 Metal.functional() && push!(ATs, MtlArray)
 
+@testset "isblasmatrix ($AT)" for AT in ATs
+    for T in (Float32, ComplexF32)
+        A1 = StridedView(AT(randn(T, 20, 20))) # densely packed parent
+        @test Strided.isblasmatrix(A1)
+        A2 = view(A1, 1:4:20, 1:5:20) # stride(A2, 1) is not 1
+        @test !Strided.isblasmatrix(A2)
+        A3 = view(conj(A1), 1:4:20, 1:20) # lazy conj (no mutation); stride(A3, 2) is not 1
+        @test !Strided.isblasmatrix(A3)
+    end
+end
+
 @testset "in-place matrix operations ($AT)" for AT in ATs
     for T in (Float32, ComplexF32)
         A1 = StridedView(randn(T, 20, 20))
@@ -38,6 +49,38 @@ Metal.functional() && push!(ATs, MtlArray)
     end
 end
 
+@testset "mul! ($AT{$T})" for AT in ATs, T in (Float32, ComplexF32)
+    N = 2
+    α = rand(T)
+    β = rand(T)
+    dims = ntuple(Returns(div(64, N)), N)
+    A1 = permutedims(StridedView(rand(T, dims)), randperm(N))
+    A2 = permutedims(StridedView(rand(T, dims)), randperm(N))
+    A3 = permutedims(StridedView(rand(T, dims)), randperm(N))
+    @test compare((C, A, B) -> mul!(C, A, B, α, β), AT, A1, A2, A3)
+    # 32x32 leading blocks of parents of several shapes, for all op combinations
+    @testset for sz in ((32, 64), (64, 64), (64, 32))
+        vA1 = view(StridedView(rand(T, sz)), 1:32, 1:32)
+        vA2 = view(StridedView(rand(T, sz)), 1:32, 1:32)
+        vA3 = view(StridedView(rand(T, sz)), 1:32, 1:32)
+        @testset for f1 in (identity, conj, adjoint, transpose), f2 in (identity, conj, adjoint, transpose)
+            @test compare((C, A, B) -> mul!(C, A, B, α, β), AT, vA1, f1(vA2), f2(vA3))
+        end
+    end
+    # non-BLAS fallback path: the factors skip every other column (vA2) / row (vA3)
+    vA1 = view(StridedView(rand(T, (32, 32))), 1:32, 1:32)
+    vA2 = view(StridedView(rand(T, (32, 64))), 1:32, 1:2:64)
+    vA3 = view(StridedView(rand(T, (64, 32))), 1:2:64, 1:32)
+    @testset for f1 in (identity, conj, adjoint, transpose), f2 in (identity, conj, adjoint, transpose)
+        @test compare((C, A, B) -> mul!(C, A, B, α, β), AT, vA1, f1(vA2), f2(vA3))
+    end
+    # non-BLAS fallback path: the destination itself skips every other row
+    vA1 = view(StridedView(rand(T, (64, 32))), 1:2:64, 1:32)
+    vA2 = view(StridedView(rand(T, (32, 64))), 1:32, 1:2:64)
+    vA3 = view(StridedView(rand(T, (64, 32))), 1:2:64, 1:32)
+    @test compare((C, A, B) -> mul!(C, A, B, α, β), AT, vA1, vA2, vA3)
+end
+
 @testset "map, scale!, axpy!, axpby! ($AT)" for AT in ATs
     for T in (Float32, ComplexF32)
         for N in 2:6
@@ -69,6 +112,22 @@ end
     end
 end
 
+# `copy!` between views for every shape-compatible pair of lazy ops, incl. empty dimensions
+@testset "copy ($AT)" for AT in ATs
+    for m1 in (0, 16, 32), m2 in (0, 16, 32), T in (Float32, ComplexF32)
+        dims = (m1, m2)
+        A1 = StridedView(rand(T, dims))
+        A2 = StridedView(rand(T, dims))
+        for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
+            # source and destination must share their axes; skip the pairs that do not
+            axes(f1(A1)) == axes(f2(A2)) || continue
+            B1 = f1(copy(A1))
+            B2 = f2(copy(A2))
+            @test compare((x, y) -> copy!(y, x), AT, B1, B2)
+        end
+    end
+end
+
 @testset "broadcasting ($AT)" for AT in ATs
     for T in (Float32, ComplexF32)
         A0 = StridedView(rand(T, ()))

diff --git a/test/jlarrays.jl b/test/jlarrays.jl
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -7,7 +7,7 @@ using Aqua
 using Adapt, GPUArrays
 using JLArrays
 using AMDGPU
-using CUDACore, cuRAND
+using CUDACore, cuRAND, cuBLAS
 using Metal
 
 Random.seed!(1234)