JuliaGPU · maleadt · Jun 8, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
@@ -54,7 +54,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.18"
+GPUCompiler = "1.19"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
@@ -16,8 +16,8 @@ const highest = v"999"
 #     exact CC; code compiled for sm_103a runs only on CC 10.3 devices.
 #
 # Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX
-# require for them, is encoded directly in the keys of `ptx_sm_db` and `llvm_sm_db`
-# below: an unsupported combination simply has no entry.
+# require for them, is encoded directly in the keys of `ptx_sm_db` below (and the equivalent
+# database in GPUCompiler): an unsupported combination simply has no entry.
 
 
 ## version range
@@ -229,6 +229,7 @@ const llvm_sm_db = Dict{SMVersion, VersionRange}(
     sm"80"   => between(v"11", highest),
     sm"86"   => between(v"13", highest),
     sm"87"   => between(v"16", highest),
+    sm"88"   => between(v"21", highest),
     sm"89"   => between(v"16", highest),
     sm"90"   => between(v"16", highest),
     sm"90a"  => between(v"18", highest),
@@ -241,6 +242,9 @@ const llvm_sm_db = Dict{SMVersion, VersionRange}(
     sm"103"  => between(v"21", highest),
     sm"103a" => between(v"21", highest),
     sm"103f" => between(v"21", highest),
+    sm"110"  => between(v"22", highest),
+    sm"110a" => between(v"22", highest),
+    sm"110f" => between(v"22", highest),
     sm"120"  => between(v"20", highest),
     sm"120a" => between(v"20", highest),
     sm"120f" => between(v"21", highest),
@@ -314,9 +318,15 @@ end
 
 ## high-level functions that return target and isa support
 
-function llvm_compat(version=LLVM.version())
-    LLVM.InitializeNVPTXTarget()
+# the LLVM version of the external NVPTX back-end used for machine-code generation,
+# as opposed to `LLVM.version()`, which identifies the in-process LLVM that only
+# handles the middle end (the JLL is versioned after the LLVM release it provides).
+const nvptx_llvm_version = pkgversion(NVPTX_LLVM_Backend_jll)
 
+# by default, return the capabilities of the external back-end, which is typically much
+# newer than the in-process LLVM (which only drives the middle end, and is not
+# configured for any particular device).
+function llvm_compat(version=nvptx_llvm_version)
     # `.sm` is `Set{SMVersion}` (with variants); `.ptx` is `Set{VersionNumber}`.
     # `ptxas_compat()` returns `.cap` as `Set{VersionNumber}` because ptxas-level
     # support is per-CC -- the names track the value type.
@@ -328,3 +338,4 @@ function ptxas_compat(version=compiler_version())
     return (cap=ptxas_cap_support(version),
             ptx=ptxas_ptx_support(version))
 end
+
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
@@ -210,7 +210,7 @@ end
     if ptx !== nothing
         # explicit request: take it exactly, validating against the toolchain
         ptx in llvm_support.ptx ||
-            error("Requested PTX ISA $ptx is not supported by LLVM $(LLVM.version())")
+            error("Requested PTX ISA $ptx is not supported by LLVM $(nvptx_llvm_version)")
         ptx in ptxas_support.ptx ||
             error("Requested PTX ISA $ptx is not supported by ptxas $(compiler_version())")
         llvm_ptx = ptxas_ptx = ptx
@@ -220,7 +220,7 @@ end
         llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx)
         ptxas_ptxs = filter(>=(requested_ptx), ptxas_support.ptx)
         isempty(llvm_ptxs) &&
-            error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(LLVM.version())")
+            error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(nvptx_llvm_version)")
         isempty(ptxas_ptxs) &&
             error("CUDA.jl requires PTX $requested_ptx, which is not supported by ptxas $(compiler_version())")
         ptxas_ptx = maximum(ptxas_ptxs)
@@ -260,7 +260,7 @@ end
             sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm)
         end
         isempty(baseline_candidates) &&
-            error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(LLVM.version())")
+            error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(nvptx_llvm_version)")
         llvm_sm = argmax(sm_key, baseline_candidates)
     end
 
@@ -272,6 +272,45 @@ end
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
+# does the host-side layout of an argument type match the device-side one?
+#
+# the back-end unconditionally aligns 128-bit integers to 16 bytes, whereas Julia only
+# started doing so in 1.12, so aggregates with (U)Int128 fields may lay out differently.
+# returns the device-side (size, alignment) of `T`, `:opaque` for types whose layout is
+# defined by Julia on both sides (e.g. unions), or `:mismatch`.
+function device_layout(@nospecialize(T))
+    # leaf cases: 128-bit integers, non-`DataType`s and field-less types
+    (T === Int128 || T === UInt128) && return (16, 16)
+    T isa DataType || return :opaque
+    nf = fieldcount(T)
+    nf == 0 && return (sizeof(T), Base.datatype_alignment(T))
+
+    cursor = 0          # device-side offset at which the next field would start
+    widest = 1          # largest field alignment encountered so far
+    trackable = true    # cleared once an opaque field makes offsets unknowable
+    for idx in 1:nf
+        nested = device_layout(fieldtype(T, idx))
+        nested === :mismatch && return :mismatch
+        # past an opaque field we only keep verifying the nested layouts
+        trackable &= nested !== :opaque
+        trackable || continue
+        nested_size, nested_align = nested
+        cursor = nested_align * cld(cursor, nested_align)
+        cursor == fieldoffset(T, idx) || return :mismatch
+        cursor += nested_size
+        widest = max(widest, nested_align)
+    end
+    trackable || return :opaque
+
+    # round up to the aggregate's alignment and compare against the host-side size
+    total = widest * cld(cursor, widest)
+    total == sizeof(T) || return :mismatch
+    return (total, widest)
+end
+device_compatible_layout(@nospecialize(T)) =
+    # a host that aligns Int128 to 16 bytes (Julia 1.12+) cannot differ from the device
+    Base.datatype_alignment(Int128) == 16 || device_layout(T) !== :mismatch
+
 # compile to executable machine code
 function compile(@nospecialize(job::CompilerJob))
     # lower to PTX
@@ -282,7 +321,9 @@ function compile(@nospecialize(job::CompilerJob))
 
     # check if we'll need the device runtime
     undefined_fs = filter(collect(functions(meta.ir))) do f
-        isdeclaration(f) && !LLVM.isintrinsic(f)
+        isdeclaration(f) && !LLVM.isintrinsic(f) &&
+        # intrinsics unknown to the in-process LLVM are still lowered by the back-end
+        !startswith(LLVM.name(f), "llvm.")
     end
     intrinsic_fns = ["vprintf", "malloc", "free", "__assertfail",
                      "__nvvm_reflect" #= TODO: should have been optimized away =#]
@@ -312,6 +353,12 @@ function compile(@nospecialize(job::CompilerJob))
     argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
         !isghosttype(dt) && !Core.Compiler.isconstType(dt)
     end
+    for dt in argtypes
+        if !device_compatible_layout(dt)
+            error("""Kernel argument of type $dt contains Int128 fields whose layout differs between this version of Julia and the device.
+                     Use Julia 1.12 or later, where 128-bit integers are aligned to 16 bytes, matching the device.""")
+        end
+    end
     param_usage = sum(aligned_sizeof, argtypes)
     param_limit = 4096
     if cap >= v"7.0" && ptx_param >= v"8.1"

diff --git a/CUDACore/src/device/intrinsics/atomics.jl b/CUDACore/src/device/intrinsics/atomics.jl
@@ -85,7 +85,7 @@ for T in (Int32, Int64, UInt32, UInt64)
     end
 end
 
-for T in (:Float32, :Float64)
+for T in (:Float16, :Float32, :Float64)
     ops = [:add]
 
     for op in ops
@@ -107,6 +107,19 @@ for T in (:Float32, :Float64)
         atomic_add!(ptr, -val)
 end
 
+# BFloat16 requires Julia 1.11 for bfloat codegen support. On older Julia versions, and on
+# older devices (where the back-end expands the operation), the CAS fallback is used.
+@static if VERSION >= v"1.11"
+    @eval @inline atomic_add!(ptr::Union{LLVMPtr{BFloat16,AS.Generic},
+                                         LLVMPtr{BFloat16,AS.Global},
+                                         LLVMPtr{BFloat16,AS.Shared}}, val::BFloat16) =
+        llvm_atomic_op($(Val(binops[:fadd])), ptr, val)
+    @eval @inline atomic_sub!(ptr::Union{LLVMPtr{BFloat16,AS.Generic},
+                                         LLVMPtr{BFloat16,AS.Global},
+                                         LLVMPtr{BFloat16,AS.Shared}}, val::BFloat16) =
+        atomic_add!(ptr, -val)
+end
+
 @generated function llvm_atomic_cas(ptr::LLVMPtr{T,A}, cmp::T, val::T) where {T, A}
     @dispose ctx=Context() begin
         T_val = convert(LLVMType, T)
@@ -178,25 +191,6 @@ for A in (AS.Generic, AS.Global, AS.Shared)
 end
 
 
-## PTX
-
-# half-precision atomics using PTX instruction
-
-for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
-    if A == AS.Global
-        scope = ".global"
-    elseif A == AS.Shared
-        scope = ".shared"
-    else
-        scope = ""
-    end
-
-    intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
-    @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
-        @asmcall($intr, "=h,l,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
-end
-
-
 ## Julia
 
 # floating-point CAS via bitcasting
@@ -465,6 +459,27 @@ end
     end
 end
 
+# `+` on Float16 elements: devices of compute capability 7.0 and up go through
+# `atomic_add!`; anything older takes the compare-and-swap based `atomic_op!`, just like
+# the generic fallback.
+@inline function atomic_arrayset(A::AbstractArray{Float16}, I::Integer, op::typeof(+),
+                                 val::Float16)
+    addr = pointer(A, I)
+    use_add = compute_capability() >= sv"7.0"
+    return use_add ? atomic_add!(addr, val) : atomic_op!(addr, op, val)
+end
+@static if VERSION >= v"1.11"
+    # `+` on BFloat16 elements: devices of compute capability 9.0 and up go through
+    # `atomic_add!`; anything older takes the compare-and-swap based `atomic_op!`.
+    # (this method only exists on Julia 1.11 and later, per the enclosing version check)
+    @inline function atomic_arrayset(A::AbstractArray{BFloat16}, I::Integer,
+                                     op::typeof(+), val::BFloat16)
+        addr = pointer(A, I)
+        use_add = compute_capability() >= sv"9.0"
+        return use_add ? atomic_add!(addr, val) : atomic_op!(addr, op, val)
+    end
+end
+
 # fallback using compare-and-swap
 @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} =
     atomic_op!(pointer(A, I), op, val)
diff --git a/CUDACore/src/device/intrinsics/indexing.jl b/CUDACore/src/device/intrinsics/indexing.jl
@@ -210,7 +210,13 @@ end
 Returns a 32-bit mask indicating which threads in a warp are active with the current
 executing thread.
 """ active_mask
-@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
+@static if LLVM.version() < v"20"
+# no `llvm.nvvm.activemask` intrinsic yet: use inline assembly, marked side-effecting to
+# prevent hoisting or merging across divergent control flow (the intrinsic is convergent).
+@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", true, UInt32, Tuple{})
+else
+@inline active_mask() = ccall("llvm.nvvm.activemask", llvmcall, UInt32, ())
+end
 
 end
 

diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
@@ -207,7 +207,7 @@ end
 end
 @device_override FastMath.exp2_fast(x::Float64) = exp2(x)
 @device_override FastMath.exp2_fast(x::Float32) =
-    @asmcall("ex2.approx.f32 \$0, \$1;", "=r,r", Float32, Tuple{Float32}, x)
+    ccall("llvm.nvvm.ex2.approx.f", llvmcall, Float32, (Float32,), x)
 @device_override function FastMath.exp2_fast(x::Float16)
     if compute_capability() >= sv"7.5"
         ccall("llvm.nvvm.ex2.approx.f16", llvmcall, Float16, (Float16,), x)
@@ -446,23 +446,6 @@ end
     ifelse(anynan, NaN, minval), ifelse(anynan, NaN, maxval)
 end
 
-@static if Base.thismajor(LLVM.version()) <= v"20"
-    # LLVM 20 and below generate non-existing instructions for Julia's default methods of
-    # fast min/max on fp64: https://github.com/JuliaGPU/CUDA.jl/issues/2886
-    for T in (Float16, Float32, Float64)
-        @eval begin
-            @device_override @inline Base.FastMath.max_fast(x::$T, y::$T) = ifelse(y > x, y, x)
-            @device_override @inline Base.FastMath.min_fast(x::$T, y::$T) = ifelse(y > x, x, y)
-            @device_override @inline Base.FastMath.minmax_fast(x::$T, y::$T) = ifelse(y > x, (x, y), (y, x))
-        end
-    end
-
-    # For Float16, this even happens with a non-fastmath @llvm.minimum/maximum.f16
-    @device_override @inline Base.max(x::Float16, y::Float16) = ifelse(y > x, y, x)
-    @device_override @inline Base.min(x::Float16, y::Float16) = ifelse(y > x, x, y)
-
-end
-
 @device_function saturate(x::Float32) = ccall("extern __nv_saturatef", llvmcall, Cfloat, (Cfloat,), x)
 
 

diff --git a/test/core/codegen.jl b/test/core/codegen.jl
@@ -166,15 +166,24 @@ end
         @inbounds y[] = 0
         return
     end
+
+    # dynamically-indexed aggregate arguments should load directly from parameter space
+    # instead of being copied to local memory first
+    @test @filecheck CUDA.code_ptx(Tuple{CuDeviceArray{Float32,1,AS.Global},
+                                         NTuple{32,Float32}, Int}) do out, t, i
+        @check_not ".local"
+        @inbounds out[1] = t[i]
+        return
+    end
 end
 
 @testset "header rewrite (.target/.version bump)" begin
-    # When LLVM's NVPTX backend can't reach the device cap (e.g. Julia 1.12 +
-    # LLVM 18 on a Blackwell device), `_compiler_config` produces a split
-    # config and `mcgen` rewrites `.target`/`.version` in the emitted asm.
+    # When the LLVM back-end can't reach the device cap (e.g., a device newer
+    # than what NVPTX_LLVM_Backend_jll supports), `_compiler_config` produces a
+    # split config and `mcgen` rewrites `.target`/`.version` in the emitted asm.
     # `.attribute(.unified)` is target-gated on sm_90+ across CUDA 12.0+ —
     # picked here as a stable cross-toolkit feature gate that exercises the
-    # rewrite without requiring Blackwell hardware in CI.
+    # rewrite without requiring such hardware in CI.
     asm_pre = """
     .version 8.0
     .target sm_75

diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
@@ -577,9 +577,8 @@ using SpecialFunctions
     end
 
     @testset "div/inv PTX" begin
-        # `Base.{/, inv}` and their fast variants are handled by GPUCompiler's
-        # `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches
-        # `fdiv 1.0, x` to `rcp.rn`.
+        # `Base.{/, inv}` lower to plain `fdiv`; NVPTX pattern-matches
+        # `fdiv 1.0, x` (i.e. `inv`) to the dedicated `rcp` instructions.
         for (T, s) in ((Float32, "f32"), (Float64, "f64"))
             @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
                 @check "div.rn.$s"
@@ -591,24 +590,28 @@ using SpecialFunctions
             end
         end
 
-        # `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since
-        # the job isn't fast; f64 always uses rcp+Newton.
+        # `@fastmath` on f32: the back-end honors `afn`, picking the non-FTZ
+        # variants since the job isn't fast, and the dedicated `rcp` for
+        # reciprocals. f64 is rewritten by GPUCompiler to rcp+Newton, so also
+        # check for the refinement fmas (a raw rcp would be too inaccurate).
         @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
             @check "div.approx.f32"
             @check_not "div.approx.ftz"
             @fastmath x / y
         end
         @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
-            @check "div.approx.f32"
-            @check_not "div.approx.ftz"
+            @check "rcp.approx.f32"
+            @check_not "rcp.approx.ftz"
             @fastmath inv(x)
         end
         @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
             @check "rcp.approx.ftz.f64"
+            @check "fma.rn.f64"
             @fastmath x / y
         end
         @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
             @check "rcp.approx.ftz.f64"
+            @check "fma.rn.f64"
             @fastmath inv(x)
         end
 
@@ -619,11 +622,12 @@ using SpecialFunctions
             x / y
         end
         @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
-            @check "div.approx.ftz.f32"
+            @check "rcp.approx.ftz.f32"
             inv(x)
         end
         @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y
             @check "rcp.approx.ftz.f64"
+            @check "fma.rn.f64"
             x / y
         end
     end