JuliaGPU · maleadt · Jun 8, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
@@ -54,7 +54,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.18"
+GPUCompiler = "1.19"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
@@ -16,8 +16,8 @@ const highest = v"999"
 #     exact CC; code compiled for sm_103a runs only on CC 10.3 devices.
 #
 # Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX
-# require for them, is encoded directly in the keys of `ptx_sm_db` and `llvm_sm_db`
-# below: an unsupported combination simply has no entry.
+# require for them, is encoded directly in the keys of `ptx_sm_db` and `llvm_sm_db` below
+# (cf. the equivalent database in GPUCompiler): an unsupported combination has no entry.
 
 
 ## version range
@@ -229,6 +229,7 @@ const llvm_sm_db = Dict{SMVersion, VersionRange}(
     sm"80"   => between(v"11", highest),
     sm"86"   => between(v"13", highest),
     sm"87"   => between(v"16", highest),
+    sm"88"   => between(v"21", highest),
     sm"89"   => between(v"16", highest),
     sm"90"   => between(v"16", highest),
     sm"90a"  => between(v"18", highest),
@@ -241,6 +242,9 @@ const llvm_sm_db = Dict{SMVersion, VersionRange}(
     sm"103"  => between(v"21", highest),
     sm"103a" => between(v"21", highest),
     sm"103f" => between(v"21", highest),
+    sm"110"  => between(v"22", highest),
+    sm"110a" => between(v"22", highest),
+    sm"110f" => between(v"22", highest),
     sm"120"  => between(v"20", highest),
     sm"120a" => between(v"20", highest),
     sm"120f" => between(v"21", highest),
@@ -314,9 +318,15 @@ end
 
 ## high-level functions that return target and isa support
 
-function llvm_compat(version=LLVM.version())
-    LLVM.InitializeNVPTXTarget()
+# LLVM version of the external NVPTX back-end used for machine-code generation (the JLL
+# is versioned after the LLVM it provides), not `LLVM.version()`, the in-process LLVM that
+# only handles the middle end. NOTE(review): `pkgversion` may return `nothing` -- confirm.
+const nvptx_llvm_version = pkgversion(NVPTX_LLVM_Backend_jll)
 
+# by default, return the capabilities of the external back-end, which is typically much
+# newer than the in-process LLVM (which only drives the middle end, and is not
+# configured for any particular device).
+function llvm_compat(version=nvptx_llvm_version)
     # `.sm` is `Set{SMVersion}` (with variants); `.ptx` is `Set{VersionNumber}`.
     # `ptxas_compat()` returns `.cap` as `Set{VersionNumber}` because ptxas-level
     # support is per-CC -- the names track the value type.
@@ -328,3 +338,4 @@ function ptxas_compat(version=compiler_version())
     return (cap=ptxas_cap_support(version),
             ptx=ptxas_ptx_support(version))
 end
+
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
@@ -210,7 +210,7 @@ end
     if ptx !== nothing
         # explicit request: take it exactly, validating against the toolchain
         ptx in llvm_support.ptx ||
-            error("Requested PTX ISA $ptx is not supported by LLVM $(LLVM.version())")
+            error("Requested PTX ISA $ptx is not supported by LLVM $(nvptx_llvm_version)")
         ptx in ptxas_support.ptx ||
             error("Requested PTX ISA $ptx is not supported by ptxas $(compiler_version())")
         llvm_ptx = ptxas_ptx = ptx
@@ -220,7 +220,7 @@ end
         llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx)
         ptxas_ptxs = filter(>=(requested_ptx), ptxas_support.ptx)
         isempty(llvm_ptxs) &&
-            error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(LLVM.version())")
+            error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(nvptx_llvm_version)")
         isempty(ptxas_ptxs) &&
             error("CUDA.jl requires PTX $requested_ptx, which is not supported by ptxas $(compiler_version())")
         ptxas_ptx = maximum(ptxas_ptxs)
@@ -260,7 +260,7 @@ end
             sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm)
         end
         isempty(baseline_candidates) &&
-            error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(LLVM.version())")
+            error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(nvptx_llvm_version)")
         llvm_sm = argmax(sm_key, baseline_candidates)
     end
 
@@ -272,6 +272,46 @@ end
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
+# check whether the host-side layout of an argument type matches the device-side one.
+#
+# the back-end always aligns 128-bit integers to 16 bytes, but Julia only does so since
+# 1.12, so aggregates containing (U)Int128 fields may be laid out differently.
+# returns the device-side `(size, alignment)` of `T`; `:opaque` when the layout is defined
+# by Julia on both sides (e.g. unions, or non-isbits types passed by reference); or
+# `:mismatch` when a field offset or the total size differs from the host layout.
+function device_layout(@nospecialize(T))
+    (T === Int128 || T === UInt128) && return (16, 16)
+    (T isa DataType && isbitstype(T)) || return :opaque
+    nflds = fieldcount(T)
+    # field-less (primitive or empty) types: trust the host's size and alignment
+    nflds == 0 && return (sizeof(T), Base.datatype_alignment(T))
+
+    # replay the device's struct layout and compare each offset with the host's
+    cursor, alignment, tracking = 0, 1, true
+    for idx in 1:nflds
+        nested = device_layout(fieldtype(T, idx))
+        if nested === :mismatch
+            return :mismatch
+        elseif nested === :opaque || !tracking
+            # offsets can no longer be tracked, but nested layouts are still verified
+            tracking = false
+            continue
+        end
+        nested_size, nested_align = nested
+        cursor = cld(cursor, nested_align) * nested_align
+        cursor == fieldoffset(T, idx) || return :mismatch
+        cursor += nested_size
+        alignment = max(alignment, nested_align)
+    end
+    tracking || return :opaque
+    # pad the aggregate to a multiple of its alignment
+    total = cld(cursor, alignment) * alignment
+    return total == sizeof(T) ? (total, alignment) : :mismatch
+end
+# on Julia 1.12+ (16-byte aligned Int128) host and device layouts are always identical
+device_compatible_layout(@nospecialize(T)) =
+    Base.datatype_alignment(Int128) == 16 ? true : device_layout(T) !== :mismatch
+
 # compile to executable machine code
 function compile(@nospecialize(job::CompilerJob))
     # lower to PTX
@@ -282,7 +322,9 @@ function compile(@nospecialize(job::CompilerJob))
 
     # check if we'll need the device runtime
     undefined_fs = filter(collect(functions(meta.ir))) do f
-        isdeclaration(f) && !LLVM.isintrinsic(f)
+        isdeclaration(f) && !LLVM.isintrinsic(f) &&
+        # intrinsics unknown to the in-process LLVM are still lowered by the back-end
+        !startswith(LLVM.name(f), "llvm.")
     end
     intrinsic_fns = ["vprintf", "malloc", "free", "__assertfail",
                      "__nvvm_reflect" #= TODO: should have been optimized away =#]
@@ -312,6 +354,12 @@ function compile(@nospecialize(job::CompilerJob))
     argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
         !isghosttype(dt) && !Core.Compiler.isconstType(dt)
     end
+    for dt in argtypes
+        if !device_compatible_layout(dt)
+            error("""Kernel argument of type $dt contains Int128 fields whose layout differs between this version of Julia and the device.
+                     Use Julia 1.12 or later, where 128-bit integers are aligned to 16 bytes, matching the device.""")
+        end
+    end
     param_usage = sum(aligned_sizeof, argtypes)
     param_limit = 4096
     if cap >= v"7.0" && ptx_param >= v"8.1"

diff --git a/CUDACore/src/device/intrinsics/atomics.jl b/CUDACore/src/device/intrinsics/atomics.jl
@@ -85,7 +85,7 @@ for T in (Int32, Int64, UInt32, UInt64)
     end
 end
 
-for T in (:Float32, :Float64)
+for T in (:Float16, :Float32, :Float64)
     ops = [:add]
 
     for op in ops
@@ -107,6 +107,19 @@ for T in (:Float32, :Float64)
         atomic_add!(ptr, -val)
 end
 
+# BFloat16 needs Julia 1.11+ for bfloat codegen support, hence the version gate. older
+# versions (and older devices, where the back-end expands the operation) fall back to CAS.
+@static if VERSION >= v"1.11"
+    @eval @inline atomic_add!(ptr::Union{LLVMPtr{BFloat16,AS.Generic},
+                                         LLVMPtr{BFloat16,AS.Global},
+                                         LLVMPtr{BFloat16,AS.Shared}}, val::BFloat16) =
+        llvm_atomic_op($(Val(binops[:fadd])), ptr, val)
+    @eval @inline atomic_sub!(ptr::Union{LLVMPtr{BFloat16,AS.Generic},
+                                         LLVMPtr{BFloat16,AS.Global},
+                                         LLVMPtr{BFloat16,AS.Shared}}, val::BFloat16) =
+        atomic_add!(ptr, -val)  # subtract by adding the negated value
+end
+
 @generated function llvm_atomic_cas(ptr::LLVMPtr{T,A}, cmp::T, val::T) where {T, A}
     @dispose ctx=Context() begin
         T_val = convert(LLVMType, T)
@@ -178,25 +191,6 @@ for A in (AS.Generic, AS.Global, AS.Shared)
 end
 
 
-## PTX
-
-# half-precision atomics using PTX instruction
-
-for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
-    if A == AS.Global
-        scope = ".global"
-    elseif A == AS.Shared
-        scope = ".shared"
-    else
-        scope = ""
-    end
-
-    intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
-    @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
-        @asmcall($intr, "=h,l,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
-end
-
-
 ## Julia
 
 # floating-point CAS via bitcasting
@@ -465,6 +459,27 @@ end
     end
 end
 
+# Float16 addition: devices with compute capability 7.0 or higher call `atomic_add!`
+# directly, while older devices take the generic `atomic_op!` path.
+@inline function atomic_arrayset(A::AbstractArray{Float16}, I::Integer, op::typeof(+),
+                                 val::Float16)
+    target = pointer(A, I)
+    native = compute_capability() >= sv"7.0"
+    return native ? atomic_add!(target, val) :
+                    atomic_op!(target, op, val)
+end
+# BFloat16 addition (Julia 1.11+ only): compute capability 9.0 or higher calls
+# `atomic_add!` directly, older devices take the generic `atomic_op!` path.
+@static if VERSION >= v"1.11"
+    @inline function atomic_arrayset(A::AbstractArray{BFloat16}, I::Integer,
+                                     op::typeof(+), val::BFloat16)
+        target = pointer(A, I)
+        native = compute_capability() >= sv"9.0"
+        return native ? atomic_add!(target, val) :
+                        atomic_op!(target, op, val)
+    end
+end
+
 # fallback using compare-and-swap
 @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} =
     atomic_op!(pointer(A, I), op, val)
diff --git a/CUDACore/src/device/intrinsics/indexing.jl b/CUDACore/src/device/intrinsics/indexing.jl
@@ -210,7 +210,13 @@ end
 Returns a 32-bit mask indicating which threads in a warp are active with the current
 executing thread.
 """ active_mask
-@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
+@static if LLVM.version() < v"20"
+# without the intrinsic, fall back to inline assembly. it is marked side-effecting so it
+# is not hoisted or merged across divergent control flow (the intrinsic is convergent).
+@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", true, UInt32, Tuple{})
+else
+@inline active_mask() = ccall("llvm.nvvm.activemask", llvmcall, UInt32, ())
+end
 
 end
 

diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
@@ -207,7 +207,7 @@ end
 end
 @device_override FastMath.exp2_fast(x::Float64) = exp2(x)
 @device_override FastMath.exp2_fast(x::Float32) =
-    @asmcall("ex2.approx.f32 \$0, \$1;", "=r,r", Float32, Tuple{Float32}, x)
+    ccall("llvm.nvvm.ex2.approx.f", llvmcall, Float32, (Float32,), x)
 @device_override function FastMath.exp2_fast(x::Float16)
     if compute_capability() >= sv"7.5"
         ccall("llvm.nvvm.ex2.approx.f16", llvmcall, Float16, (Float16,), x)
@@ -376,92 +376,39 @@ end
 #@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y)
 #@device_override Base.min(x::UInt32, y::UInt32) = convert(UInt32, ccall("extern __nv_umin", llvmcall, Int32, (Int32, Int32), x, y))
 #@device_override Base.min(x::UInt64, y::UInt64) = convert(UInt64, ccall("extern __nv_ullmin", llvmcall, Int64, (Int64, Int64), x, y))
-# JuliaGPU/CUDA.jl#2111: fmin semantics wrt. NaN don't match Julia's
+# JuliaGPU/CUDA.jl#2111: fmin semantics wrt. NaN and signed zeros don't match Julia's
 #@device_override Base.min(x::Float64, y::Float64) = ccall("extern __nv_fmin", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
 #@device_override Base.min(x::Float32, y::Float32) = ccall("extern __nv_fminf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-@device_override @inline function Base.min(x::Float32, y::Float32)
-    if @static LLVM.version() < v"14" ? false : (compute_capability() >= sv"8.0")
-        # LLVM 14+ can do the right thing, but only on sm_80+
-        # (JuliaGPU/CUDA.jl#2148, llvm/llvm-project#64606)
-        ccall("llvm.minimum.f32", llvmcall, Float32, (Float32, Float32), x, y)
-    else
-        # we follow PTX semantics, returning canonical NaN if either input is NaN
-        anynan = isnan(x) | isnan(y)
-        minval = ccall("extern __nv_fminf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-        ifelse(anynan, NaN32, minval)
-    end
-end
-@device_override @inline function Base.min(x::Float64, y::Float64)
-    # PTX doesn't support min.NaN.f64, so we have to do it ourselves
-    anynan = isnan(x) | isnan(y)
-    minval = ccall("extern __nv_fmin", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
-    ifelse(anynan, NaN, minval)
-end
+# Julia's floating-point min/max follow IEEE 754-2019 minimum/maximum, which is what
+# `llvm.minimum`/`llvm.maximum` implement. the external back-end legalizes these for every
+# subtarget: native min.NaN/max.NaN on sm_80+, an expansion with NaN/signed-zero fix-ups
+# elsewhere. avoid `llvm.minnum` (libdevice's `__nv_fmin`) plus a NaN fix-up: its loose
+# signed-zero semantics leak into constant folding, e.g. `min(0.0, -0.0)` folds to `0.0`
+# where the host returns `-0.0`. Julia 1.12+ already lowers `Base.min` to `llvm.minimum`;
+# the overrides are kept for uniform codegen on older versions.
+@device_override function Base.min(x::Float32, y::Float32)
+    return ccall("llvm.minimum.f32", llvmcall, Float32, (Float32, Float32), x, y)
+end
+@device_override function Base.min(x::Float64, y::Float64)
+    return ccall("llvm.minimum.f64", llvmcall, Float64, (Float64, Float64), x, y)
+end
 
 #@device_override Base.max(x::Int32, y::Int32) = ccall("extern __nv_max", llvmcall, Int32, (Int32, Int32), x, y)
 #@device_override Base.max(x::Int64, y::Int64) = ccall("extern __nv_llmax", llvmcall, Int64, (Int64, Int64), x, y)
 #@device_override Base.max(x::UInt32, y::UInt32) = convert(UInt32, ccall("extern __nv_umax", llvmcall, Int32, (Int32, Int32), x, y))
 #@device_override Base.max(x::UInt64, y::UInt64) = convert(UInt64, ccall("extern __nv_ullmax", llvmcall, Int64, (Int64, Int64), x, y))
-# JuliaGPU/CUDA.jl#2111: fmin semantics wrt. NaN don't match Julia's
+# JuliaGPU/CUDA.jl#2111: fmax semantics wrt. NaN and signed zeros don't match Julia's
 #@device_override Base.max(x::Float64, y::Float64) = ccall("extern __nv_fmax", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
 #@device_override Base.max(x::Float32, y::Float32) = ccall("extern __nv_fmaxf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-@device_override @inline function Base.max(x::Float32, y::Float32)
-    if @static LLVM.version() < v"14" ? false : (compute_capability() >= sv"8.0")
-        # LLVM 14+ can do the right thing, but only on sm_80+
-        # (JuliaGPU/CUDA.jl#2148, llvm/llvm-project#64606)
-        ccall("llvm.maximum.f32", llvmcall, Float32, (Float32, Float32), x, y)
-    else
-        # we follow PTX semantics, returning canonical NaN if either input is NaN
-        anynan = isnan(x) | isnan(y)
-        maxval = ccall("extern __nv_fmaxf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-        ifelse(anynan, NaN32, maxval)
-    end
-end
-@device_override @inline function Base.max(x::Float64, y::Float64)
-    # PTX doesn't support max.NaN.f64, so we have to do it ourselves
-    anynan = isnan(x) | isnan(y)
-    maxval = ccall("extern __nv_fmax", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
-    ifelse(anynan, NaN, maxval)
-end
-
-@device_override @inline function Base.minmax(x::Float32, y::Float32)
-    if @static LLVM.version() < v"14" ? false : (compute_capability() >= sv"8.0")
-        # LLVM 14+ can do the right thing, but only on sm_80+
-        # (JuliaGPU/CUDA.jl#2148, llvm/llvm-project#64606)
-        ccall("llvm.minimum.f32", llvmcall, Float32, (Float32, Float32), x, y),
-        ccall("llvm.maximum.f32", llvmcall, Float32, (Float32, Float32), x, y)
-    else
-        # we follow PTX semantics, returning canonical NaN if either input is NaN
-        anynan = isnan(x) | isnan(y)
-        minval = ccall("extern __nv_fminf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-        maxval = ccall("extern __nv_fmaxf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-        ifelse(anynan, NaN32, minval), ifelse(anynan, NaN32, maxval)
-    end
-end
-@device_override @inline function Base.minmax(x::Float64, y::Float64)
-    # PTX doesn't support (min|max).NaN.f64, so we have to do it ourselves
-    anynan = isnan(x) | isnan(y)
-    minval = ccall("extern __nv_fmin", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
-    maxval = ccall("extern __nv_fmax", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
-    ifelse(anynan, NaN, minval), ifelse(anynan, NaN, maxval)
-end
-
-@static if Base.thismajor(LLVM.version()) <= v"20"
-    # LLVM 20 and below generate non-existing instructions for Julia's default methods of
-    # fast min/max on fp64: https://github.com/JuliaGPU/CUDA.jl/issues/2886
-    for T in (Float16, Float32, Float64)
-        @eval begin
-            @device_override @inline Base.FastMath.max_fast(x::$T, y::$T) = ifelse(y > x, y, x)
-            @device_override @inline Base.FastMath.min_fast(x::$T, y::$T) = ifelse(y > x, x, y)
-            @device_override @inline Base.FastMath.minmax_fast(x::$T, y::$T) = ifelse(y > x, (x, y), (y, x))
-        end
-    end
-
-    # For Float16, this even happens with a non-fastmath @llvm.minimum/maximum.f16
-    @device_override @inline Base.max(x::Float16, y::Float16) = ifelse(y > x, y, x)
-    @device_override @inline Base.min(x::Float16, y::Float16) = ifelse(y > x, x, y)
-
-end
+@device_override function Base.max(x::Float32, y::Float32)
+    return ccall("llvm.maximum.f32", llvmcall, Float32, (Float32, Float32), x, y)
+end
+@device_override function Base.max(x::Float64, y::Float64)
+    return ccall("llvm.maximum.f64", llvmcall, Float64, (Float64, Float64), x, y)
+end
+# Julia 1.10/1.11 open-code minmax for Float32/Float64; route it through min and max
+@device_override Base.minmax(x::Float32, y::Float32) = (min(x, y), max(x, y))
+@device_override Base.minmax(x::Float64, y::Float64) = (min(x, y), max(x, y))
 
 @device_function saturate(x::Float32) = ccall("extern __nv_saturatef", llvmcall, Cfloat, (Cfloat,), x)