Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CUDACore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ ChainRulesCore = "1"
EnzymeCore = "0.8.2"
ExprTools = "0.1"
GPUArrays = "11.5.4"
GPUCompiler = "1.18"
GPUCompiler = "1.19"
GPUToolbox = "1.1"
KernelAbstractions = "0.9.38"
LLVM = "9.6"
Expand Down
19 changes: 15 additions & 4 deletions CUDACore/src/compatibility.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ const highest = v"999"
# exact CC; code compiled for sm_103a runs only on CC 10.3 devices.
#
# Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX
# require for them, is encoded directly in the keys of `ptx_sm_db` and `llvm_sm_db`
# below: an unsupported combination simply has no entry.
# require for them, is encoded directly in the keys of `ptx_sm_db` below (and the equivalent
# database in GPUCompiler): an unsupported combination simply has no entry.


## version range
Expand Down Expand Up @@ -229,6 +229,7 @@ const llvm_sm_db = Dict{SMVersion, VersionRange}(
sm"80" => between(v"11", highest),
sm"86" => between(v"13", highest),
sm"87" => between(v"16", highest),
sm"88" => between(v"21", highest),
sm"89" => between(v"16", highest),
sm"90" => between(v"16", highest),
sm"90a" => between(v"18", highest),
Expand All @@ -241,6 +242,9 @@ const llvm_sm_db = Dict{SMVersion, VersionRange}(
sm"103" => between(v"21", highest),
sm"103a" => between(v"21", highest),
sm"103f" => between(v"21", highest),
sm"110" => between(v"22", highest),
sm"110a" => between(v"22", highest),
sm"110f" => between(v"22", highest),
sm"120" => between(v"20", highest),
sm"120a" => between(v"20", highest),
sm"120f" => between(v"21", highest),
Expand Down Expand Up @@ -314,9 +318,15 @@ end

## high-level functions that return target and isa support

function llvm_compat(version=LLVM.version())
LLVM.InitializeNVPTXTarget()
# the LLVM version of the external NVPTX back-end used for machine-code generation,
# as opposed to `LLVM.version()`, which identifies the in-process LLVM that only
# handles the middle end (the JLL is versioned after the LLVM release it provides).
const nvptx_llvm_version = pkgversion(NVPTX_LLVM_Backend_jll)

# by default, return the capabilities of the external back-end, which is typically much
# newer than the in-process LLVM (which only drives the middle end, and is not
# configured for any particular device).
function llvm_compat(version=nvptx_llvm_version)
# `.sm` is `Set{SMVersion}` (with variants); `.ptx` is `Set{VersionNumber}`.
# `ptxas_compat()` returns `.cap` as `Set{VersionNumber}` because ptxas-level
# support is per-CC -- the names track the value type.
Expand All @@ -328,3 +338,4 @@ function ptxas_compat(version=compiler_version())
return (cap=ptxas_cap_support(version),
ptx=ptxas_ptx_support(version))
end

55 changes: 51 additions & 4 deletions CUDACore/src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ end
if ptx !== nothing
# explicit request: take it exactly, validating against the toolchain
ptx in llvm_support.ptx ||
error("Requested PTX ISA $ptx is not supported by LLVM $(LLVM.version())")
error("Requested PTX ISA $ptx is not supported by LLVM $(nvptx_llvm_version)")
ptx in ptxas_support.ptx ||
error("Requested PTX ISA $ptx is not supported by ptxas $(compiler_version())")
llvm_ptx = ptxas_ptx = ptx
Expand All @@ -220,7 +220,7 @@ end
llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx)
ptxas_ptxs = filter(>=(requested_ptx), ptxas_support.ptx)
isempty(llvm_ptxs) &&
error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(LLVM.version())")
error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(nvptx_llvm_version)")
isempty(ptxas_ptxs) &&
error("CUDA.jl requires PTX $requested_ptx, which is not supported by ptxas $(compiler_version())")
ptxas_ptx = maximum(ptxas_ptxs)
Expand Down Expand Up @@ -260,7 +260,7 @@ end
sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm)
end
isempty(baseline_candidates) &&
error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(LLVM.version())")
error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(nvptx_llvm_version)")
llvm_sm = argmax(sm_key, baseline_candidates)
end

Expand All @@ -272,6 +272,45 @@ end
CompilerConfig(target, params; kernel, name, always_inline)
end

# does the host-side layout of an argument type match the device-side one?
#
# the back-end unconditionally aligns 128-bit integers to 16 bytes, whereas Julia only
# started doing so in 1.12, so aggregates with (U)Int128 fields may lay out differently.
#
# the device-side layout is computed by walking the fields of `T` recursively, placing
# each one at the next offset that satisfies its device-side alignment, and comparing that
# offset against the host's `fieldoffset` (and the padded total against `sizeof`).
#
# returns one of:
# - `(size, alignment)`: the device-side layout of `T`, which agrees with the host's;
# - `:opaque`: the layout of `T` is defined by Julia on both sides (e.g. unions), or `T`
#   has no definite layout to walk (abstract types, `UnionAll`s, vararg tuples);
# - `:mismatch`: `T`, or a type nested in it, is laid out differently on the device.
function device_layout(@nospecialize(T))
    if T === Int128 || T === UInt128
        return (16, 16)
    elseif !(T isa DataType) || !isconcretetype(T)
        # `fieldcount` throws for types without a definite number of fields, so
        # anything that is not a concrete `DataType` cannot be walked.
        return :opaque
    elseif fieldcount(T) == 0
        return (sizeof(T), Base.datatype_alignment(T))
    end

    offset = 0  # device-side offset of the next field; negative once untrackable
    align = 1   # device-side alignment of the aggregate (maximum over its fields)
    for i in 1:fieldcount(T)
        field = device_layout(fieldtype(T, i))
        field === :mismatch && return :mismatch
        if field === :opaque || offset < 0
            # we cannot track offsets anymore, but keep verifying nested layouts
            offset = -1
            continue
        end
        field_size, field_align = field
        # round up to the field's alignment, then compare against the host's placement
        offset = cld(offset, field_align) * field_align
        offset == fieldoffset(T, i) || return :mismatch
        offset += field_size
        align = max(align, field_align)
    end
    offset < 0 && return :opaque

    # tail padding: the aggregate's size is a multiple of its alignment
    size = cld(offset, align) * align
    size == sizeof(T) || return :mismatch
    return (size, align)
end
# whether a value of type `T` can be passed to the device without its layout changing.
function device_compatible_layout(@nospecialize(T))
    # since Julia 1.12, host and device layouts are identical
    Base.datatype_alignment(Int128) == 16 && return true
    # otherwise, only an explicit `:mismatch` verdict makes the type incompatible
    return device_layout(T) !== :mismatch
end

# compile to executable machine code
function compile(@nospecialize(job::CompilerJob))
# lower to PTX
Expand All @@ -282,7 +321,9 @@ function compile(@nospecialize(job::CompilerJob))

# check if we'll need the device runtime
undefined_fs = filter(collect(functions(meta.ir))) do f
isdeclaration(f) && !LLVM.isintrinsic(f)
isdeclaration(f) && !LLVM.isintrinsic(f) &&
# intrinsics unknown to the in-process LLVM are still lowered by the back-end
!startswith(LLVM.name(f), "llvm.")
end
intrinsic_fns = ["vprintf", "malloc", "free", "__assertfail",
"__nvvm_reflect" #= TODO: should have been optimized away =#]
Expand Down Expand Up @@ -312,6 +353,12 @@ function compile(@nospecialize(job::CompilerJob))
argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
!isghosttype(dt) && !Core.Compiler.isconstType(dt)
end
for dt in argtypes
if !device_compatible_layout(dt)
error("""Kernel argument of type $dt contains Int128 fields whose layout differs between this version of Julia and the device.
Use Julia 1.12 or later, where 128-bit integers are aligned to 16 bytes, matching the device.""")
end
end
param_usage = sum(aligned_sizeof, argtypes)
param_limit = 4096
if cap >= v"7.0" && ptx_param >= v"8.1"
Expand Down
55 changes: 35 additions & 20 deletions CUDACore/src/device/intrinsics/atomics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ for T in (Int32, Int64, UInt32, UInt64)
end
end

for T in (:Float32, :Float64)
for T in (:Float16, :Float32, :Float64)
ops = [:add]

for op in ops
Expand All @@ -107,6 +107,19 @@ for T in (:Float32, :Float64)
atomic_add!(ptr, -val)
end

# BFloat16 requires Julia 1.11 for bfloat codegen support; on older versions (and older
# devices, where the back-end expands the operation) the compare-and-swap fallback is used.
#
# Both methods accept pointers in the generic, global and shared address spaces.
@static if VERSION >= v"1.11"
# atomic add: forwards to `llvm_atomic_op`, passing the `fadd` entry of `binops` wrapped in
# a `Val` (the `$(...)` is interpolated once, when `@eval` defines the method).
@eval @inline atomic_add!(ptr::Union{LLVMPtr{BFloat16,AS.Generic},
LLVMPtr{BFloat16,AS.Global},
LLVMPtr{BFloat16,AS.Shared}}, val::BFloat16) =
llvm_atomic_op($(Val(binops[:fadd])), ptr, val)
# atomic subtract: expressed as an atomic add of the negated value.
@eval @inline atomic_sub!(ptr::Union{LLVMPtr{BFloat16,AS.Generic},
LLVMPtr{BFloat16,AS.Global},
LLVMPtr{BFloat16,AS.Shared}}, val::BFloat16) =
atomic_add!(ptr, -val)
end

@generated function llvm_atomic_cas(ptr::LLVMPtr{T,A}, cmp::T, val::T) where {T, A}
@dispose ctx=Context() begin
T_val = convert(LLVMType, T)
Expand Down Expand Up @@ -178,25 +191,6 @@ for A in (AS.Generic, AS.Global, AS.Shared)
end


## PTX

# half-precision atomics using PTX instruction

for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
    # state-space suffix of the `atom` instruction; generic pointers take none
    scope = A == AS.Global ? ".global" :
            A == AS.Shared ? ".shared" : ""

    # one inline-assembly method per address space, with the instruction string
    # assembled here and spliced into the definition by `@eval`
    intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
    @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
        @asmcall($intr, "=h,l,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
end


## Julia

# floating-point CAS via bitcasting
Expand Down Expand Up @@ -465,6 +459,27 @@ end
end
end

# `+` on Float16 elements: call `atomic_add!` directly on devices of compute capability
# 7.0 or higher, and go through the generic `atomic_op!` path everywhere else.
@inline function atomic_arrayset(A::AbstractArray{Float16}, I::Integer, op::typeof(+),
                                 val::Float16)
    ptr = pointer(A, I)
    compute_capability() >= sv"7.0" || return atomic_op!(ptr, op, val)
    return atomic_add!(ptr, val)
end
@static if VERSION >= v"1.11"
    # `+` on BFloat16 elements (only defined on Julia 1.11 and later): call `atomic_add!`
    # directly on devices of compute capability 9.0 or higher, and go through the
    # generic `atomic_op!` path everywhere else.
    @inline function atomic_arrayset(A::AbstractArray{BFloat16}, I::Integer,
                                     op::typeof(+), val::BFloat16)
        ptr = pointer(A, I)
        return compute_capability() >= sv"9.0" ? atomic_add!(ptr, val) :
                                                 atomic_op!(ptr, op, val)
    end
end

# fallback using compare-and-swap
@inline atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} =
atomic_op!(pointer(A, I), op, val)
8 changes: 7 additions & 1 deletion CUDACore/src/device/intrinsics/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,13 @@ end
Returns a 32-bit mask indicating which threads in a warp are active with the current
executing thread.
""" active_mask
@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
@static if LLVM.version() < v"20"
    # no intrinsic to call on these versions, so emit the instruction through inline
    # assembly. it is flagged as side-effecting so that it does not get hoisted or
    # merged across divergent control flow (the intrinsic is convergent).
    @inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", true, UInt32, Tuple{})
else
    @inline active_mask() = ccall("llvm.nvvm.activemask", llvmcall, UInt32, ())
end

end

Expand Down
19 changes: 1 addition & 18 deletions CUDACore/src/device/intrinsics/math.jl
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ end
end
@device_override FastMath.exp2_fast(x::Float64) = exp2(x)
@device_override FastMath.exp2_fast(x::Float32) =
@asmcall("ex2.approx.f32 \$0, \$1;", "=r,r", Float32, Tuple{Float32}, x)
ccall("llvm.nvvm.ex2.approx.f", llvmcall, Float32, (Float32,), x)
@device_override function FastMath.exp2_fast(x::Float16)
if compute_capability() >= sv"7.5"
ccall("llvm.nvvm.ex2.approx.f16", llvmcall, Float16, (Float16,), x)
Expand Down Expand Up @@ -446,23 +446,6 @@ end
ifelse(anynan, NaN, minval), ifelse(anynan, NaN, maxval)
end

# Device overrides for min/max, only defined when the major version of LLVM is 20 or lower.
@static if Base.thismajor(LLVM.version()) <= v"20"
# LLVM 20 and below generate non-existing instructions for Julia's default methods of
# fast min/max on fp64: https://github.com/JuliaGPU/CUDA.jl/issues/2886
#
# The replacements are a single comparison plus `ifelse`: whenever `y > x` is false,
# `max_fast` and `min_fast` return `x`, and `minmax_fast` returns `(y, x)`.
for T in (Float16, Float32, Float64)
@eval begin
@device_override @inline Base.FastMath.max_fast(x::$T, y::$T) = ifelse(y > x, y, x)
@device_override @inline Base.FastMath.min_fast(x::$T, y::$T) = ifelse(y > x, x, y)
@device_override @inline Base.FastMath.minmax_fast(x::$T, y::$T) = ifelse(y > x, (x, y), (y, x))
end
end

# For Float16, this even happens with a non-fastmath @llvm.minimum/maximum.f16
# (so the regular `Base.max`/`Base.min` get the same select-based definition).
@device_override @inline Base.max(x::Float16, y::Float16) = ifelse(y > x, y, x)
@device_override @inline Base.min(x::Float16, y::Float16) = ifelse(y > x, x, y)

end

@device_function saturate(x::Float32) = ccall("extern __nv_saturatef", llvmcall, Cfloat, (Cfloat,), x)


Expand Down
17 changes: 13 additions & 4 deletions test/core/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,24 @@ end
@inbounds y[] = 0
return
end

# dynamically-indexed aggregate arguments should load directly from parameter space
# instead of being copied to local memory first
@test @filecheck CUDA.code_ptx(Tuple{CuDeviceArray{Float32,1,AS.Global},
NTuple{32,Float32}, Int}) do out, t, i
@check_not ".local"
@inbounds out[1] = t[i]
return
end
end

@testset "header rewrite (.target/.version bump)" begin
# When LLVM's NVPTX backend can't reach the device cap (e.g. Julia 1.12 +
# LLVM 18 on a Blackwell device), `_compiler_config` produces a split
# config and `mcgen` rewrites `.target`/`.version` in the emitted asm.
# When the LLVM back-end can't reach the device cap (e.g., a device newer
# than what NVPTX_LLVM_Backend_jll supports), `_compiler_config` produces a
# split config and `mcgen` rewrites `.target`/`.version` in the emitted asm.
# `.attribute(.unified)` is target-gated on sm_90+ across CUDA 12.0+ —
# picked here as a stable cross-toolkit feature gate that exercises the
# rewrite without requiring Blackwell hardware in CI.
# rewrite without requiring such hardware in CI.
asm_pre = """
.version 8.0
.target sm_75
Expand Down
20 changes: 12 additions & 8 deletions test/core/device/intrinsics/math.jl
Original file line number Diff line number Diff line change
Expand Up @@ -577,9 +577,8 @@ using SpecialFunctions
end

@testset "div/inv PTX" begin
# `Base.{/, inv}` and their fast variants are handled by GPUCompiler's
# `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches
# `fdiv 1.0, x` to `rcp.rn`.
# `Base.{/, inv}` lower to plain `fdiv`; NVPTX pattern-matches
# `fdiv 1.0, x` (i.e. `inv`) to the dedicated `rcp` instructions.
for (T, s) in ((Float32, "f32"), (Float64, "f64"))
@test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
@check "div.rn.$s"
Expand All @@ -591,24 +590,28 @@ using SpecialFunctions
end
end

# `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since
# the job isn't fast; f64 always uses rcp+Newton.
# `@fastmath` on f32: the back-end honors `afn`, picking the non-FTZ
# variants since the job isn't fast, and the dedicated `rcp` for
# reciprocals. f64 is rewritten by GPUCompiler to rcp+Newton, so also
# check for the refinement fmas (a raw rcp would be too inaccurate).
@test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
@check "div.approx.f32"
@check_not "div.approx.ftz"
@fastmath x / y
end
@test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
@check "div.approx.f32"
@check_not "div.approx.ftz"
@check "rcp.approx.f32"
@check_not "rcp.approx.ftz"
@fastmath inv(x)
end
@test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
@check "rcp.approx.ftz.f64"
@check "fma.rn.f64"
@fastmath x / y
end
@test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
@check "rcp.approx.ftz.f64"
@check "fma.rn.f64"
@fastmath inv(x)
end

Expand All @@ -619,11 +622,12 @@ using SpecialFunctions
x / y
end
@test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
@check "div.approx.ftz.f32"
@check "rcp.approx.ftz.f32"
inv(x)
end
@test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y
@check "rcp.approx.ftz.f64"
@check "fma.rn.f64"
x / y
end
end
Expand Down