Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CUDACore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ ChainRulesCore = "1"
EnzymeCore = "0.8.2"
ExprTools = "0.1"
GPUArrays = "11.5.4"
GPUCompiler = "1.18"
GPUCompiler = "1.19"
GPUToolbox = "1.1"
KernelAbstractions = "0.9.38"
LLVM = "9.6"
Expand Down
19 changes: 15 additions & 4 deletions CUDACore/src/compatibility.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ const highest = v"999"
# exact CC; code compiled for sm_103a runs only on CC 10.3 devices.
#
# Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX
# require for them, is encoded directly in the keys of `ptx_sm_db` and `llvm_sm_db`
# below: an unsupported combination simply has no entry.
# require for them, is encoded directly in the keys of `ptx_sm_db` below (and the equivalent
# database in GPUCompiler): an unsupported combination simply has no entry.


## version range
Expand Down Expand Up @@ -229,6 +229,7 @@ const llvm_sm_db = Dict{SMVersion, VersionRange}(
sm"80" => between(v"11", highest),
sm"86" => between(v"13", highest),
sm"87" => between(v"16", highest),
sm"88" => between(v"21", highest),
sm"89" => between(v"16", highest),
sm"90" => between(v"16", highest),
sm"90a" => between(v"18", highest),
Expand All @@ -241,6 +242,9 @@ const llvm_sm_db = Dict{SMVersion, VersionRange}(
sm"103" => between(v"21", highest),
sm"103a" => between(v"21", highest),
sm"103f" => between(v"21", highest),
sm"110" => between(v"22", highest),
sm"110a" => between(v"22", highest),
sm"110f" => between(v"22", highest),
sm"120" => between(v"20", highest),
sm"120a" => between(v"20", highest),
sm"120f" => between(v"21", highest),
Expand Down Expand Up @@ -314,9 +318,15 @@ end

## high-level functions that return target and isa support

function llvm_compat(version=LLVM.version())
LLVM.InitializeNVPTXTarget()
# the LLVM version of the external NVPTX back-end used for machine-code generation,
# as opposed to `LLVM.version()`, which identifies the in-process LLVM that only
# handles the middle end (the JLL is versioned after the LLVM release it provides).
const nvptx_llvm_version = pkgversion(NVPTX_LLVM_Backend_jll)

# by default, return the capabilities of the external back-end, which is typically much
# newer than the in-process LLVM (which only drives the middle end, and is not
# configured for any particular device).
function llvm_compat(version=nvptx_llvm_version)
# `.sm` is `Set{SMVersion}` (with variants); `.ptx` is `Set{VersionNumber}`.
# `ptxas_compat()` returns `.cap` as `Set{VersionNumber}` because ptxas-level
# support is per-CC -- the names track the value type.
Expand All @@ -328,3 +338,4 @@ function ptxas_compat(version=compiler_version())
return (cap=ptxas_cap_support(version),
ptx=ptxas_ptx_support(version))
end

56 changes: 52 additions & 4 deletions CUDACore/src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ end
if ptx !== nothing
# explicit request: take it exactly, validating against the toolchain
ptx in llvm_support.ptx ||
error("Requested PTX ISA $ptx is not supported by LLVM $(LLVM.version())")
error("Requested PTX ISA $ptx is not supported by LLVM $(nvptx_llvm_version)")
ptx in ptxas_support.ptx ||
error("Requested PTX ISA $ptx is not supported by ptxas $(compiler_version())")
llvm_ptx = ptxas_ptx = ptx
Expand All @@ -220,7 +220,7 @@ end
llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx)
ptxas_ptxs = filter(>=(requested_ptx), ptxas_support.ptx)
isempty(llvm_ptxs) &&
error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(LLVM.version())")
error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(nvptx_llvm_version)")
isempty(ptxas_ptxs) &&
error("CUDA.jl requires PTX $requested_ptx, which is not supported by ptxas $(compiler_version())")
ptxas_ptx = maximum(ptxas_ptxs)
Expand Down Expand Up @@ -260,7 +260,7 @@ end
sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm)
end
isempty(baseline_candidates) &&
error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(LLVM.version())")
error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(nvptx_llvm_version)")
llvm_sm = argmax(sm_key, baseline_candidates)
end

Expand All @@ -272,6 +272,46 @@ end
CompilerConfig(target, params; kernel, name, always_inline)
end

# Compute how the device lays out an argument type, for comparison with the host layout.
#
# The back-end always aligns 128-bit integers to 16 bytes, whereas Julia only started doing
# so in 1.12, so aggregates with (U)Int128 fields may be laid out differently on each side.
# The result is one of:
# - `(size, alignment)`: the device-side layout of `T`;
# - `:opaque`: the layout is defined by Julia on both sides (e.g. unions, or non-isbits
#   types passed by reference), so no offsets can be tracked;
# - `:mismatch`: a field offset or the total size differs from what the host uses.
function device_layout(@nospecialize(T))
    # 128-bit integers are the one case where the device dictates size and alignment
    (T === Int128 || T === UInt128) && return (16, 16)
    # only isbits data types have a layout that can be walked field by field
    (T isa DataType && isbitstype(T)) || return :opaque
    nfields = fieldcount(T)
    # field-less types: take the host's size and alignment as-is
    nfields == 0 && return (sizeof(T), Base.datatype_alignment(T))

    cursor = 0          # device-side offset at which the next field would start
    max_align = 1       # largest field alignment encountered so far
    tracking = true     # false once an opaque field made further offsets unknowable
    for idx in 1:nfields
        layout = device_layout(fieldtype(T, idx))
        layout === :mismatch && return :mismatch
        if layout === :opaque || !tracking
            # offsets are lost from here on, but nested layouts still get verified
            tracking = false
            continue
        end
        nbytes, alignment = layout
        # round up to the field's alignment and compare with where the host placed it
        cursor = cld(cursor, alignment) * alignment
        cursor == fieldoffset(T, idx) || return :mismatch
        cursor += nbytes
        max_align = max(max_align, alignment)
    end
    tracking || return :opaque

    # pad the aggregate to a multiple of its alignment; the host size has to agree
    total = cld(cursor, max_align) * max_align
    total == sizeof(T) || return :mismatch
    return (total, max_align)
end
# Whether a kernel argument of type `T` is laid out identically on host and device.
function device_compatible_layout(@nospecialize(T))
    # since Julia 1.12, host and device layouts are identical
    Base.datatype_alignment(Int128) == 16 && return true
    # otherwise, walk the type and look for diverging offsets or sizes
    return device_layout(T) !== :mismatch
end

# compile to executable machine code
function compile(@nospecialize(job::CompilerJob))
# lower to PTX
Expand All @@ -282,7 +322,9 @@ function compile(@nospecialize(job::CompilerJob))

# check if we'll need the device runtime
undefined_fs = filter(collect(functions(meta.ir))) do f
isdeclaration(f) && !LLVM.isintrinsic(f)
isdeclaration(f) && !LLVM.isintrinsic(f) &&
# intrinsics unknown to the in-process LLVM are still lowered by the back-end
!startswith(LLVM.name(f), "llvm.")
end
intrinsic_fns = ["vprintf", "malloc", "free", "__assertfail",
"__nvvm_reflect" #= TODO: should have been optimized away =#]
Expand Down Expand Up @@ -312,6 +354,12 @@ function compile(@nospecialize(job::CompilerJob))
argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
!isghosttype(dt) && !Core.Compiler.isconstType(dt)
end
for dt in argtypes
if !device_compatible_layout(dt)
error("""Kernel argument of type $dt contains Int128 fields whose layout differs between this version of Julia and the device.
Use Julia 1.12 or later, where 128-bit integers are aligned to 16 bytes, matching the device.""")
end
end
param_usage = sum(aligned_sizeof, argtypes)
param_limit = 4096
if cap >= v"7.0" && ptx_param >= v"8.1"
Expand Down
55 changes: 35 additions & 20 deletions CUDACore/src/device/intrinsics/atomics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ for T in (Int32, Int64, UInt32, UInt64)
end
end

for T in (:Float32, :Float64)
for T in (:Float16, :Float32, :Float64)
ops = [:add]

for op in ops
Expand All @@ -107,6 +107,19 @@ for T in (:Float32, :Float64)
atomic_add!(ptr, -val)
end

# BFloat16 atomics are only defined on Julia 1.11 and later, which is required for bfloat
# codegen support. On older versions (and on older devices, where the back-end expands the
# operation) the compare-and-swap fallback is used instead.
@static if VERSION >= v"1.11"
    # both methods accept BFloat16 pointers in the generic, global and shared address spaces
    let PtrT = :(Union{LLVMPtr{BFloat16,AS.Generic},
                       LLVMPtr{BFloat16,AS.Global},
                       LLVMPtr{BFloat16,AS.Shared}})
        @eval @inline function atomic_add!(ptr::$PtrT, val::BFloat16)
            return llvm_atomic_op($(Val(binops[:fadd])), ptr, val)
        end
        # subtraction is expressed as the addition of the negated value
        @eval @inline function atomic_sub!(ptr::$PtrT, val::BFloat16)
            return atomic_add!(ptr, -val)
        end
    end
end

@generated function llvm_atomic_cas(ptr::LLVMPtr{T,A}, cmp::T, val::T) where {T, A}
@dispose ctx=Context() begin
T_val = convert(LLVMType, T)
Expand Down Expand Up @@ -178,25 +191,6 @@ for A in (AS.Generic, AS.Global, AS.Shared)
end


## PTX

# half-precision atomics using PTX instruction

for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
    # instruction suffix selected from the address space; the generic one gets none
    scope = A == AS.Global ? ".global" : A == AS.Shared ? ".shared" : ""

    # inline assembly template, specialized on the suffix chosen above
    intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
    @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
        @asmcall($intr, "=h,l,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
end


## Julia

# floating-point CAS via bitcasting
Expand Down Expand Up @@ -465,6 +459,27 @@ end
end
end

# Float16 addition: use `atomic_add!` on compute capability 7.0 and up, and the generic
# `atomic_op!` path on older devices.
@inline function atomic_arrayset(A::AbstractArray{Float16}, I::Integer, op::typeof(+),
                                 val::Float16)
    ptr = pointer(A, I)
    return compute_capability() >= sv"7.0" ? atomic_add!(ptr, val) :
                                              atomic_op!(ptr, op, val)
end
@static if VERSION >= v"1.11"
    # BFloat16 addition: use `atomic_add!` on compute capability 9.0 and up, and the
    # generic `atomic_op!` path on older devices.
    @inline function atomic_arrayset(A::AbstractArray{BFloat16}, I::Integer,
                                     op::typeof(+), val::BFloat16)
        ptr = pointer(A, I)
        return compute_capability() >= sv"9.0" ? atomic_add!(ptr, val) :
                                                  atomic_op!(ptr, op, val)
    end
end

# catch-all method: any other element type or operator goes through `atomic_op!`,
# i.e., the compare-and-swap fallback
@inline function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function,
                                 val) where {T}
    return atomic_op!(pointer(A, I), op, val)
end
8 changes: 7 additions & 1 deletion CUDACore/src/device/intrinsics/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,13 @@ end
Returns a 32-bit mask indicating which threads in a warp are active with the current
executing thread.
""" active_mask
@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
@static if LLVM.version() < v"20"
    # the intrinsic isn't available yet, so fall back to inline assembly. it is marked
    # side-effecting to prevent hoisting or merging across divergent control flow
    # (the intrinsic is convergent).
    @inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", true, UInt32, Tuple{})
else
    # LLVM 20 and later: call the intrinsic directly
    @inline active_mask() = ccall("llvm.nvvm.activemask", llvmcall, UInt32, ())
end

end

Expand Down
103 changes: 25 additions & 78 deletions CUDACore/src/device/intrinsics/math.jl
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ end
end
@device_override FastMath.exp2_fast(x::Float64) = exp2(x)
@device_override FastMath.exp2_fast(x::Float32) =
@asmcall("ex2.approx.f32 \$0, \$1;", "=r,r", Float32, Tuple{Float32}, x)
ccall("llvm.nvvm.ex2.approx.f", llvmcall, Float32, (Float32,), x)
@device_override function FastMath.exp2_fast(x::Float16)
if compute_capability() >= sv"7.5"
ccall("llvm.nvvm.ex2.approx.f16", llvmcall, Float16, (Float16,), x)
Expand Down Expand Up @@ -376,92 +376,39 @@ end
#@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y)
#@device_override Base.min(x::UInt32, y::UInt32) = convert(UInt32, ccall("extern __nv_umin", llvmcall, Int32, (Int32, Int32), x, y))
#@device_override Base.min(x::UInt64, y::UInt64) = convert(UInt64, ccall("extern __nv_ullmin", llvmcall, Int64, (Int64, Int64), x, y))
# JuliaGPU/CUDA.jl#2111: fmin semantics wrt. NaN don't match Julia's
# JuliaGPU/CUDA.jl#2111: fmin semantics wrt. NaN and signed zeros don't match Julia's
#@device_override Base.min(x::Float64, y::Float64) = ccall("extern __nv_fmin", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
#@device_override Base.min(x::Float32, y::Float32) = ccall("extern __nv_fminf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
@device_override @inline function Base.min(x::Float32, y::Float32)
    if @static LLVM.version() < v"14" ? false : (compute_capability() >= sv"8.0")
        # LLVM 14+ can do the right thing, but only on sm_80+
        # (JuliaGPU/CUDA.jl#2148, llvm/llvm-project#64606)
        return ccall("llvm.minimum.f32", llvmcall, Float32, (Float32, Float32), x, y)
    end
    # otherwise follow PTX semantics: canonical NaN as soon as either input is NaN
    raw = ccall("extern __nv_fminf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
    return ifelse(isnan(x) | isnan(y), NaN32, raw)
end
@device_override @inline function Base.min(x::Float64, y::Float64)
    # PTX doesn't support min.NaN.f64, so NaN inputs are patched up by hand
    raw = ccall("extern __nv_fmin", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
    return ifelse(isnan(x) | isnan(y), NaN, raw)
end
# Julia's floating-point min/max match IEEE 754-2019 minimum/maximum, i.e.,
# `llvm.minimum`/`llvm.maximum`, which the external back-end legalizes for every
# subtarget: native min.NaN/max.NaN instructions on sm_80+, an expansion with
# NaN/signed-zero fix-ups elsewhere. Don't be tempted to use `llvm.minnum`
# (libdevice's `__nv_fmin`) with a NaN fix-up instead: its loose signed-zero
# semantics leak into constant folding, e.g., folding `min(0.0, -0.0)` to
# `0.0` where the host returns `-0.0`.
# Julia 1.12+ lowers `Base.min` to `llvm.minimum` by itself; keep the overrides
# for uniform codegen on older versions.
# single-precision minimum, lowered straight to the `llvm.minimum` intrinsic
@device_override function Base.min(x::Float32, y::Float32)
    return ccall("llvm.minimum.f32", llvmcall, Float32, (Float32, Float32), x, y)
end
# double-precision counterpart
@device_override function Base.min(x::Float64, y::Float64)
    return ccall("llvm.minimum.f64", llvmcall, Float64, (Float64, Float64), x, y)
end

#@device_override Base.max(x::Int32, y::Int32) = ccall("extern __nv_max", llvmcall, Int32, (Int32, Int32), x, y)
#@device_override Base.max(x::Int64, y::Int64) = ccall("extern __nv_llmax", llvmcall, Int64, (Int64, Int64), x, y)
#@device_override Base.max(x::UInt32, y::UInt32) = convert(UInt32, ccall("extern __nv_umax", llvmcall, Int32, (Int32, Int32), x, y))
#@device_override Base.max(x::UInt64, y::UInt64) = convert(UInt64, ccall("extern __nv_ullmax", llvmcall, Int64, (Int64, Int64), x, y))
# JuliaGPU/CUDA.jl#2111: fmin semantics wrt. NaN don't match Julia's
# JuliaGPU/CUDA.jl#2111: fmax semantics wrt. NaN and signed zeros don't match Julia's
#@device_override Base.max(x::Float64, y::Float64) = ccall("extern __nv_fmax", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
#@device_override Base.max(x::Float32, y::Float32) = ccall("extern __nv_fmaxf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
@device_override @inline function Base.max(x::Float32, y::Float32)
    if @static LLVM.version() < v"14" ? false : (compute_capability() >= sv"8.0")
        # LLVM 14+ can do the right thing, but only on sm_80+
        # (JuliaGPU/CUDA.jl#2148, llvm/llvm-project#64606)
        return ccall("llvm.maximum.f32", llvmcall, Float32, (Float32, Float32), x, y)
    end
    # otherwise follow PTX semantics: canonical NaN as soon as either input is NaN
    raw = ccall("extern __nv_fmaxf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
    return ifelse(isnan(x) | isnan(y), NaN32, raw)
end
@device_override @inline function Base.max(x::Float64, y::Float64)
    # PTX doesn't support max.NaN.f64, so NaN inputs are patched up by hand
    raw = ccall("extern __nv_fmax", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
    return ifelse(isnan(x) | isnan(y), NaN, raw)
end

@device_override @inline function Base.minmax(x::Float32, y::Float32)
    if @static LLVM.version() < v"14" ? false : (compute_capability() >= sv"8.0")
        # LLVM 14+ can do the right thing, but only on sm_80+
        # (JuliaGPU/CUDA.jl#2148, llvm/llvm-project#64606)
        lo = ccall("llvm.minimum.f32", llvmcall, Float32, (Float32, Float32), x, y)
        hi = ccall("llvm.maximum.f32", llvmcall, Float32, (Float32, Float32), x, y)
        return (lo, hi)
    end
    # otherwise follow PTX semantics: canonical NaN as soon as either input is NaN
    has_nan = isnan(x) | isnan(y)
    lo = ccall("extern __nv_fminf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
    hi = ccall("extern __nv_fmaxf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
    return (ifelse(has_nan, NaN32, lo), ifelse(has_nan, NaN32, hi))
end
@device_override @inline function Base.minmax(x::Float64, y::Float64)
    # PTX doesn't support (min|max).NaN.f64, so NaN inputs are patched up by hand
    has_nan = isnan(x) | isnan(y)
    lo = ccall("extern __nv_fmin", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
    hi = ccall("extern __nv_fmax", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
    return (ifelse(has_nan, NaN, lo), ifelse(has_nan, NaN, hi))
end

@static if Base.thismajor(LLVM.version()) <= v"20"
    # Work around LLVM 20 and below generating non-existing instructions for Julia's
    # default methods of fast min/max on fp64, by selecting with a plain comparison:
    # https://github.com/JuliaGPU/CUDA.jl/issues/2886
    for T in (Float16, Float32, Float64)
        @eval @device_override @inline Base.FastMath.max_fast(x::$T, y::$T) =
            ifelse(y > x, y, x)
        @eval @device_override @inline Base.FastMath.min_fast(x::$T, y::$T) =
            ifelse(y > x, x, y)
        @eval @device_override @inline Base.FastMath.minmax_fast(x::$T, y::$T) =
            ifelse(y > x, (x, y), (y, x))
    end

    # For Float16, this even happens with a non-fastmath @llvm.minimum/maximum.f16,
    # so the regular min/max get the same treatment
    @device_override @inline Base.max(x::Float16, y::Float16) = ifelse(y > x, y, x)
    @device_override @inline Base.min(x::Float16, y::Float16) = ifelse(y > x, x, y)
end
# single-precision maximum, lowered straight to the `llvm.maximum` intrinsic
@device_override function Base.max(x::Float32, y::Float32)
    return ccall("llvm.maximum.f32", llvmcall, Float32, (Float32, Float32), x, y)
end
# double-precision counterpart
@device_override function Base.max(x::Float64, y::Float64)
    return ccall("llvm.maximum.f64", llvmcall, Float64, (Float64, Float64), x, y)
end

# Base's AbstractFloat minmax simply calls min/max, but Julia 1.10/1.11 had
# open-coded definitions for Float32/Float64; override for uniform codegen.
# both results are produced by the `min` and `max` methods for the same argument types
@device_override function Base.minmax(x::Float32, y::Float32)
    return (min(x, y), max(x, y))
end
@device_override function Base.minmax(x::Float64, y::Float64)
    return (min(x, y), max(x, y))
end

# Device-only helper for Float32 that forwards to the external `__nv_saturatef` function.
# NOTE(review): presumably clamps `x` to the [0, 1] range -- confirm against the libdevice
# documentation.
@device_function saturate(x::Float32) = ccall("extern __nv_saturatef", llvmcall, Cfloat, (Cfloat,), x)

Expand Down
Loading