feat: add dp4a device intrinsic

JohnCobbler · JohnCobbler · commit 99c2eedf35a9 · 2026-06-05T10:53:33.000Z
Add CUDACore.dp4a with the four signedness variants of the PTX dp4a
instruction (packed 4-element int8/uint8 dot product with 32-bit
accumulate), available on sm_61 and later.

On LLVM 21 and later the implementation uses the @llvm.nvvm.idp4a.[us].[us]
intrinsics added in LLVM 21; on older versions it falls back to inline PTX
via @asmcall. Both paths verified on sm_75: identical dp4a instruction
selection and bit-identical results against a byte-wise reference, on
Julia 1.11 (LLVM 16, asm path) and nightly (LLVM 21, intrinsic path).
diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
@@ -3,7 +3,7 @@
 # we only use libdevice where needed. if possible, we go through LLVM instead,
 # ideally relying on Julia's existing definitions.
 
-@public fma, rsqrt, saturate, byte_perm, assume
+@public fma, rsqrt, saturate, byte_perm, dp4a, assume
 @public add_rn, add_rz, add_rm, add_rp
 @public sub_rn, sub_rz, sub_rm, sub_rp
 @public mul_rn, mul_rz, mul_rm, mul_rp
@@ -286,6 +286,60 @@ end
     ccall("extern __nv_byte_perm", llvmcall, Int32, (UInt32, UInt32, UInt32), x, y, z)
 end
 
+"""
+    dp4a(a, b, c)
+
+Packed 4-element int8 (or uint8) dot product with 32-bit accumulation, mapped to a single
+PTX `dp4a` instruction on sm_61+.
+
+The method is chosen by the signedness of `a` and `b`, which also fixes the type of `c`:
+
+- `dp4a(a::Int32,  b::Int32,  c::Int32)  -> Int32`  — signed × signed
+- `dp4a(a::Int32,  b::UInt32, c::Int32)  -> Int32`  — signed × unsigned
+- `dp4a(a::UInt32, b::Int32,  c::Int32)  -> Int32`  — unsigned × signed
+- `dp4a(a::UInt32, b::UInt32, c::UInt32) -> UInt32` — unsigned × unsigned
+
+`a` and `b` are each read as four packed 8-bit integers, signed for `Int32` and unsigned
+for `UInt32`. Writing `x[i]` for byte `i` of `x` (zero-based, not a Julia index), the
+result is `c + a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]`.
+
+!!! note
+    Requires compute capability sm_61 or higher.
+"""
+function dp4a end
+
+@static if LLVM.version() >= v"21"
+    # LLVM 21 added @llvm.nvvm.idp4a.[us].[us] (first letter: signedness of `a`, second:
+    # of `b`); prefer it over inline PTX so it participates in optimization and isel.
+    @device_function dp4a(a::Int32, b::Int32, c::Int32) =
+        ccall("llvm.nvvm.idp4a.s.s", llvmcall, Int32, (Int32, Int32, Int32), a, b, c)
+
+    @device_function dp4a(a::Int32, b::UInt32, c::Int32) =
+        ccall("llvm.nvvm.idp4a.s.u", llvmcall, Int32, (Int32, UInt32, Int32), a, b, c)
+
+    @device_function dp4a(a::UInt32, b::Int32, c::Int32) =
+        ccall("llvm.nvvm.idp4a.u.s", llvmcall, Int32, (UInt32, Int32, Int32), a, b, c)
+
+    @device_function dp4a(a::UInt32, b::UInt32, c::UInt32) =
+        ccall("llvm.nvvm.idp4a.u.u", llvmcall, UInt32, (UInt32, UInt32, UInt32), a, b, c)
+else  # LLVM < 21 has no idp4a intrinsic, so emit the instruction through inline PTX
+    @device_function dp4a(a::Int32, b::Int32, c::Int32) =
+        @asmcall("dp4a.s32.s32 \$0, \$1, \$2, \$3;", "=r,r,r,r", false,
+                 Int32, Tuple{Int32, Int32, Int32}, a, b, c)  # $0 = result; $1..$3 = a, b, c
+
+    @device_function dp4a(a::Int32, b::UInt32, c::Int32) =
+        @asmcall("dp4a.s32.u32 \$0, \$1, \$2, \$3;", "=r,r,r,r", false,
+                 Int32, Tuple{Int32, UInt32, Int32}, a, b, c)
+
+    @device_function dp4a(a::UInt32, b::Int32, c::Int32) =
+        @asmcall("dp4a.u32.s32 \$0, \$1, \$2, \$3;", "=r,r,r,r", false,
+                 Int32, Tuple{UInt32, Int32, Int32}, a, b, c)
+
+    @device_function dp4a(a::UInt32, b::UInt32, c::UInt32) =
+        @asmcall("dp4a.u32.u32 \$0, \$1, \$2, \$3;", "=r,r,r,r", false,
+                 UInt32, Tuple{UInt32, UInt32, UInt32}, a, b, c)
+end
+
 
 ## floating-point handling
 
diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
@@ -339,6 +339,134 @@ using SpecialFunctions
         end
     end
 
+    # dp4a requires sm_61+
+    if capability(device()) >= v"6.1"
+    @testset "dp4a" begin
+        # Pure-Julia reference: unpack four bytes from a packed Int32/UInt32,
+        # dot-product them (with the respective signed/unsigned semantics), and
+        # add the accumulator.
+        function ref_dp4a_ss(a::Int32, b::Int32, c::Int32)
+            # Both operands are split into signed bytes and widened before multiplying.
+            xs, ys = reinterpret(NTuple{4,Int8}, a), reinterpret(NTuple{4,Int8}, b)
+            products = ntuple(i -> Int32(xs[i]) * Int32(ys[i]), 4)
+            return foldl(+, products; init=c)
+        end
+        function ref_dp4a_su(a::Int32, b::UInt32, c::Int32)
+            # `a` yields signed bytes, `b` unsigned bytes; both are widened to Int32.
+            xs, ys = reinterpret(NTuple{4,Int8}, a), reinterpret(NTuple{4,UInt8}, b)
+            products = ntuple(i -> Int32(xs[i]) * Int32(ys[i]), 4)
+            return foldl(+, products; init=c)
+        end
+        function ref_dp4a_us(a::UInt32, b::Int32, c::Int32)
+            # `a` yields unsigned bytes, `b` signed bytes; both are widened to Int32.
+            xs, ys = reinterpret(NTuple{4,UInt8}, a), reinterpret(NTuple{4,Int8}, b)
+            products = ntuple(i -> Int32(xs[i]) * Int32(ys[i]), 4)
+            return foldl(+, products; init=c)
+        end
+        function ref_dp4a_uu(a::UInt32, b::UInt32, c::UInt32)
+            # Both operands are split into unsigned bytes and widened to UInt32.
+            xs, ys = reinterpret(NTuple{4,UInt8}, a), reinterpret(NTuple{4,UInt8}, b)
+            products = ntuple(i -> UInt32(xs[i]) * UInt32(ys[i]), 4)
+            return foldl(+, products; init=c)
+        end
+
+        # Kernels: each stores one dp4a result into `out`; the tests launch them with a
+        # single thread. All four bodies are identical and differ only in name.
+        function kernel_ss(out, a, b, c)
+            out[] = CUDA.dp4a(a, b, c)
+            return
+        end
+        function kernel_su(out, a, b, c)
+            out[] = CUDA.dp4a(a, b, c)
+            return
+        end
+        function kernel_us(out, a, b, c)
+            out[] = CUDA.dp4a(a, b, c)
+            return
+        end
+        function kernel_uu(out, a, b, c)
+            out[] = CUDA.dp4a(a, b, c)
+            return
+        end
+
+        # Helper: pack four values into one word after truncating each to Int8/UInt8.
+        # `reinterpret` of the byte tuple follows host byte order, so b0 lands in bits 7:0
+        # on a little-endian host; this sidesteps the width pitfalls of shift+or packing.
+        pack_s(b0, b1, b2, b3) = reinterpret(Int32,  map(b -> b % Int8,  (b0, b1, b2, b3)))
+        pack_u(b0, b1, b2, b3) = reinterpret(UInt32, map(b -> b % UInt8, (b0, b1, b2, b3)))
+
+        @testset "ss — signed × signed" begin
+            cases = [
+                # each case is an (a, b, c) argument triple
+                (Int32(0), Int32(0), Int32(0)),                       # all zeros
+                (pack_s(1,2,3,4), pack_s(5,6,7,8), Int32(10)),       # 1*5+2*6+3*7+4*8+10 = 80
+                (pack_s(127,127,127,127), pack_s(1,1,1,1), Int32(0)), # max positive bytes
+                (pack_s(-128,-128,-128,-128), pack_s(1,1,1,1), Int32(0)), # most-negative bytes
+                (pack_s(-1,-1,-1,-1), pack_s(-1,-1,-1,-1), Int32(0)), # neg*neg
+                (Int32(-1), Int32(-1), Int32(100)),                   # all bytes 0xFF (-1), c != 0
+            ]
+            for (a, b, c) in cases
+                expected = ref_dp4a_ss(a, b, c)  # host-side byte-wise reference
+                buf = CuArray{Int32}(undef, 1)
+                @cuda threads=1 kernel_ss(buf, a, b, c)
+                @test Array(buf)[] == expected  # copy the single result back and compare
+            end
+        end
+
+        @testset "su — signed × unsigned" begin
+            cases = [
+                (Int32(0), UInt32(0), Int32(0)),                      # all zeros
+                (pack_s(1,2,3,4), pack_u(5,6,7,8), Int32(10)),        # 1*5+…+10 = 80
+                (pack_s(127,0,-128,1), pack_u(255,128,1,0), Int32(5)), # 127*255 - 128*1 + 5 = 32262
+                (pack_s(-1,-1,-1,-1), pack_u(255,255,255,255), Int32(0)), # -1 * 255 * 4 = -1020
+            ]
+            for (a, b, c) in cases
+                expected = ref_dp4a_su(a, b, c)  # host-side byte-wise reference
+                buf = CuArray{Int32}(undef, 1)
+                @cuda threads=1 kernel_su(buf, a, b, c)
+                @test Array(buf)[] == expected  # copy the single result back and compare
+            end
+        end
+
+        @testset "us — unsigned × signed" begin
+            cases = [  # (a, b, c) triples; the last one pins the unsigned reading of `a`
+                (UInt32(0), Int32(0), Int32(0)),
+                (pack_u(1,2,3,4), pack_s(5,6,7,8), Int32(10)),          # 1*5+…+10 = 80
+                (pack_u(255,128,0,1), pack_s(-1,1,-128,127), Int32(0)), # 0, also 0 if `a` were signed
+                (pack_u(255,255,255,255), pack_s(1,1,1,1), Int32(0)),   # 1020; -4 if `a` were signed
+            ]
+            for (a, b, c) in cases
+                buf = CuArray{Int32}(undef, 1)
+                @cuda threads=1 kernel_us(buf, a, b, c)
+                @test Array(buf)[] == ref_dp4a_us(a, b, c)  # device result vs host reference
+            end
+        end
+
+        @testset "uu — unsigned × unsigned" begin
+            cases = [
+                (UInt32(0), UInt32(0), UInt32(0)),                    # all zeros
+                (pack_u(1,2,3,4), pack_u(5,6,7,8), UInt32(10)),       # 1*5+2*6+3*7+4*8+10 = 80
+                (pack_u(255,255,255,255), pack_u(1,1,1,1), UInt32(0)), # 4*255 = 1020
+                (pack_u(255,255,255,255), pack_u(255,255,255,255), UInt32(0)), # 4*255^2 = 260100
+            ]
+            for (a, b, c) in cases
+                expected = ref_dp4a_uu(a, b, c)  # host-side byte-wise reference
+                buf = CuArray{UInt32}(undef, 1)
+                @cuda threads=1 kernel_uu(buf, a, b, c)
+                @test Array(buf)[] == expected  # copy the single result back and compare
+            end
+        end
+
+        @testset "PTX instruction selection" begin
+            # Match the full mnemonic rather than the bare substring "dp4a", which would
+            # also be satisfied by a symbol name (e.g. a non-inlined `dp4a` function).
+            buf = CuArray{Int32}(undef, 1)
+            ptx = sprint(io->(@device_code_ptx io=io @cuda launch=false kernel_ss(buf, Int32(0), Int32(0), Int32(0))))
+            @test occursin("dp4a.s32.s32", ptx)
+        end
+    end
+    end # capability >= v"6.1"
+
     @testset "@fastmath sincos" begin
         # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos
         @test @filecheck CUDA.code_ptx(NTuple{3,CuDeviceArray{Float32,1,AS.Global}}) do a, b, c