@@ -339,6 +339,134 @@ using SpecialFunctions
339339 end
340340 end
341341
342+ # dp4a requires sm_61+
343+ if capability (device ()) >= v " 6.1"
344+ @testset " dp4a" begin
345+ # Pure-Julia reference: unpack four bytes from a packed Int32/UInt32,
346+ # dot-product them (with the respective signed/unsigned semantics), and
347+ # add the accumulator.
# Host-side reference for the signed × signed case: view `a` and `b` as four
# Int8 lanes each, widen every lane to Int32, and add the lane-wise products
# onto the accumulator `c`. All arithmetic is carried out in Int32.
function ref_dp4a_ss(a::Int32, b::Int32, c::Int32)
    lanes_a = reinterpret(NTuple{4,Int8}, a)
    lanes_b = reinterpret(NTuple{4,Int8}, b)
    acc = c
    for i in 1:4
        acc += Int32(lanes_a[i]) * Int32(lanes_b[i])
    end
    return acc
end
# Host-side reference for the signed × unsigned case: `a` is unpacked as four
# Int8 lanes, `b` as four UInt8 lanes. Each lane is widened to Int32 before
# multiplying, and the products are added onto the Int32 accumulator `c`.
function ref_dp4a_su(a::Int32, b::UInt32, c::Int32)
    signed_lanes = reinterpret(NTuple{4,Int8}, a)
    unsigned_lanes = reinterpret(NTuple{4,UInt8}, b)
    acc = c
    for (x, y) in zip(signed_lanes, unsigned_lanes)
        acc += Int32(x) * Int32(y)
    end
    return acc
end
# Host-side reference for the unsigned × signed case: `a` is unpacked as four
# UInt8 lanes, `b` as four Int8 lanes. Each lane is widened to Int32 before
# multiplying, and the products are added onto the Int32 accumulator `c`.
function ref_dp4a_us(a::UInt32, b::Int32, c::Int32)
    unsigned_lanes = reinterpret(NTuple{4,UInt8}, a)
    signed_lanes = reinterpret(NTuple{4,Int8}, b)
    acc = c
    for (x, y) in zip(unsigned_lanes, signed_lanes)
        acc += Int32(x) * Int32(y)
    end
    return acc
end
# Host-side reference for the unsigned × unsigned case: both operands are
# unpacked as four UInt8 lanes, widened to UInt32, multiplied lane-wise and
# added onto the UInt32 accumulator `c`.
function ref_dp4a_uu(a::UInt32, b::UInt32, c::UInt32)
    lanes_a = reinterpret(NTuple{4,UInt8}, a)
    lanes_b = reinterpret(NTuple{4,UInt8}, b)
    acc = c
    for i in 1:4
        acc += UInt32(lanes_a[i]) * UInt32(lanes_b[i])
    end
    return acc
end
372+
# Kernels: each one stores a single dp4a result in `out[]`. Every test case
# gets its own one-thread launch, which keeps the kernel signatures simple.
# Kernel launched by the "ss" testset: evaluates `CUDA.dp4a(a, b, c)` and
# stores the value in `out[]`.
function kernel_ss(out, a, b, c)
    acc = CUDA.dp4a(a, b, c)
    out[] = acc
    return nothing
end
# Kernel launched by the "su" testset: evaluates `CUDA.dp4a(a, b, c)` and
# stores the value in `out[]`.
function kernel_su(out, a, b, c)
    acc = CUDA.dp4a(a, b, c)
    out[] = acc
    return nothing
end
# Kernel launched by the "us" testset: evaluates `CUDA.dp4a(a, b, c)` and
# stores the value in `out[]`.
function kernel_us(out, a, b, c)
    acc = CUDA.dp4a(a, b, c)
    out[] = acc
    return nothing
end
# Kernel launched by the "uu" testset: evaluates `CUDA.dp4a(a, b, c)` and
# stores the value in `out[]`.
function kernel_uu(out, a, b, c)
    acc = CUDA.dp4a(a, b, c)
    out[] = acc
    return nothing
end
391+
392+ # Helper: pack four Int8/UInt8 values (little-endian: b0 in bits 7:0).
393+ # Use reinterpret(Int32/UInt32, NTuple{4,Int8/UInt8}) — portable and avoids
394+ # integer-width pitfalls in the shift+or approach.
# Truncate each argument to Int8 (via `% Int8`) and reinterpret the resulting
# 4-tuple as one Int32.
function pack_s(b0, b1, b2, b3)
    lanes = map(x -> x % Int8, (b0, b1, b2, b3))
    return reinterpret(Int32, lanes)
end
# Truncate each argument to UInt8 (via `% UInt8`) and reinterpret the resulting
# 4-tuple as one UInt32.
function pack_u(b0, b1, b2, b3)
    lanes = map(x -> x % UInt8, (b0, b1, b2, b3))
    return reinterpret(UInt32, lanes)
end
397+
@testset " ss — signed × signed" begin
    # Each case is an (a, b, c) triple: two packed Int32 operands and the
    # Int32 accumulator.
    cases = NTuple{3,Int32}[
        (Int32(0), Int32(0), Int32(0)),                             # all zeros
        (pack_s(1, 2, 3, 4), pack_s(5, 6, 7, 8), Int32(10)),        # 1*5+2*6+3*7+4*8+10 = 80
        (pack_s(127, 127, 127, 127), pack_s(1, 1, 1, 1), Int32(0)), # max positive bytes
        (pack_s(-128, -128, -128, -128), pack_s(1, 1, 1, 1), Int32(0)), # most-negative bytes
        (pack_s(-1, -1, -1, -1), pack_s(-1, -1, -1, -1), Int32(0)), # neg*neg
        (Int32(-1), Int32(-1), Int32(100)),                         # 0xFF packing
    ]
    for (lhs, rhs, acc) in cases
        # Fresh one-element device buffer per case; the kernel overwrites it.
        result = CuArray{Int32}(undef, 1)
        @cuda threads=1 kernel_ss(result, lhs, rhs, acc)
        @test Array(result)[] == ref_dp4a_ss(lhs, rhs, acc)
    end
end
415+
@testset " su — signed × unsigned" begin
    # Each case is an (a, b, c) triple: signed-packed `a`, unsigned-packed `b`,
    # and the Int32 accumulator.
    cases = Tuple{Int32,UInt32,Int32}[
        (Int32(0), UInt32(0), Int32(0)),
        (pack_s(1, 2, 3, 4), pack_u(5, 6, 7, 8), Int32(10)),             # 1*5+…+10 = 80
        (pack_s(127, 0, -128, 1), pack_u(255, 128, 1, 0), Int32(5)),
        (pack_s(-1, -1, -1, -1), pack_u(255, 255, 255, 255), Int32(0)),  # -1 * 255 * 4 = -1020
    ]
    for (lhs, rhs, acc) in cases
        # Fresh one-element device buffer per case; the kernel overwrites it.
        result = CuArray{Int32}(undef, 1)
        @cuda threads=1 kernel_su(result, lhs, rhs, acc)
        @test Array(result)[] == ref_dp4a_su(lhs, rhs, acc)
    end
end
430+
@testset " us — unsigned × signed" begin
    # Each case is an (a, b, c) triple: unsigned-packed `a`, signed-packed `b`,
    # and the Int32 accumulator.
    cases = Tuple{UInt32,Int32,Int32}[
        (UInt32(0), Int32(0), Int32(0)),
        (pack_u(1, 2, 3, 4), pack_s(5, 6, 7, 8), Int32(10)),
        (pack_u(255, 128, 0, 1), pack_s(-1, 1, -128, 127), Int32(0)),
    ]
    for (lhs, rhs, acc) in cases
        # Fresh one-element device buffer per case; the kernel overwrites it.
        result = CuArray{Int32}(undef, 1)
        @cuda threads=1 kernel_us(result, lhs, rhs, acc)
        @test Array(result)[] == ref_dp4a_us(lhs, rhs, acc)
    end
end
444+
@testset " uu — unsigned × unsigned" begin
    # Each case is an (a, b, c) triple: two unsigned-packed operands and the
    # UInt32 accumulator.
    cases = NTuple{3,UInt32}[
        (UInt32(0), UInt32(0), UInt32(0)),
        (pack_u(1, 2, 3, 4), pack_u(5, 6, 7, 8), UInt32(10)),                   # 80
        (pack_u(255, 255, 255, 255), pack_u(1, 1, 1, 1), UInt32(0)),            # 4*255 = 1020
        (pack_u(255, 255, 255, 255), pack_u(255, 255, 255, 255), UInt32(0)),    # 4*255^2 = 260100
    ]
    for (lhs, rhs, acc) in cases
        # Fresh one-element device buffer per case; the kernel overwrites it.
        result = CuArray{UInt32}(undef, 1)
        @cuda threads=1 kernel_uu(result, lhs, rhs, acc)
        @test Array(result)[] == ref_dp4a_uu(lhs, rhs, acc)
    end
end
459+
@testset " PTX instruction selection" begin
    # Verify the backend emits the actual dp4a instruction, not a software
    # emulation sequence. The kernel is compiled only (`launch=false`) and the
    # generated PTX is captured as a string.
    buf = CuArray{Int32}(undef, 1)
    ptx = sprint() do io
        @device_code_ptx io=io @cuda launch=false kernel_ss(buf, Int32(0), Int32(0), Int32(0))
    end
    # Match the fully typed mnemonic rather than the bare substring "dp4a":
    # the bare substring would also match a symbol name containing "dp4a"
    # (for example a non-inlined helper), letting the test pass even though
    # no dp4a instruction was selected.
    @test occursin(r"dp4a\.s32\.s32", ptx)
end
467+ end
468+ end # capability >= v"6.1"
469+
342470 @testset " @fastmath sincos" begin
343471 # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos
344472 @test @filecheck CUDA. code_ptx (NTuple{3 ,CuDeviceArray{Float32,1 ,AS. Global}}) do a, b, c
0 commit comments