ROCm · carsonbrownlee · Apr 15, 2025 · Jul 8, 2025 · Jul 9, 2025 · Jul 11, 2025
@@ -1247,7 +1247,7 @@ Tests:
   beta: [ 0.0, 2.0 ]
   unit_check: 1
   algo_method: [2]
-  gpu_arch: '942'
+  gpu_arch: '9(42 | 50)'
 
 - name: matmul_groupedgemm_zero_n
   category: pre_checkin
@@ -1648,7 +1648,7 @@ Tests:
   alpha: 1
   beta: [ 0.0, 2.0 ]
   unit_check: 1
-  gpu_arch: '942'
+  gpu_arch: '9(42 | 50)'
 
 - name: matmul_gemm_double
   category: pre_checkin

@@ -72,7 +72,11 @@ class ArgumentModel
                   double                      cpu_us,
                   double                      norm,
                   double                      atol,
-                  double                      rtol)
+                  double                      rtol,
+                  double                      absErrorMax,
+                  double                      absErrorAvg,
+                  double                      relErrorMax,
+                  double                      relErrorAvg)
     {
         // requires enablement for frequency logging
         ArgumentModel_log_performance(name_line, val_line);
@@ -154,6 +158,11 @@ class ArgumentModel
                     else
                         val_line << "," << rtol;
                 }
+                if (absErrorMax != ArgumentLogging::NA_value)
+                {
+                    name_line << ",absErrorMax,absErrorAvg,relErrorMax,relErrorAvg";
+                    val_line << "," << absErrorMax << "," << absErrorAvg << "," << relErrorMax << "," << relErrorAvg;
+                }
             }
         }
     }
@@ -176,7 +185,11 @@ class ArgumentModel
                   double                      cpu_us = ArgumentLogging::NA_value,
                   double                      norm   = ArgumentLogging::NA_value,
                   double                      atol   = ArgumentLogging::NA_value,
-                  double                      rtol   = ArgumentLogging::NA_value)
+                  double                      rtol   = ArgumentLogging::NA_value,
+                  double                      absErrorMax = ArgumentLogging::NA_value,
+                  double                      absErrorAvg   = ArgumentLogging::NA_value,
+                  double                      relErrorMax   = ArgumentLogging::NA_value,
+                  double                      relErrorAvg   = ArgumentLogging::NA_value)
     {
         hipblaslt_internal_ostream name_list;
         hipblaslt_internal_ostream value_list;
@@ -261,7 +274,11 @@ class ArgumentModel
                      cpu_us,
                      norm,
                      atol,
-                     rtol);
+                     rtol,
+                     absErrorMax,
+                     absErrorAvg,
+                     relErrorMax,
+                     relErrorAvg);
 
         if(archName != "")
         {

@@ -831,7 +831,11 @@ void check(hipStream_t                   stream,
            hipDataType                   To,
            hipDataType                   Tbias,
            hipDataType                   Taux,
-           hipDataType                   Tc)
+           hipDataType                   Tc,
+           double&                       absErrorMax,
+           double&                       absErrorAvg,
+           double&                       relErrorMax,
+           double&                       relErrorAvg)
 {
     // fetch GPU
     CHECK_HIP_ERROR(hipStreamSynchronize(stream));
@@ -1040,7 +1044,11 @@ void check(hipStream_t                   stream,
                                                       num_batches[gemmIdx],
                                                       hipblaslt_atol,
                                                       hipblaslt_rtol,
-                                                      To);
+                                                      To,
+                                                      absErrorMax,
+                                                      absErrorAvg,
+                                                      relErrorMax,
+                                                      relErrorAvg);
             //TODO: confirm if allclose_check_assert is neccessary
         }
     }
@@ -3080,15 +3088,17 @@ void testing_matmul_with_bias(const Arguments& arg,
         }
 
         // For the xf32 xdl math op, cast type of A/B from float to xfloat32 .
-        if(TiA == HIP_R_32F && TiB == HIP_R_32F && To == HIP_R_32F && Talpha == HIP_R_32F)
-            if(arg.compute_type == HIPBLAS_COMPUTE_32F_FAST_TF32)
+        // NOTE: This cast is disabled because it should not be needed: converting the
+        // inputs to TF32 would make the reference values less accurate.
+        /*if(TiA == HIP_R_32F && TiB == HIP_R_32F && To == HIP_R_32F && Talpha == HIP_R_32F)
+            if(arg.compute_type == HIPBLAS_COMPUTE_32F_FAST_TF32)
                 for(int i = 0; i < gemm_count; i++)
                 {
                     type_to_xdl_math_op_type<hipblasLtXfloat32, float, float>(
                         static_cast<float*>(hA[i].buf()), size_A[i]);
                     type_to_xdl_math_op_type<hipblasLtXfloat32, float, float>(
                         static_cast<float*>(hB[i].buf()), size_B[i]);
-                }
+                }*/
 
 #define epilogue_param                                                                      \
     M[gemmIdx], N[gemmIdx], ldd[gemmIdx],                                                   \
@@ -3360,6 +3370,11 @@ void testing_matmul_with_bias(const Arguments& arg,
         }
     }
 
+    double absErrorMax = 0;
+    double absErrorAvg = 0;
+    double relErrorMax = 0;
+    double relErrorAvg = 0;
+
     if(!arg.timing)
     {
         for(size_t sol = 0; sol < heuristicResult.size(); sol++)
@@ -3480,7 +3495,11 @@ void testing_matmul_with_bias(const Arguments& arg,
                       To,
                       Tbias,
                       Taux,
-                      Talpha);
+                      Talpha,
+                      absErrorMax,
+                      absErrorAvg,
+                      relErrorMax,
+                      relErrorAvg);
             }
         }
     }
@@ -3883,7 +3902,11 @@ void testing_matmul_with_bias(const Arguments& arg,
                       To,
                       Tbias,
                       Taux,
-                      Talpha);
+                      Talpha,
+                      absErrorMax,
+                      absErrorAvg,
+                      relErrorMax,
+                      relErrorAvg);
             }
 
 #define argument_param                                                                            \
@@ -3952,7 +3975,11 @@ void testing_matmul_with_bias(const Arguments& arg,
                     cpu_time_used,
                     hipblaslt_error,
                     hipblaslt_atol,
-                    hipblaslt_rtol);
+                    hipblaslt_rtol,
+                    absErrorMax,
+                    absErrorAvg,
+                    relErrorMax,
+                    relErrorAvg);
             }
             if(best_gpu_time > gpu_time_used)
             {
@@ -4010,7 +4037,11 @@ void testing_matmul_with_bias(const Arguments& arg,
                 cpu_time_used,
                 best_norm,
                 best_atol,
-                best_rtol);
+                best_rtol,
+                absErrorMax,
+                absErrorAvg,
+                relErrorMax,
+                relErrorAvg);
         }
     }