Address review: move Gluten checksum tests to dedicated suite

jaylisde · jaylisde · commit 0836a1929f63 · 2026-06-04T03:32:04.000Z
Review feedback from philo-he on PR #12067: - Drop stale TODO comment above enableSuite[GlutenMapStatusEndToEndSuite] in VeloxTestSettings. - Move the two Gluten-specific row-based checksum tests out of GlutenMapStatusEndToEndSuite (which is a wrapper around upstream Spark's MapStatusEndToEndSuite) into a new dedicated GlutenRowBasedChecksumSuite under org.apache.spark.sql.gluten. The new suite extends GlutenSQLTestsTrait directly and configures LEAF_NODE_DEFAULT_PARALLELISM, CLASSIC_SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED, and GlutenConfig.GLUTEN_ANSI_FALLBACK_ENABLED via sparkConf rather than mutating the live session in beforeAll.
diff --git a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -40,7 +40,7 @@ import org.apache.spark.sql.execution.joins._
 import org.apache.spark.sql.execution.metric.{GlutenCustomMetricsSuite, GlutenSQLMetricsSuite}
 import org.apache.spark.sql.execution.python._
 import org.apache.spark.sql.extension.{GlutenCollapseProjectExecTransformerSuite, GlutenSessionExtensionSuite}
-import org.apache.spark.sql.gluten.{GlutenFallbackStrategiesSuite, GlutenFallbackSuite}
+import org.apache.spark.sql.gluten.{GlutenFallbackStrategiesSuite, GlutenFallbackSuite, GlutenRowBasedChecksumSuite}
 import org.apache.spark.sql.hive.execution._
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.streaming._
@@ -1026,7 +1026,6 @@ class VeloxTestSettings extends BackendTestSettings {
     // TODO: fix on Spark-4.1 introduced by https://github.com/apache/spark/pull/47856
     .exclude("SPARK-49386: test SortMergeJoin (with spill by size threshold)")
   enableSuite[GlutenMathFunctionsSuite]
-  // TODO: fix on Spark-4.1 see https://github.com/apache/spark/pull/50230
   enableSuite[GlutenMapStatusEndToEndSuite]
   enableSuite[GlutenMetadataCacheSuite]
     .exclude("SPARK-16336,SPARK-27961 Suggest fixing FileNotFoundException")
@@ -1093,6 +1092,7 @@ class VeloxTestSettings extends BackendTestSettings {
   enableSuite[GlutenUnsafeRowChecksumSuite]
   enableSuite[GlutenXPathFunctionsSuite]
   enableSuite[GlutenFallbackSuite]
+  enableSuite[GlutenRowBasedChecksumSuite]
   enableSuite[GlutenHashAggregationQuerySuite]
     // TODO: fix on https://github.com/apache/gluten/issues/11919
     .exclude("udaf with all data types")
diff --git a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/GlutenMapStatusEndToEndSuite.scala b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/GlutenMapStatusEndToEndSuite.scala
@@ -16,7 +16,6 @@
  */
 package org.apache.spark.sql
 
-import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.internal.SQLConf
 
 class GlutenMapStatusEndToEndSuite extends MapStatusEndToEndSuite with GlutenTestsTrait {
@@ -29,58 +28,5 @@ class GlutenMapStatusEndToEndSuite extends MapStatusEndToEndSuite with GlutenTes
     _spark.sparkContext.conf
       .set(SQLConf.CLASSIC_SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED.key, "false")
     _spark.conf.set(SQLConf.CLASSIC_SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED.key, "false")
-
-    // Disable ANSI fallback to force Gluten's ColumnarShuffleWriter path.
-    _spark.conf.set("spark.gluten.sql.ansiFallback.enabled", "false")
-  }
-
-  import org.apache.spark.MapOutputTrackerMaster
-
-  private def getLatestShuffleChecksumValues(): Array[Long] = {
-    val tracker = _spark.sparkContext.env.mapOutputTracker
-      .asInstanceOf[MapOutputTrackerMaster]
-    val latestShuffleId = tracker.shuffleStatuses.keys.max
-    tracker.shuffleStatuses(latestShuffleId).mapStatuses.map(_.checksumValue)
-  }
-
-  test("Gluten row-based checksum is deterministic") {
-    withSQLConf(
-      SQLConf.SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED.key -> "true",
-      SQLConf.CLASSIC_SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED.key -> "false") {
-      withTable("t_det1", "t_det2") {
-        _spark.range(500).repartition(5, col("id")).write.mode("overwrite").saveAsTable("t_det1")
-        val checksums1 = getLatestShuffleChecksumValues()
-
-        _spark.range(500).repartition(5, col("id")).write.mode("overwrite").saveAsTable("t_det2")
-        val checksums2 = getLatestShuffleChecksumValues()
-
-        // Same input -> same checksumValue (deterministic)
-        assert(
-          checksums1.zip(checksums2).forall { case (a, b) => a == b },
-          s"Checksums not deterministic: ${checksums1.toSeq} vs ${checksums2.toSeq}")
-      }
-    }
-  }
-
-  test("Gluten row-based checksum detects data change") {
-    withSQLConf(
-      SQLConf.SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED.key -> "true",
-      SQLConf.CLASSIC_SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED.key -> "false") {
-      withTable("t_diff1", "t_diff2") {
-        _spark.range(500).repartition(5, col("id")).write.mode("overwrite").saveAsTable("t_diff1")
-        val checksums1 = getLatestShuffleChecksumValues()
-
-        // Different data
-        _spark.range(
-          500,
-          1000).repartition(5, col("id")).write.mode("overwrite").saveAsTable("t_diff2")
-        val checksums2 = getLatestShuffleChecksumValues()
-
-        // Different input -> different checksumValue
-        assert(
-          checksums1.zip(checksums2).exists { case (a, b) => a != b },
-          s"Checksums should differ for different data: ${checksums1.toSeq} vs ${checksums2.toSeq}")
-      }
-    }
   }
 }
diff --git a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/gluten/GlutenRowBasedChecksumSuite.scala b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/gluten/GlutenRowBasedChecksumSuite.scala
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.gluten
+
+import org.apache.gluten.config.GlutenConfig
+
+import org.apache.spark.{MapOutputTrackerMaster, SparkConf}
+import org.apache.spark.sql.GlutenSQLTestsTrait
+import org.apache.spark.sql.functions.col
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * End-to-end tests for the row-based checksum (SPARK-51756) computed by Gluten's
+ * ColumnarShuffleWriter. Verifies that `MapStatus.checksumValue` is propagated, deterministic
+ * for identical input, and changes when row data changes.
+ *
+ * Every comparison first checks that the two writes registered distinct shuffles with the same,
+ * non-zero number of map statuses, so that an assertion cannot pass vacuously.
+ */
+class GlutenRowBasedChecksumSuite extends GlutenSQLTestsTrait {
+
+  override def sparkConf: SparkConf = {
+    super.sparkConf
+      .set(SQLConf.LEAF_NODE_DEFAULT_PARALLELISM.key, "5")
+      .set(SQLConf.CLASSIC_SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED.key, "false")
+      // Disable ANSI fallback to force Gluten's ColumnarShuffleWriter path.
+      .set(GlutenConfig.GLUTEN_ANSI_FALLBACK_ENABLED.key, "false")
+  }
+
+  /**
+   * Returns the id of the most recently registered shuffle (the largest shuffle id known to the
+   * driver's map output tracker) together with the checksum value of each of its map statuses.
+   */
+  private def getLatestShuffleChecksumValues(): (Int, Seq[Long]) = {
+    val tracker = spark.sparkContext.env.mapOutputTracker
+      .asInstanceOf[MapOutputTrackerMaster]
+    val statuses = tracker.shuffleStatuses
+    // `keys.max` throws an opaque UnsupportedOperationException when no shuffle is registered.
+    assert(statuses.nonEmpty, "No shuffle is registered in the MapOutputTracker")
+    val latestShuffleId = statuses.keys.max
+    (latestShuffleId, statuses(latestShuffleId).mapStatuses.map(_.checksumValue).toSeq)
+  }
+
+  /**
+   * Writes `spark.range(start, end)`, repartitioned by `id` into 5 partitions, to `table` and
+   * returns the shuffle id and per-map checksum values of the latest shuffle after the write.
+   */
+  private def writeRangeAndGetChecksums(
+      table: String,
+      start: Long,
+      end: Long): (Int, Seq[Long]) = {
+    spark.range(start, end).repartition(5, col("id")).write.mode("overwrite").saveAsTable(table)
+    getLatestShuffleChecksumValues()
+  }
+
+  /**
+   * Fails unless `first` and `second` come from two distinct shuffles and hold the same,
+   * non-zero number of checksum values, i.e. unless they can be compared element by element.
+   */
+  private def assertComparable(first: (Int, Seq[Long]), second: (Int, Seq[Long])): Unit = {
+    val (firstShuffleId, firstChecksums) = first
+    val (secondShuffleId, secondChecksums) = second
+    // Equal ids would mean the second write added no shuffle and the same statuses were re-read.
+    assert(
+      firstShuffleId != secondShuffleId,
+      s"Both writes resolved to shuffle $firstShuffleId; the second registered no new shuffle")
+    assert(firstChecksums.nonEmpty, "The first write reported no map statuses")
+    // `zip` truncates to the shorter side, so a length mismatch must be rejected explicitly.
+    assert(
+      firstChecksums.size == secondChecksums.size,
+      s"Map status count differs: $firstChecksums vs $secondChecksums")
+  }
+
+  test("Gluten row-based checksum is deterministic") {
+    withSQLConf(
+      SQLConf.SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED.key -> "true",
+      SQLConf.CLASSIC_SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED.key -> "false") {
+      withTable("t_det1", "t_det2") {
+        val first = writeRangeAndGetChecksums("t_det1", 0L, 500L)
+        val second = writeRangeAndGetChecksums("t_det2", 0L, 500L)
+        assertComparable(first, second)
+        val (_, checksums1) = first
+        val (_, checksums2) = second
+
+        // All-zero values would mean no checksum was propagated; equality would prove nothing.
+        assert(checksums1.exists(_ != 0L), s"No checksum was propagated: $checksums1")
+
+        // Same input -> same checksumValue (deterministic)
+        assert(
+          checksums1 == checksums2,
+          s"Checksums not deterministic: $checksums1 vs $checksums2")
+      }
+    }
+  }
+
+  test("Gluten row-based checksum detects data change") {
+    withSQLConf(
+      SQLConf.SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED.key -> "true",
+      SQLConf.CLASSIC_SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED.key -> "false") {
+      withTable("t_diff1", "t_diff2") {
+        val first = writeRangeAndGetChecksums("t_diff1", 0L, 500L)
+        // Different data
+        val second = writeRangeAndGetChecksums("t_diff2", 500L, 1000L)
+        assertComparable(first, second)
+        val (_, checksums1) = first
+        val (_, checksums2) = second
+
+        // Different input -> different checksumValue
+        assert(
+          checksums1.zip(checksums2).exists { case (a, b) => a != b },
+          s"Checksums should differ for different data: $checksums1 vs $checksums2")
+      }
+    }
+  }
+}