Skip to content

Commit 0431eb5

Browse files
sezrubyclaude
andcommitted
[GLUTEN-10511][VL][Delta] Use getPartitionArray for partition-count assertion
`getPartitions` reflects post-coalesce splits (Velox merges small per-partition files into one input split), which made the assertion flaky for tables with tiny files. Switch to `getPartitionArray` -- the pre-coalesce list of partition directories selected by the executed scan -- which is the actual call site exercised by the bug fix. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 091a6ca commit 0431eb5

1 file changed

Lines changed: 17 additions & 13 deletions

File tree

gluten-delta/src/test/scala/org/apache/gluten/execution/DeltaSuite.scala

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -88,13 +88,17 @@ abstract class DeltaSuite extends WholeStageTransformerSuite {
8888
private def deltaInputFileCount(df: org.apache.spark.sql.DataFrame): Int =
8989
df.inputFiles.length
9090

91-
// Counts splits the native scan will execute. Reflects the post-Gluten-rewrite state, so it
92-
// catches regressions where Gluten silently dropped pruning (the pre-fix behavior).
93-
private def scanPartitionCount(df: org.apache.spark.sql.DataFrame): Int = {
91+
// Counts the partition directories selected by the executed scan after Gluten's rewrite.
92+
// Backed by `selectedPartitions` -> `relation.location.listFiles(partitionFilters, dataFilters)`,
93+
// which is the exact call site of issue #10511: pre-fix, physical-named partition filters
94+
// could not match Delta's logical partition schema, so this returned all directories. We use
95+
// `getPartitionArray` rather than `getPartitions` because the latter reflects post-coalesce
96+
// splits (Velox may merge small files into one split, hiding the per-partition count).
97+
private def selectedPartitionCount(df: org.apache.spark.sql.DataFrame): Int = {
9498
val scan = df.queryExecution.executedPlan.collect {
9599
case f: DeltaScanTransformer => f
96100
}.head
97-
scan.getPartitions.size
101+
scan.getPartitionArray.length
98102
}
99103

100104
// Regression for issue #10511: with column mapping, a partition column filter must prune
@@ -121,27 +125,27 @@ abstract class DeltaSuite extends WholeStageTransformerSuite {
121125
checkLengthAndPlan(df1, 1)
122126
checkAnswer(df1, Row("v2") :: Nil)
123127
assert(deltaInputFileCount(df1) == 1, "Delta should prune to 1 file")
124-
assert(scanPartitionCount(df1) == 1, "native scan should see 1 split")
128+
assert(selectedPartitionCount(df1) == 1, "native scan should see 1 split")
125129

126130
// Range on partition column (the exact case from the bug report).
127131
val df2 = runQueryAndCompare("select name from delta_cm_part where id > 2") { _ => }
128132
checkLengthAndPlan(df2, 1)
129133
checkAnswer(df2, Row("v3") :: Nil)
130134
assert(deltaInputFileCount(df2) == 1)
131-
assert(scanPartitionCount(df2) == 1)
135+
assert(selectedPartitionCount(df2) == 1)
132136

133137
// IN list on partition column. 2 of 3 partitions match.
134138
val df3 =
135139
runQueryAndCompare("select name from delta_cm_part where id in (1, 3)") { _ => }
136140
checkLengthAndPlan(df3, 2)
137141
checkAnswer(df3, Row("v1") :: Row("v3") :: Nil)
138142
assert(deltaInputFileCount(df3) == 2)
139-
assert(scanPartitionCount(df3) == 2)
143+
assert(selectedPartitionCount(df3) == 2)
140144

141145
// No filter -- baseline: all 3 partitions read.
142146
val dfAll = runQueryAndCompare("select name from delta_cm_part") { _ => }
143147
assert(deltaInputFileCount(dfAll) == 3)
144-
assert(scanPartitionCount(dfAll) == 3)
148+
assert(selectedPartitionCount(dfAll) == 3)
145149
}
146150
}
147151

@@ -165,15 +169,15 @@ abstract class DeltaSuite extends WholeStageTransformerSuite {
165169
checkLengthAndPlan(df, 1)
166170
checkAnswer(df, Row("v2") :: Nil)
167171
assert(deltaInputFileCount(df) == 1, "Delta should prune to 1 file with both filters")
168-
assert(scanPartitionCount(df) == 1)
172+
assert(selectedPartitionCount(df) == 1)
169173

170174
// Filter on only one of two partition columns.
171175
val df2 = runQueryAndCompare(
172176
"select name from delta_cm_part_multi where region = 'eu'") { _ => }
173177
checkLengthAndPlan(df2, 2)
174178
checkAnswer(df2, Row("v3") :: Row("v4") :: Nil)
175179
assert(deltaInputFileCount(df2) == 2)
176-
assert(scanPartitionCount(df2) == 2)
180+
assert(selectedPartitionCount(df2) == 2)
177181
}
178182
}
179183

@@ -229,13 +233,13 @@ abstract class DeltaSuite extends WholeStageTransformerSuite {
229233
"select name from delta_cm_part_null where id is null") { _ => }
230234
checkAnswer(df1, Row("vn") :: Nil)
231235
assert(deltaInputFileCount(df1) == 1)
232-
assert(scanPartitionCount(df1) == 1)
236+
assert(selectedPartitionCount(df1) == 1)
233237

234238
val df2 = runQueryAndCompare(
235239
"select name from delta_cm_part_null where id is not null") { _ => }
236240
checkAnswer(df2, Row("v1") :: Row("v2") :: Nil)
237241
assert(deltaInputFileCount(df2) == 2)
238-
assert(scanPartitionCount(df2) == 2)
242+
assert(selectedPartitionCount(df2) == 2)
239243
}
240244
}
241245

@@ -260,7 +264,7 @@ abstract class DeltaSuite extends WholeStageTransformerSuite {
260264
checkLengthAndPlan(df, 2)
261265
checkAnswer(df, Row("v2") :: Row("v3") :: Nil)
262266
assert(deltaInputFileCount(df) == 2)
263-
assert(scanPartitionCount(df) == 2)
267+
assert(selectedPartitionCount(df) == 2)
264268
}
265269
}
266270

0 commit comments

Comments
 (0)