[GLUTEN] Fix wrong input_file_name() for BHJ build-side LocalRelation

taiyang-li · AIME · liyang.127 · commit 52b845f752ce · 2026-06-10T14:30:19.000+08:00
## Background

When running queries like below on Gluten (e.g. with the Velox backend),
`input_file_name()` returns an empty string:

```sql
SELECT a.event, input_file_name() AS fname
FROM   parquet_table a
JOIN  (SELECT 'X' AS k1, 123L AS k2) b
ON     a.event = b.k1 AND a.device_id = b.k2;
```

The physical plan contains (key fragment):

```
ProjectExecTransformer [..., input_file_name#1816 AS fname]
+- BroadcastHashJoinExecTransformer ..., BuildLeft
   :- InputIteratorTransformer[..., input_file_name#1816]
   :  +- RowToColumnar
   :     +- *(1) Project [..., input_file_name() AS input_file_name#1816]   ← BUG
   :        +- ColumnarToRow
   :           +- BroadcastQueryStage 0
   :              +- ColumnarBroadcastExchange
   :                 +- LocalTableScan [...]                                 ← no file context
   +- ProjectExecTransformer [..., input_file_name#1816]
      +- FileScanTransformer parquet ...[..., input_file_name#1816]          ← same ExprId
```

## Root cause

`PushDownInputFileExpression.PreOffload` originally injects
`Project [..., input_file_name() AS attr#N]` above **every** `LeafExecNode`.

1. When BHJ's build side is a `LocalTableScanExec` / `RangeExec` /
   `RDDScanExec` etc. (no real file context), nothing ever populates the
   `InputFileBlockHolder` thread-local for that leaf, so `input_file_name()`
   always returns `""` there.
2. Both leaves end up reusing the same `ExprId`. BHJ's output is
   `left ++ right`, so with `BuildLeft` the outer `Project` binds that
   `ExprId` to the first match, i.e. the build-side attribute holding the
   empty string, and the final query returns an empty file name.

## Fix

Only inject `input_file_name()` on leaves that can really populate
`InputFileBlockHolder`:

- `FileSourceScanExec`
- v2 `BatchScanExec`
- Hive table scan (`HiveTableScanExecTransformer.isHiveTableScan`)
- `BatchScanExecTransformerBase` (already special-cased in community)

In addition, the `ProjectExec` match in `PreOffload` now requires that the
subtree actually contains at least one such file-aware source via the new
`hasInputFileRelatedSource` helper. This avoids producing a fake
`input_file_name` attribute on non-file leaves and avoids polluting the
common ExprId with the empty string from the BHJ build side.

## Test

Added `input_file_name() with BHJ build-side LocalRelation must return real path`
in `ScalarFunctionsValidateSuite` (backends-velox) covering:

- `fname` column is non-empty for every joined row;
- `fname` contains the real parquet path;
- compared against vanilla Spark.

The existing `test("input_file_name")` (file/Hive scan paths) is unchanged
because those scans are still in the whitelist.

Co-Authored-By: AIME <aime@bytedance.com>
Change-Id: I77c1fa343444488fb8b71deb8dd0b13d587d2155
diff --git a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala
@@ -1077,6 +1077,50 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite {
     }
   }
 
+  test("input_file_name() with BHJ build-side LocalRelation must return real path") {
+    withTempPath {
+      path =>
+        Seq(("mp_cert_face_result", 3915800915739947L, "param1")) // one probe-side row
+          .toDF("event", "device_id", "params")
+          .write
+          .parquet(path.getCanonicalPath)
+        spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("event_log")
+
+        withSQLConf(
+          "spark.sql.autoBroadcastJoinThreshold" -> "10MB", // b is expected to be broadcast
+          "spark.sql.adaptive.enabled" -> "true"
+        ) {
+          val sql =
+            """
+              |SELECT  a.event,
+              |        a.params,
+              |        a.device_id,
+              |        input_file_name() AS fname
+              |FROM    event_log a
+              |JOIN
+              |        (
+              |            SELECT  'mp_cert_face_result' AS envent,
+              |                    3915800915739947 AS device_id
+              |        ) b
+              |ON      a.event     = b.envent
+              |AND     a.device_id = b.device_id
+              |""".stripMargin
+
+          compareResultsAgainstVanillaSpark(sql, true, { _ => }) // callback is a no-op
+
+          val df = spark.sql(sql)
+          val rows = df.collect()
+          assert(rows.nonEmpty, "Join should match at least one row")
+          rows.foreach {
+            r =>
+              val fname = r.getAs[String]("fname")
+              assert(fname != null && fname.nonEmpty) // "" was the buggy result
+              assert(fname.contains(path.getName)) // file lives under the temp dir
+          }
+        }
+    }
+  }
+
   testWithMinSparkVersion("array insert", "3.4") {
     withTempPath {
       path =>
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/extension/columnar/PushDownInputFileExpression.scala b/gluten-substrait/src/main/scala/org/apache/gluten/extension/columnar/PushDownInputFileExpression.scala
@@ -21,7 +21,8 @@ import org.apache.gluten.execution.{BatchScanExecTransformerBase, FileSourceScan
 import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, NamedExpression}
 import org.apache.spark.sql.catalyst.optimizer.CollapseProjectShim
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.execution.{DeserializeToObjectExec, LeafExecNode, ProjectExec, SerializeFromObjectExec, SparkPlan, UnionExec}
+import org.apache.spark.sql.execution.{DeserializeToObjectExec, FileSourceScanExec, LeafExecNode, ProjectExec, SerializeFromObjectExec, SparkPlan, UnionExec}
+import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
 import org.apache.spark.sql.hive.HiveTableScanExecTransformer
 
 import java.util.Locale
@@ -87,7 +88,8 @@ object PushDownInputFileExpression {
 
   object PreOffload extends Rule[SparkPlan] {
     override def apply(plan: SparkPlan): SparkPlan = plan.transformUp {
-      case ProjectExec(projectList, child) if projectList.exists(containsInputFileRelatedExpr) =>
+      case ProjectExec(projectList, child)
+          if projectList.exists(containsInputFileRelatedExpr) && hasInputFileRelatedSource(child) =>
         val replacedExprs = mutable.Map[String, Alias]()
         val newProjectList = projectList.map {
           expr => rewriteExpr(expr, replacedExprs).asInstanceOf[NamedExpression]
@@ -104,8 +106,10 @@ object PushDownInputFileExpression {
           // For BatchScanExecTransformerBase (includes Iceberg scans), add fallback tag
           // to prevent offloading when input_file expressions are present
           addFallbackTag(ProjectExec(p.output ++ replacedExprs.values, p))
-        case p: LeafExecNode =>
+        case p: LeafExecNode if shouldAddInputFileExpr(p) =>
           addFallbackTag(ProjectExec(p.output ++ replacedExprs.values, p))
+        case p: LeafExecNode =>
+          p
         // Output of SerializeFromObjectExec's child and output of DeserializeToObjectExec must be
         // a single-field row.
         case p @ (_: SerializeFromObjectExec | _: DeserializeToObjectExec) =>
@@ -127,6 +131,23 @@ object PushDownInputFileExpression {
           u.copy(children = newFirstChild +: newOtherChildren)
         case p => p.withNewChildren(p.children.map(child => addMetadataCol(child, replacedExprs)))
       }
+
+    private def hasInputFileRelatedSource(plan: SparkPlan): Boolean = {
+      plan match { // true if any node in this subtree is a file-aware scan
+        case _: BatchScanExecTransformerBase => true // handled by its own case in addMetadataCol
+        case p: LeafExecNode => shouldAddInputFileExpr(p) // other leaves: whitelist decides
+        case _ => plan.children.exists(hasInputFileRelatedSource) // recurse into children
+      }
+    }
+
+    private def shouldAddInputFileExpr(plan: SparkPlan): Boolean = {
+      plan match { // whitelist of leaves that get the input_file_* projection injected
+        case _: FileSourceScanExec => true // vanilla Spark file source scan
+        case _: BatchScanExec => true // DataSource V2 batch scan
+        case p if HiveTableScanExecTransformer.isHiveTableScan(p) => true
+        case _ => false // e.g. LocalTableScanExec / RangeExec: no file context
+      }
+    }
   }
 
   object PostOffload extends Rule[SparkPlan] {