apache · maomaodev · May 20, 2026
diff --git a/extensions/spark/kyuubi-spark-connector-hive/pom.xml b/extensions/spark/kyuubi-spark-connector-hive/pom.xml
@@ -31,6 +31,10 @@
     <description>A Kyuubi hive connector based on Spark V2 DataSource</description>
     <url>https://kyuubi.apache.org/</url>
 
+    <properties>
+        <kshc.columnar.source.dir>scala-spark-3.5-plus</kshc.columnar.source.dir>
+    </properties>
+
     <dependencies>
         <dependency>
             <groupId>org.apache.kyuubi</groupId>
@@ -140,6 +144,25 @@
 
     <build>
         <plugins>
+            <plugin>
+                <groupId>org.codehaus.mojo</groupId>
+                <artifactId>build-helper-maven-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <id>add-spark-version-source</id>
+                        <goals>
+                            <goal>add-source</goal>
+                        </goals>
+                        <phase>generate-sources</phase>
+                        <configuration>
+                            <sources>
+                                <source>src/main/${kshc.columnar.source.dir}</source>
+                            </sources>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-dependency-plugin</artifactId>
@@ -198,6 +221,20 @@
     </build>
 
     <profiles>
+        <profile>
+            <id>spark-3.3</id>
+            <properties>
+                <kshc.columnar.source.dir>scala-spark-pre-3.5</kshc.columnar.source.dir>
+            </properties>
+        </profile>
+
+        <profile>
+            <id>spark-3.4</id>
+            <properties>
+                <kshc.columnar.source.dir>scala-spark-pre-3.5</kshc.columnar.source.dir>
+            </properties>
+        </profile>
+
         <profile>
             <id>cross-version-test</id>
             <dependencies>

diff --git a/...a-spark-3.5-plus/org/apache/kyuubi/spark/connector/hive/read/KyuubiOrcColumnarMixin.scala b/...a-spark-3.5-plus/org/apache/kyuubi/spark/connector/hive/read/KyuubiOrcColumnarMixin.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.kyuubi.spark.connector.hive.read
+
+import org.apache.spark.sql.connector.read.Scan
+import org.apache.spark.sql.execution.WholeStageCodegenExec
+import org.apache.spark.sql.execution.datasources.orc.OrcUtils
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Hard prerequisite for [[KyuubiOrcScan]]'s DPP support on Spark 3.5+.
+ * [[org.apache.spark.sql.connector.read.Scan.ColumnarSupportMode]] was
+ * introduced in Spark 3.5.0 (SPARK-44505). A no-op trait lives under
+ * `src/main/scala-spark-pre-3.5/` for Spark 3.3 / 3.4.
+ *
+ * With the default [[Scan.ColumnarSupportMode.PARTITION_DEFINED]],
+ * `DataSourceV2ScanExecBase.supportsColumnar` materialises `inputPartitions`
+ * during planning to probe each partition's reader factory, which through
+ * `FileScan.partitions` -> `HiveCatalogFileIndex.listFiles` triggers a
+ * full-table HDFS listing before runtime filters arrive via
+ * [[org.apache.spark.sql.connector.read.SupportsRuntimeFiltering.filter]].
+ * DPP still prunes the data scan correctly, but its end-to-end win is
+ * cancelled by this wasted pre-DPP listing. Vanilla Spark sidesteps the
+ * issue because Hive Parquet/ORC tables fall back to V1 `FileSourceScanExec`
+ * and never consult [[Scan.columnarSupportMode]].
+ */
+trait KyuubiOrcColumnarMixin { this: OrcScan =>
+
+  /**
+   * Answers the columnar question once for the whole scan. The conditions
+   * mirror Spark's `OrcPartitionReaderFactory.supportColumnarReads`, which
+   * is what makes a scan-level answer safe.
+   */
+  override def columnarSupportMode(): Scan.ColumnarSupportMode = {
+    val conf = sparkSession.sessionState.conf
+    val resultSchema = StructType(readDataSchema.fields ++ readPartitionSchema.fields)
+    // Kept as a `def` so the per-field type check only runs when the
+    // cheaper configuration guards before it have already passed.
+    def everyFieldVectorizable: Boolean = resultSchema.forall { field =>
+      OrcUtils.supportColumnarReads(field.dataType, conf.orcVectorizedReaderNestedColumnEnabled)
+    }
+    val vectorized = conf.orcVectorizedReaderEnabled && conf.wholeStageEnabled &&
+      !WholeStageCodegenExec.isTooManyFields(conf, resultSchema) && everyFieldVectorizable
+    if (vectorized) Scan.ColumnarSupportMode.SUPPORTED
+    else Scan.ColumnarSupportMode.UNSUPPORTED
+  }
+}
diff --git a/...ark-3.5-plus/org/apache/kyuubi/spark/connector/hive/read/KyuubiParquetColumnarMixin.scala b/...ark-3.5-plus/org/apache/kyuubi/spark/connector/hive/read/KyuubiParquetColumnarMixin.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.kyuubi.spark.connector.hive.read
+
+import org.apache.spark.sql.connector.read.Scan
+import org.apache.spark.sql.execution.WholeStageCodegenExec
+import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils
+import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Hard prerequisite for [[KyuubiParquetScan]]'s DPP support on Spark 3.5+.
+ * [[org.apache.spark.sql.connector.read.Scan.ColumnarSupportMode]] was
+ * introduced in Spark 3.5.0 (SPARK-44505). A no-op trait lives under
+ * `src/main/scala-spark-pre-3.5/` for Spark 3.3 / 3.4.
+ *
+ * With the default [[Scan.ColumnarSupportMode.PARTITION_DEFINED]],
+ * `DataSourceV2ScanExecBase.supportsColumnar` materialises `inputPartitions`
+ * during planning to probe each partition's reader factory, which through
+ * `FileScan.partitions` -> `HiveCatalogFileIndex.listFiles` triggers a
+ * full-table HDFS listing before runtime filters arrive via
+ * [[org.apache.spark.sql.connector.read.SupportsRuntimeFiltering.filter]].
+ * DPP still prunes the data scan correctly, but its end-to-end win is
+ * cancelled by this wasted pre-DPP listing. Vanilla Spark sidesteps the
+ * issue because Hive Parquet/ORC tables fall back to V1 `FileSourceScanExec`
+ * and never consult [[Scan.columnarSupportMode]].
+ */
+trait KyuubiParquetColumnarMixin { this: ParquetScan =>
+
+  /**
+   * Answers the columnar question once for the whole scan. The conditions
+   * mirror Spark's `ParquetPartitionReaderFactory.supportColumnarReads`,
+   * which is what makes a scan-level answer safe.
+   */
+  override def columnarSupportMode(): Scan.ColumnarSupportMode = {
+    val conf = sparkSession.sessionState.conf
+    val resultSchema = StructType(readDataSchema.fields ++ readPartitionSchema.fields)
+    val batchReadable = ParquetUtils.isBatchReadSupportedForSchema(conf, resultSchema)
+    val vectorized = batchReadable && conf.wholeStageEnabled &&
+      !WholeStageCodegenExec.isTooManyFields(conf, resultSchema)
+    if (vectorized) Scan.ColumnarSupportMode.SUPPORTED
+    else Scan.ColumnarSupportMode.UNSUPPORTED
+  }
+}
diff --git a/...la-spark-pre-3.5/org/apache/kyuubi/spark/connector/hive/read/KyuubiOrcColumnarMixin.scala b/...la-spark-pre-3.5/org/apache/kyuubi/spark/connector/hive/read/KyuubiOrcColumnarMixin.scala
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.kyuubi.spark.connector.hive.read
+
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan
+
+/**
+ * No-op stub built when `kshc.columnar.source.dir` is `scala-spark-pre-3.5`
+ * (Spark 3.3 / 3.4), where
+ * [[org.apache.spark.sql.connector.read.Scan.ColumnarSupportMode]] does not
+ * exist yet (added in Spark 3.5.0 by SPARK-44505). The real override, which
+ * avoids the plan-stage full-table HDFS listing, is in `src/main/scala-spark-3.5-plus/`.
+ */
+trait KyuubiOrcColumnarMixin { this: OrcScan => }
diff --git a/...park-pre-3.5/org/apache/kyuubi/spark/connector/hive/read/KyuubiParquetColumnarMixin.scala b/...park-pre-3.5/org/apache/kyuubi/spark/connector/hive/read/KyuubiParquetColumnarMixin.scala
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.kyuubi.spark.connector.hive.read
+
+import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
+
+/**
+ * No-op stub built when `kshc.columnar.source.dir` is `scala-spark-pre-3.5`
+ * (Spark 3.3 / 3.4), where
+ * [[org.apache.spark.sql.connector.read.Scan.ColumnarSupportMode]] does not
+ * exist yet (added in Spark 3.5.0 by SPARK-44505). The real override, which
+ * avoids the plan-stage full-table HDFS listing, is in `src/main/scala-spark-3.5-plus/`.
+ */
+trait KyuubiParquetColumnarMixin { this: ParquetScan => }
diff --git a/...park-connector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/HiveTable.scala b/...park-connector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/HiveTable.scala
@@ -32,14 +32,12 @@ import org.apache.spark.sql.connector.catalog.TableCapability.{BATCH_READ, BATCH
 import org.apache.spark.sql.connector.expressions.Transform
 import org.apache.spark.sql.connector.read.ScanBuilder
 import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
-import org.apache.spark.sql.execution.datasources.v2.orc.OrcScanBuilder
-import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScanBuilder
 import org.apache.spark.sql.hive.kyuubi.connector.HiveBridgeHelper.{BucketSpecHelper, LogicalExpressions}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
 
 import org.apache.kyuubi.spark.connector.hive.KyuubiHiveConnectorConf.{READ_CONVERT_METASTORE_ORC, READ_CONVERT_METASTORE_PARQUET}
-import org.apache.kyuubi.spark.connector.hive.read.{HiveCatalogFileIndex, HiveScanBuilder}
+import org.apache.kyuubi.spark.connector.hive.read.{HiveCatalogFileIndex, HiveScanBuilder, KyuubiOrcScanBuilder, KyuubiParquetScanBuilder}
 import org.apache.kyuubi.spark.connector.hive.write.HiveWriteBuilder
 
 case class HiveTable(
@@ -97,10 +95,22 @@ case class HiveTable(
   override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
     convertedProvider match {
       case Some("ORC") if sparkSession.sessionState.conf.getConf(READ_CONVERT_METASTORE_ORC) =>
-        OrcScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
+        new KyuubiOrcScanBuilder(
+          sparkSession,
+          fileIndex,
+          schema,
+          dataSchema,
+          options,
+          catalogTable)
       case Some("PARQUET")
           if sparkSession.sessionState.conf.getConf(READ_CONVERT_METASTORE_PARQUET) =>
-        ParquetScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
+        new KyuubiParquetScanBuilder(
+          sparkSession,
+          fileIndex,
+          schema,
+          dataSchema,
+          options,
+          catalogTable)
       case _ => HiveScanBuilder(sparkSession, fileIndex, dataSchema, catalogTable)
     }
   }