apache · zhztheplayer · Jun 12, 2026 · May 31, 2026 · Jun 4, 2026 · Jun 9, 2026
diff --git a/backends-velox/src-delta/main/scala/org/apache/gluten/component/VeloxDeltaComponent.scala b/backends-velox/src-delta/main/scala/org/apache/gluten/component/VeloxDeltaComponent.scala
@@ -36,6 +36,10 @@ class VeloxDeltaComponent extends Component {
 
   override def injectRules(injector: Injector): Unit = {
     val legacy = injector.gluten.legacy
+    // Deletion-vector scans need no Gluten-side logical preprocessing: Delta's own
+    // PreprocessTableWithDVsStrategy injects the skip-row column and filter during physical
+    // planning, DeltaPostTransformRules.nativeDeletionVectorRule strips them when the scan
+    // offloads, and DeltaScanTransformer materializes the per-file DV payloads for Velox.
     legacy.injectTransform {
       c =>
         val offload = Seq(OffloadDeltaScan(), OffloadDeltaProject(), OffloadDeltaFilter())

diff --git a/...x/src-delta33/test/scala/org/apache/spark/sql/delta/DeltaDeletionVectorHandoffSuite.scala b/...x/src-delta33/test/scala/org/apache/spark/sql/delta/DeltaDeletionVectorHandoffSuite.scala
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.delta
+
+import org.apache.gluten.execution.DeltaScanTransformer
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.delta.test.{DeltaSQLCommandTest, DeltaSQLTestUtils}
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.tags.ExtendedSQLTest
+
+import org.apache.hadoop.fs.Path
+
+@ExtendedSQLTest
+class DeltaDeletionVectorHandoffSuite
+  extends QueryTest
+  with SharedSparkSession
+  with DeltaSQLTestUtils
+  with DeltaSQLCommandTest {
+
+  import testImplicits._
+
+  // Deletes two of four rows via a deletion vector, then checks that the read is planned with
+  // DeltaScanTransformer, that Delta's internal DV column names are absent from the plan text,
+  // and that only the two remaining rows come back.
+  test("Spark 3.5 Delta DV scan handoff should filter deleted rows") {
+    withTempDir {
+      dir =>
+        val path = dir.getCanonicalPath
+        val initialRows = Seq((1, "a"), (2, "b"), (3, "c"), (4, "d")).toDF("id", "value")
+        // coalesce(1) keeps all four rows in one partition, so the cardinality assertion below
+        // can expect both deletions to be recorded against a single data file.
+        initialRows.coalesce(1).write.format("delta").save(path)
+
+        spark.sql(
+          s"ALTER TABLE delta.`$path` SET TBLPROPERTIES ('delta.enableDeletionVectors' = true)")
+        spark.sql(s"DELETE FROM delta.`$path` WHERE id IN (3, 4)")
+
+        val snapshotFiles = DeltaLog.forTable(spark, new Path(path)).update().allFiles.collect()
+        val filesWithDv = snapshotFiles.filter(_.deletionVector != null)
+        assert(filesWithDv.nonEmpty)
+        assert(filesWithDv.head.deletionVector.cardinality == 2L)
+
+        val df = spark.read.format("delta").load(path)
+        val plan = df.queryExecution.executedPlan
+        assert(plan.collectFirst { case scan: DeltaScanTransformer => scan }.isDefined)
+        val renderedPlan = plan.toString()
+        Seq("__delta_internal_is_row_deleted", "__delta_internal_row_index").foreach {
+          internalColumn => assert(!renderedPlan.contains(internalColumn))
+        }
+        checkAnswer(df, Seq((1, "a"), (2, "b")).toDF())
+    }
+  }
+}
diff --git a/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/DeltaSuite.scala b/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/DeltaSuite.scala
@@ -197,6 +197,34 @@ class DeltaSuite
     checkAnswer(data.toDF(), Row(1) :: Row(2) :: Row(3) :: Row(4) :: Row(5) :: Row(6) :: Nil)
   }
 
+  // With DELETION_VECTORS_USE_METADATA_ROW_INDEX disabled, both the DELETE and the following
+  // read run under that setting; the read must not use DeltaScanTransformer and must still
+  // return only the rows that were not deleted.
+  test("DV scan without metadata row index falls back and stays correct") {
+    withTempDir {
+      dir =>
+        val path = dir.getCanonicalPath
+        val initialRows = Seq((1, "a"), (2, "b"), (3, "c"), (4, "d")).toDF("id", "value")
+        initialRows.coalesce(1).write.format("delta").save(path)
+
+        spark.sql(
+          s"ALTER TABLE delta.`$path` SET TBLPROPERTIES ('delta.enableDeletionVectors' = true)")
+
+        val rowIndexConf = DeltaSQLConf.DELETION_VECTORS_USE_METADATA_ROW_INDEX.key
+        withSQLConf(rowIndexConf -> "false") {
+          spark.sql(s"DELETE FROM delta.`$path` WHERE id IN (3, 4)")
+
+          val snapshotFiles = DeltaLog.forTable(spark, new Path(path)).update().allFiles.collect()
+          assert(snapshotFiles.exists(_.deletionVector != null))
+
+          val df = spark.read.format("delta").load(path)
+          val plan = df.queryExecution.executedPlan
+          assert(plan.collectFirst { case scan: DeltaScanTransformer => scan }.isEmpty)
+          checkAnswer(df, Row(1, "a") :: Row(2, "b") :: Nil)
+        }
+    }
+  }
+
   test("partitioned append - nulls") {
     val tempDir = Utils.createTempDir()
     Seq(Some(1), None)

diff --git a/...x/src-delta40/test/scala/org/apache/spark/sql/delta/DeltaDeletionVectorHandoffSuite.scala b/...x/src-delta40/test/scala/org/apache/spark/sql/delta/DeltaDeletionVectorHandoffSuite.scala
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.delta
+
+import org.apache.gluten.execution.DeltaScanTransformer
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.delta.sources.DeltaSQLConf
+import org.apache.spark.sql.delta.test.{DeltaSQLCommandTest, DeltaSQLTestUtils}
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.tags.ExtendedSQLTest
+
+import org.apache.hadoop.fs.Path
+
+@ExtendedSQLTest
+class DeltaDeletionVectorHandoffSuite
+  extends QueryTest
+  with SharedSparkSession
+  with DeltaSQLTestUtils
+  with DeltaSQLCommandTest {
+
+  import testImplicits._
+
+  // Reads an existing deletion vector with the metadata row index disabled and expects the
+  // scan to stay off DeltaScanTransformer while still hiding the deleted rows.
+  test("Spark 4 Delta DV scan should fall back when metadata row index is disabled") {
+    withTempDir {
+      dir =>
+        val path = dir.getCanonicalPath
+        val initialRows = Seq((1, "a"), (2, "b"), (3, "c"), (4, "d")).toDF("id", "value")
+        initialRows.coalesce(1).write.format("delta").save(path)
+
+        spark.sql(
+          s"ALTER TABLE delta.`$path` SET TBLPROPERTIES ('delta.enableDeletionVectors' = true)")
+        spark.sql(s"DELETE FROM delta.`$path` WHERE id IN (3, 4)")
+
+        val snapshotFiles = DeltaLog.forTable(spark, new Path(path)).update().allFiles.collect()
+        assert(snapshotFiles.exists(_.deletionVector != null))
+
+        // The DELETE above ran outside this withSQLConf block, so only the scan over the
+        // existing DV is exercised here. Keep the no-metadata-row-index path on Spark until
+        // the native path can prove the same contract for DML DVs.
+        val rowIndexConf = DeltaSQLConf.DELETION_VECTORS_USE_METADATA_ROW_INDEX.key
+        withSQLConf(rowIndexConf -> "false") {
+          val df = spark.read.format("delta").load(path)
+          val plan = df.queryExecution.executedPlan
+          assert(plan.collectFirst { case scan: DeltaScanTransformer => scan }.isEmpty)
+          checkAnswer(df, Seq((1, "a"), (2, "b")).toDF())
+        }
+    }
+  }
+
+  // Deletes two of four rows via a deletion vector, then checks that the read is planned with
+  // DeltaScanTransformer, that Delta's internal DV column names are absent from the plan text,
+  // and that only the two remaining rows come back.
+  test("Spark 4 Delta DV scan handoff should filter deleted rows") {
+    withTempDir {
+      dir =>
+        val path = dir.getCanonicalPath
+        val initialRows = Seq((1, "a"), (2, "b"), (3, "c"), (4, "d")).toDF("id", "value")
+        // coalesce(1) keeps all four rows in one partition, so the cardinality assertion below
+        // can expect both deletions to be recorded against a single data file.
+        initialRows.coalesce(1).write.format("delta").save(path)
+
+        spark.sql(
+          s"ALTER TABLE delta.`$path` SET TBLPROPERTIES ('delta.enableDeletionVectors' = true)")
+        spark.sql(s"DELETE FROM delta.`$path` WHERE id IN (3, 4)")
+
+        val snapshotFiles = DeltaLog.forTable(spark, new Path(path)).update().allFiles.collect()
+        val filesWithDv = snapshotFiles.filter(_.deletionVector != null)
+        assert(filesWithDv.nonEmpty)
+        assert(filesWithDv.head.deletionVector.cardinality == 2L)
+
+        val df = spark.read.format("delta").load(path)
+        val plan = df.queryExecution.executedPlan
+        assert(plan.collectFirst { case scan: DeltaScanTransformer => scan }.isDefined)
+        val renderedPlan = plan.toString()
+        Seq("__delta_internal_is_row_deleted", "__delta_internal_row_index").foreach {
+          internalColumn => assert(!renderedPlan.contains(internalColumn))
+        }
+        checkAnswer(df, Seq((1, "a"), (2, "b")).toDF())
+    }
+  }
+}
diff --git a/cpp/velox/compute/VeloxPlanConverter.cc b/cpp/velox/compute/VeloxPlanConverter.cc
@@ -17,8 +17,14 @@
 
 #include "VeloxPlanConverter.h"
+#include <cstdio>
 #include <filesystem>
+#include <limits>
+#include <optional>
 
+#include <google/protobuf/any.pb.h>
+#include <google/protobuf/wrappers.pb.h>
 #include "config/GlutenConfig.h"
+#include "delta/DeltaSplitInfo.h"
 #include "iceberg/IcebergPlanConverter.h"
 #include "operators/plannodes/IteratorSplit.h"
 
@@ -48,6 +53,87 @@ VeloxPlanConverter::VeloxPlanConverter(
 }
 
 namespace {
+// Renders a packed protobuf wrapper (bytes, string, int32, int64, double or bool) as the text
+// stored in the per-file metadata column map; returns std::nullopt for any other payload type.
+std::optional<std::string> unpackMetadataValue(const google::protobuf::Any& value) {
+  if (google::protobuf::BytesValue bytesValue; value.UnpackTo(&bytesValue)) {
+    return bytesValue.value();
+  }
+
+  if (google::protobuf::StringValue stringValue; value.UnpackTo(&stringValue)) {
+    return stringValue.value();
+  }
+
+  if (google::protobuf::Int32Value int32Value; value.UnpackTo(&int32Value)) {
+    return std::to_string(int32Value.value());
+  }
+
+  if (google::protobuf::Int64Value int64Value; value.UnpackTo(&int64Value)) {
+    return std::to_string(int64Value.value());
+  }
+
+  if (google::protobuf::DoubleValue doubleValue; value.UnpackTo(&doubleValue)) {
+    // std::to_string(double) formats with "%f" and keeps only six fractional digits, which
+    // corrupts small or high-precision values; "%.17g" round-trips every finite double.
+    char text[32];
+    const int length = std::snprintf(text, sizeof(text), "%.17g", doubleValue.value());
+    return std::string(text, length > 0 ? static_cast<size_t>(length) : 0);
+  }
+
+  // Matches the string encoding the JVM side uses for booleans, which are packed through
+  // SubstraitUtil.convertJavaObjectToAny's toString fallback rather than as BoolValue.
+  if (google::protobuf::BoolValue boolValue; value.UnpackTo(&boolValue)) {
+    return boolValue.value() ? "true" : "false";
+  }
+
+  return std::nullopt;
+}
+
+// Maps the integer filter type from the Delta read options to the native enum: 1 selects
+// kIfContained, 2 selects kIfNotContained, and every other value (including 0) is kKeepAll.
+delta::DeltaRowIndexFilterType parseDeltaRowIndexFilterType(int filterType) {
+  if (filterType == 1) {
+    return delta::DeltaRowIndexFilterType::kIfContained;
+  }
+  if (filterType == 2) {
+    return delta::DeltaRowIndexFilterType::kIfNotContained;
+  }
+  return delta::DeltaRowIndexFilterType::kKeepAll;
+}
+
+// Reuses splitInfo when it already is a DeltaSplitInfo (otherwise copies it into one), then
+// appends this file's row-index filter type and its deletion vector (std::nullopt if absent).
+std::shared_ptr<DeltaSplitInfo> parseDeltaSplitInfo(
+    const substrait::ReadRel_LocalFiles_FileOrFiles& file,
+    std::shared_ptr<SplitInfo> splitInfo) {
+  auto deltaSplitInfo = std::dynamic_pointer_cast<DeltaSplitInfo>(splitInfo);
+  if (!deltaSplitInfo) {
+    deltaSplitInfo = std::make_shared<DeltaSplitInfo>(*splitInfo);
+  }
+  deltaSplitInfo->format = dwio::common::FileFormat::PARQUET;
+  const auto& readOptions = file.delta();
+  deltaSplitInfo->rowIndexFilterTypes.emplace_back(parseDeltaRowIndexFilterType(readOptions.row_index_filter_type()));
+  if (!readOptions.has_deletion_vector()) {
+    deltaSplitInfo->deletionVectors.emplace_back(std::nullopt);
+    return deltaSplitInfo;
+  }
+
+  auto serializedPayload = readOptions.serialized_deletion_vector();
+  VELOX_USER_CHECK(!serializedPayload.empty(), "Delta split has a deletion vector without a serialized payload");
+  VELOX_USER_CHECK_LE(
+      serializedPayload.size(),
+      static_cast<size_t>(std::numeric_limits<int32_t>::max()),
+      "Delta deletion vector serialized payload is too large");
+  // payloadView points into payload, which is stored in deletionVectorPayloads next to the descriptor.
+  auto payload = std::make_shared<std::string>(std::move(serializedPayload));
+  const auto* payloadBytes = reinterpret_cast<const uint8_t*>(payload->data());
+  const SplitPayloadBufferView payloadView{payloadBytes, static_cast<int32_t>(payload->size())};
+  deltaSplitInfo->deletionVectors.emplace_back(delta::DeltaDeletionVectorDescriptor::serialized(
+      static_cast<uint64_t>(readOptions.deletion_vector_cardinality()), payloadView));
+  deltaSplitInfo->deletionVectorPayloads.emplace_back(std::move(payload));
+  return deltaSplitInfo;
+}
+
 std::shared_ptr<SplitInfo> parseScanSplitInfo(
     const facebook::velox::config::ConfigBase* veloxCfg,
     const google::protobuf::RepeatedPtrField<substrait::ReadRel_LocalFiles_FileOrFiles>& fileList) {
@@ -75,6 +161,11 @@ std::shared_ptr<SplitInfo> parseScanSplitInfo(
     for (const auto& metadataColumn : file.metadata_columns()) {
       metadataColumnMap[metadataColumn.key()] = metadataColumn.value();
     }
+    for (const auto& otherMetadataColumn : file.other_const_metadata_columns()) {
+      if (auto unpackedValue = unpackMetadataValue(otherMetadataColumn.value())) {
+        metadataColumnMap[otherMetadataColumn.key()] = std::move(*unpackedValue);
+      }
+    }
     splitInfo->metadataColumns.emplace_back(metadataColumnMap);
 
     splitInfo->paths.emplace_back(file.uri_file());
@@ -103,6 +194,9 @@ std::shared_ptr<SplitInfo> parseScanSplitInfo(
       case SubstraitFileFormatCase::kIceberg:
         splitInfo = IcebergPlanConverter::parseIcebergSplitInfo(file, std::move(splitInfo));
         break;
+      case SubstraitFileFormatCase::kDelta:
+        splitInfo = parseDeltaSplitInfo(file, std::move(splitInfo));
+        break;
       default:
         splitInfo->format = dwio::common::FileFormat::UNKNOWN;
         break;