apache
diff --git a/‎.asf.yaml‎
Lines changed: 5 additions & 0 deletions b/‎.asf.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎.github/workflows/util/install-spark-resources.sh‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/util/install-spark-resources.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/velox_backend_arm.yml‎
Lines changed: 5 additions & 4 deletions b/‎.github/workflows/velox_backend_arm.yml‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎.github/workflows/velox_backend_cache.yml‎
Lines changed: 9 additions & 9 deletions b/‎.github/workflows/velox_backend_cache.yml‎
Lines changed: 9 additions & 9 deletions
diff --git a/‎.github/workflows/velox_backend_x86.yml‎
Lines changed: 9 additions & 8 deletions b/‎.github/workflows/velox_backend_x86.yml‎
Lines changed: 9 additions & 8 deletions
diff --git a/‎backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala‎
Lines changed: 1 addition & 0 deletions b/‎backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala‎
Lines changed: 2 additions & 1 deletion b/‎backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends-velox/src-delta33/main/scala/org/apache/gluten/delta/DeltaDeletionVectorScanInfo.scala‎
Lines changed: 207 additions & 0 deletions b/‎backends-velox/src-delta33/main/scala/org/apache/gluten/delta/DeltaDeletionVectorScanInfo.scala‎
Lines changed: 207 additions & 0 deletions
@@ -48,6 +48,11 @@ github:
     discussions: true
     wiki: false
     projects: true
+  copilot_code_review:
+    enabled: true
+    review_drafts: false
+    review_on_push: true
+
 notifications:
   commits: commits@gluten.apache.org
   issues: commits@gluten.apache.org
 
@@ -118,7 +118,7 @@ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
   4.0)
       # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix
       cd ${INSTALL_DIR} && \
-      install_spark "4.0.1" "3" "2.12"
+      install_spark "4.0.2" "3" "2.12"
       mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
       ;;
   4.1)
 
@@ -137,20 +137,21 @@ jobs:
 
   cpp-test-udf-test:
     runs-on: ubuntu-24.04-arm
-    container: apache/gluten:centos-8-jdk8
+    container: apache/gluten:centos-9-jdk8
     steps:
       - uses: actions/checkout@v4
       - name: Get Ccache
         uses: actions/cache/restore@v4
         with:
           path: '${{ env.CCACHE_DIR }}'
-          key: ccache-centos8-release-shared-${{runner.arch}}-${{github.sha}}
+          key: ccache-centos9-release-shared-${{runner.arch}}-${{github.sha}}
           restore-keys: |
-            ccache-centos8-release-shared-${{runner.arch}}
+            ccache-centos9-release-shared-${{runner.arch}}
       - name: Build Gluten native libraries
         run: |
           df -a
-          bash dev/ci-velox-buildshared-centos-8.sh
+          sed -i "s|gflags_static|gflags_shared|g" /usr/local/lib/cmake/folly/folly-targets.cmake # TODO: remove after upgrading folly to 2024.09.30 or later which has fixed the gflags linkage issue
+          bash dev/ci-velox-buildshared-centos-9.sh
           ccache -s
       - name: Run CPP unit test
         run: |
 
@@ -83,34 +83,34 @@ jobs:
           path: '${{ env.CCACHE_DIR }}'
           key: ccache-centos8-release-default-${{runner.arch}}-${{github.sha}}
 
-  cache-shared-lib-centos-8:
+  cache-shared-lib-centos-9:
     if: ${{ startsWith(github.repository, 'apache/') }}
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
         os: [ ubuntu-22.04, ubuntu-24.04-arm ]
-    container: apache/gluten:centos-8-jdk8
+    container: apache/gluten:centos-9-jdk8
     steps:
       - uses: actions/checkout@v4
       - name: Get Ccache
         uses: actions/cache/restore@v3
         with:
           path: '${{ env.CCACHE_DIR }}'
-          key: ccache-centos8-release-shared-${{runner.arch}}-${{github.sha}}
+          key: ccache-centos9-release-shared-${{runner.arch}}-${{github.sha}}
           restore-keys: |
-            ccache-centos8-release-shared-${{runner.arch}}
+            ccache-centos9-release-shared-${{runner.arch}}
       - name: Build Gluten shared libraries
         run: |
           df -a
           export CCACHE_MAXSIZE=1G
-          bash dev/ci-velox-buildshared-centos-8.sh
+          bash dev/ci-velox-buildshared-centos-9.sh
           ccache -s
       - name: Save Ccache
         uses: actions/cache/save@v3
         id: ccache
         with:
           path: '${{ env.CCACHE_DIR }}'
-          key: ccache-centos8-release-shared-${{runner.arch}}-${{github.sha}}
+          key: ccache-centos9-release-shared-${{runner.arch}}-${{github.sha}}
 
   cache-enhanced-native-lib-centos-7:
     if: ${{ startsWith(github.repository, 'apache/') }}
@@ -139,7 +139,7 @@ jobs:
           path: '${{ env.CCACHE_DIR }}'
           key: ccache-enhanced-centos7-release-default-${{github.sha}}
 
-  cache-shared-lib-centos-9:
+  cache-shared-lib-centos-9-cudf:
     if: ${{ startsWith(github.repository, 'apache/') }}
     runs-on: ${{ matrix.os }}
     strategy:
@@ -156,7 +156,7 @@ jobs:
         uses: actions/cache/restore@v3
         with:
           path: '${{ env.CCACHE_DIR }}'
-          key: ccache-centos9-release-shared-${{runner.arch}}-${{github.sha}}
+          key: ccache-centos9-cudf-release-shared-${{runner.arch}}-${{github.sha}}
           restore-keys: |
-            ccache-centos9-release-shared-${{runner.arch}}
+            ccache-centos9-cudf-release-shared-${{runner.arch}}
       - name: Build Gluten shared libraries
@@ -187,4 +187,4 @@ jobs:
         id: ccache
         with:
           path: '${{ env.CCACHE_DIR }}'
-          key: ccache-centos9-release-shared-${{runner.arch}}-${{github.sha}}
+          key: ccache-centos9-cudf-release-shared-${{runner.arch}}-${{github.sha}}
@@ -1168,20 +1168,21 @@ jobs:
 
   cpp-test-udf-test:
     runs-on: ubuntu-22.04
-    container: apache/gluten:centos-8-jdk8
+    container: apache/gluten:centos-9-jdk8
     steps:
       - uses: actions/checkout@v4
       - name: Get Ccache
         uses: actions/cache/restore@v4
         with:
           path: '${{ env.CCACHE_DIR }}'
-          key: ccache-centos8-release-shared-${{runner.arch}}-${{github.sha}}
+          key: ccache-centos9-release-shared-${{runner.arch}}-${{github.sha}}
           restore-keys: |
-            ccache-centos8-release-shared-${{runner.arch}}
+            ccache-centos9-release-shared-${{runner.arch}}
       - name: Build Gluten native libraries
         run: |
           df -a
-          bash dev/ci-velox-buildshared-centos-8.sh
+          sed -i "s|gflags_static|gflags_shared|g" /usr/local/lib/cmake/folly/folly-targets.cmake # TODO remove after image update
+          bash dev/ci-velox-buildshared-centos-9.sh
           ccache -s
       - name: Run CPP unit test
         run: |
@@ -1245,9 +1246,9 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: '${{ env.CCACHE_DIR }}'
-          key: ccache-centos9-release-shared-${{runner.arch}}-${{github.sha}}
+          key: ccache-centos9-cudf-release-shared-${{runner.arch}}-${{github.sha}}
           restore-keys: |
-            ccache-centos9-release-shared-${{runner.arch}}
+            ccache-centos9-cudf-release-shared-${{runner.arch}}
       - name: Build Gluten native libraries
         run: |
           docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:centos-9-jdk8-cudf bash -c "
@@ -1307,7 +1308,7 @@ jobs:
           pip3 install setuptools==77.0.3 && \
           pip3 install pyspark==3.5.5 cython && \
           pip3 install pandas==2.2.3 pyarrow==20.0.0
-      - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
+      - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark40
           bash .github/workflows/util/install-spark-resources.sh 4.0
@@ -1358,7 +1359,7 @@ jobs:
         with:
           name: arrow-jars-centos-7-${{github.sha}}
           path: /root/.m2/repository/org/apache/arrow/
-      - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
+      - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark40
           bash .github/workflows/util/install-spark-resources.sh 4.0
 
@@ -200,6 +200,7 @@ object CHExpressionUtil {
     REGR_SLOPE -> DefaultValidator(),
     REGR_INTERCEPT -> DefaultValidator(),
     REGR_SXY -> DefaultValidator(),
+    BITMAP_CONSTRUCT_AGG -> DefaultValidator(),
     TO_UTC_TIMESTAMP -> UtcTimestampValidator(),
     FROM_UTC_TIMESTAMP -> UtcTimestampValidator(),
     STACK -> DefaultValidator(),
 
@@ -151,7 +151,8 @@ class VeloxCelebornColumnarShuffleWriter[K, V](
           nativeBufferSize,
           GlutenConfig.get.columnarShuffleReallocThreshold,
           GlutenConfig.get.columnarShufflePartitionBufferEvictThreshold,
-          partitionWriterHandle
+          partitionWriterHandle,
+          false
         )
       case SortShuffleWriterType =>
         shuffleWriterJniWrapper.createSortShuffleWriter(
 
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.delta
+
+import org.apache.gluten.sql.shims.SparkShimLoader
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.delta.GlutenDeltaParquetFileFormat
+import org.apache.spark.sql.delta.actions.DeletionVectorDescriptor
+import org.apache.spark.sql.delta.deletionvectors.{RoaringBitmapArrayFormat, StoredBitmap}
+import org.apache.spark.sql.delta.storage.dv.HadoopFileSystemDVStore
+import org.apache.spark.sql.execution.datasources.PartitionedFile
+
+import org.apache.hadoop.fs.Path
+
+import java.util.{ArrayList => JArrayList}
+
+import scala.collection.JavaConverters._
+import scala.util.control.NonFatal
+
+object DeltaDeletionVectorScanInfo {
+  /**
+   * Row-index filter modes attached to a file's deletion vector. The value names match the
+   * strings accepted by `parseRowIndexFilterType`; `KEEP_ALL` is also what is reported for a
+   * file that carries no deletion vector metadata.
+   */
+  object RowIndexFilterType extends Enumeration {
+    type RowIndexFilterType = Value
+    // One declaration per line; declaration order fixes the ids (0, 1, 2).
+    val KEEP_ALL: Value = Value
+    val IF_CONTAINED: Value = Value
+    val IF_NOT_CONTAINED: Value = Value
+  }
+
+  import RowIndexFilterType._
+
+  /**
+   * Deletion vector state extracted for one file.
+   *
+   * @param hasDeletionVector true when both deletion-vector metadata entries were present on the
+   *                          file, false when neither was
+   * @param rowIndexFilterType filter mode parsed from the file metadata; `KEEP_ALL` when the file
+   *                           has no deletion vector
+   * @param cardinality cardinality taken from the deletion vector descriptor; 0 when the file
+   *                    has no deletion vector
+   * @param serializedDeletionVector the loaded bitmap serialized with
+   *                                 `RoaringBitmapArrayFormat.Portable`; an empty array when the
+   *                                 file has no deletion vector. NOTE: being an `Array`, this
+   *                                 field takes part in case-class equality by reference only.
+   */
+  final case class DeletionVectorInfo(
+      hasDeletionVector: Boolean,
+      rowIndexFilterType: RowIndexFilterType,
+      cardinality: Long,
+      serializedDeletionVector: Array[Byte])
+
+  /**
+   * Per-file result of [[extract]].
+   *
+   * @param normalizedOtherMetadataColumns the file's other constant metadata columns with the two
+   *                                       deletion-vector entries removed
+   * @param deletionVectorInfo the deletion vector state decoded from those two entries
+   */
+  final case class PartitionFileScanInfo(
+      normalizedOtherMetadataColumns: Map[String, Object],
+      deletionVectorInfo: DeletionVectorInfo)
+
+  // Keys, defined by GlutenDeltaParquetFileFormat, under which a file's metadata map carries the
+  // base64-encoded deletion vector descriptor and the row-index filter type respectively.
+  private val RowIndexFilterIdEncoded =
+    GlutenDeltaParquetFileFormat.FILE_ROW_INDEX_FILTER_ID_ENCODED
+  private val RowIndexFilterTypeKey =
+    GlutenDeltaParquetFileFormat.FILE_ROW_INDEX_FILTER_TYPE
+
+  /**
+   * Builds the scan info for a single partitioned file.
+   *
+   * The file's other constant metadata columns are read once. The two deletion-vector entries are
+   * dropped from the map handed back to the caller and are decoded into a [[DeletionVectorInfo]]
+   * instead.
+   *
+   * @param spark session used while resolving the table path and loading the deletion vector
+   * @param partitionColumnCount number of partition directory levels expected between the table
+   *                             root and the file's parent directory
+   * @param file the file to inspect
+   * @return the normalized metadata columns together with the deletion vector state
+   */
+  def extract(
+      spark: SparkSession,
+      partitionColumnCount: Int,
+      file: PartitionedFile): PartitionFileScanInfo = {
+    val columns = otherMetadataColumns(file)
+    val dvKeys = Set(RowIndexFilterIdEncoded, RowIndexFilterTypeKey)
+    val withoutDvKeys = columns.filterNot { case (name, _) => dvKeys(name) }
+    PartitionFileScanInfo(
+      withoutDvKeys,
+      extractDeletionVectorInfo(spark, partitionColumnCount, file, columns))
+  }
+
+  /**
+   * Applies [[extract]] to every file, preserving the input order.
+   *
+   * @param spark session forwarded to [[extract]]
+   * @param partitionColumnCount partition column count forwarded to [[extract]]
+   * @param files files to inspect
+   * @return one scan info per input file
+   */
+  def extractAll(
+      spark: SparkSession,
+      partitionColumnCount: Int,
+      files: Seq[PartitionedFile]): Seq[PartitionFileScanInfo] = {
+    for (file <- files) yield extract(spark, partitionColumnCount, file)
+  }
+
+  /**
+   * Java-friendly variant of [[extractAll]]: takes and returns `java.util.List`.
+   *
+   * The result is copied into a fresh `ArrayList`, so it is independent of the Scala collection
+   * produced internally.
+   */
+  def extractAllFromJava(
+      spark: SparkSession,
+      partitionColumnCount: Int,
+      files: java.util.List[PartitionedFile]): java.util.List[PartitionFileScanInfo] = {
+    val scanInfos = extractAll(spark, partitionColumnCount, files.asScala.toSeq)
+    val copied = new JArrayList[PartitionFileScanInfo]()
+    copied.addAll(scanInfos.asJava)
+    copied
+  }
+
+  /**
+   * Decodes the deletion vector entries of `metadata` into a [[DeletionVectorInfo]].
+   *
+   * The descriptor and the filter type must be supplied together. When both are missing the file
+   * is reported as having no deletion vector; when only one is present the metadata is considered
+   * corrupt.
+   *
+   * @throws IllegalStateException if exactly one of the two entries is present
+   */
+  private def extractDeletionVectorInfo(
+      spark: SparkSession,
+      partitionColumnCount: Int,
+      file: PartitionedFile,
+      metadata: Map[String, Object]): DeletionVectorInfo = {
+    val encodedDescriptor = metadata.get(RowIndexFilterIdEncoded)
+    val filterTypeName = metadata.get(RowIndexFilterTypeKey)
+
+    if (encodedDescriptor.isEmpty && filterTypeName.isEmpty) {
+      // No deletion-vector metadata: report KEEP_ALL with an empty payload.
+      DeletionVectorInfo(
+        hasDeletionVector = false,
+        rowIndexFilterType = KEEP_ALL,
+        cardinality = 0L,
+        serializedDeletionVector = Array.emptyByteArray)
+    } else if (encodedDescriptor.isEmpty || filterTypeName.isEmpty) {
+      throw new IllegalStateException(
+        s"Both $RowIndexFilterIdEncoded and $RowIndexFilterTypeKey must either be present or absent")
+    } else {
+      // The descriptor is parsed and its payload loaded before the filter type is parsed.
+      val dvDescriptor = parseDescriptor(encodedDescriptor.get.toString)
+      val payload = serializePayload(spark, partitionColumnCount, file, dvDescriptor)
+      DeletionVectorInfo(
+        hasDeletionVector = true,
+        rowIndexFilterType = parseRowIndexFilterType(filterTypeName.get.toString),
+        cardinality = dvDescriptor.cardinality,
+        serializedDeletionVector = payload)
+    }
+  }
+
+  /**
+   * Reads the file's other constant metadata column values through the Spark shim layer and
+   * returns them as an immutable Scala map. A null result from the shim becomes an empty map.
+   */
+  private def otherMetadataColumns(file: PartitionedFile): Map[String, Object] = {
+    val shimValues = SparkShimLoader.getSparkShims.getOtherConstantMetadataColumnValues(file)
+    Option(shimValues).fold(Map.empty[String, Object])(_.asScala.toMap)
+  }
+
+  /**
+   * Decodes a base64-encoded deletion vector descriptor.
+   *
+   * @throws IllegalArgumentException wrapping any non-fatal failure raised while decoding
+   */
+  private def parseDescriptor(encodedDescriptor: String): DeletionVectorDescriptor = {
+    // Try captures exactly the NonFatal throwables; fatal errors still propagate untouched.
+    scala.util.Try(DeletionVectorDescriptor.deserializeFromBase64(encodedDescriptor)) match {
+      case scala.util.Success(descriptor) => descriptor
+      case scala.util.Failure(err) =>
+        throw new IllegalArgumentException("Unable to parse Delta deletion vector descriptor", err)
+    }
+  }
+
+  /**
+   * Maps the textual filter type found in the file metadata to its [[RowIndexFilterType]] value.
+   *
+   * @throws IllegalStateException if the text is not one of the three known names
+   */
+  private def parseRowIndexFilterType(filterType: String): RowIndexFilterType = {
+    val knownTypes = Map(
+      "IF_CONTAINED" -> IF_CONTAINED,
+      "IF_NOT_CONTAINED" -> IF_NOT_CONTAINED,
+      "KEEP_ALL" -> KEEP_ALL)
+    knownTypes.getOrElse(
+      filterType,
+      throw new IllegalStateException(s"Unexpected row index filter type: $filterType"))
+  }
+
+  /**
+   * Loads the deletion vector described by `descriptor` and serializes it with
+   * `RoaringBitmapArrayFormat.Portable`.
+   *
+   * The table root is resolved from the file's location first, because the stored bitmap is
+   * created relative to it.
+   *
+   * @throws IllegalStateException if no table path could be resolved for `file`
+   */
+  private def serializePayload(
+      spark: SparkSession,
+      partitionColumnCount: Int,
+      file: PartitionedFile,
+      descriptor: DeletionVectorDescriptor): Array[Byte] = {
+    val tableRoot = Option(resolveTablePath(spark, partitionColumnCount, file)).getOrElse {
+      throw new IllegalStateException(
+        "Unable to resolve Delta table path while materializing deletion vector payload")
+    }
+    val store = new HadoopFileSystemDVStore(spark.sessionState.newHadoopConf())
+    val bitmap = StoredBitmap.create(descriptor, tableRoot).load(store)
+    bitmap.serializeAsByteArray(RoaringBitmapArrayFormat.Portable)
+  }
+
+  /**
+   * Locates the Delta table root that `file` belongs to.
+   *
+   * First guess: walk `partitionColumnCount` directories up from the file's parent. If that
+   * directory contains a `_delta_log`, it is returned. Otherwise every ancestor of the file is
+   * probed, nearest first, and the first one containing a `_delta_log` wins.
+   *
+   * @param spark session used to probe the filesystem
+   * @param partitionColumnCount number of directory levels expected between the table root and
+   *                             the file's parent directory
+   * @param file the data file whose table root is wanted
+   * @return the resolved table root; when no ancestor contains a `_delta_log`, the unverified
+   *         first guess, which is null if the walk ran past the filesystem root
+   */
+  private def resolveTablePath(
+      spark: SparkSession,
+      partitionColumnCount: Int,
+      file: PartitionedFile): Path = {
+    val fileParent = new Path(unescapePathName(file.filePath.toString)).getParent
+
+    // Path.getParent returns null at the filesystem root, so stop climbing as soon as the path
+    // runs out instead of dereferencing null when partitionColumnCount exceeds the path depth.
+    var tablePath = fileParent
+    var remaining = partitionColumnCount
+    while (remaining > 0 && tablePath != null) {
+      tablePath = tablePath.getParent
+      remaining -= 1
+    }
+    if (tablePath != null && isDeltaTablePath(spark, tablePath)) {
+      return tablePath
+    }
+
+    // Fallback: the partition depth did not lead to a table root, so search the ancestors.
+    var candidate = fileParent
+    while (candidate != null && !isDeltaTablePath(spark, candidate)) {
+      candidate = candidate.getParent
+    }
+    if (candidate != null) candidate else tablePath
+  }
+
+  /**
+   * Reports whether `tablePath` contains a `_delta_log` entry.
+   *
+   * This is a best-effort probe: any non-fatal failure while reaching the filesystem is treated
+   * as "not a Delta table" rather than propagated.
+   */
+  private def isDeltaTablePath(spark: SparkSession, tablePath: Path): Boolean = {
+    val logDir = new Path(tablePath, "_delta_log")
+    scala.util.Try {
+      val fs = logDir.getFileSystem(spark.sessionState.newHadoopConf())
+      fs.exists(logDir)
+    }.getOrElse(false)
+  }
+
+  /**
+   * Replaces every `%XX` sequence, where `XX` is two hexadecimal digits, with the character whose
+   * code is `0xXX`. A `%` that is not followed by two hex digits is copied through unchanged.
+   *
+   * @param path the text to unescape; may be null
+   * @return the unescaped text, or `path` itself when it is null or contains no `%`
+   */
+  private def unescapePathName(path: String): String = {
+    if (path != null && path.indexOf('%') >= 0) {
+      val length = path.length
+      val decoded = new StringBuilder(length)
+      var pos = 0
+      while (pos < length) {
+        val current = path.charAt(pos)
+        // Only look at the two following characters when they both exist.
+        val escaped = current == '%' && pos + 2 < length
+        val hi = if (escaped) Character.digit(path.charAt(pos + 1), 16) else -1
+        val lo = if (escaped) Character.digit(path.charAt(pos + 2), 16) else -1
+        if (hi >= 0 && lo >= 0) {
+          decoded.append(((hi << 4) | lo).toChar)
+          pos += 3
+        } else {
+          decoded.append(current)
+          pos += 1
+        }
+      }
+      decoded.toString()
+    } else {
+      path
+    }
+  }
+}