apache
diff --git a/‎.github/workflows/delta_spark_ut.yml‎
Lines changed: 105 additions & 1 deletion b/‎.github/workflows/delta_spark_ut.yml‎
Lines changed: 105 additions & 1 deletion
diff --git a/‎.github/workflows/util/delta-spark-ut/README.md‎
Lines changed: 112 additions & 0 deletions b/‎.github/workflows/util/delta-spark-ut/README.md‎
Lines changed: 112 additions & 0 deletions
@@ -46,6 +46,16 @@ on:
         description: 'Forked test JVMs per shard (TEST_PARALLELISM_COUNT)'
         required: true
         default: '1'
+      # Manual-dispatch switch: when 'true' the gate step runs in seed mode
+      # (records failures) instead of enforcing the committed baseline.
+      update_baseline:
+        type: boolean
+        required: false
+        default: false
+        description: 'Seed/refresh the known-failures baseline instead of enforcing it'
+      # Manual-dispatch switch forwarded to the gate script as --fail-on-fixed.
+      fail_on_fixed:
+        type: boolean
+        required: false
+        default: true
+        description: 'Fail when a baseline test now passes (keeps the baseline honest)'
   pull_request:
     paths:
       - '.github/workflows/delta_spark_ut.yml'
@@ -71,6 +81,11 @@ env:
   DELTA_REF_DEFAULT: 'v4.2.0'
   DELTA_SPARK_VERSION_DEFAULT: '4.1'
   DELTA_TEST_PARALLELISM_DEFAULT: '1'
+  # Default mode for pull_request runs (where inputs.* is empty): enforce the
+  # committed baseline and fail when a baseline test starts passing. Override
+  # via the workflow_dispatch inputs above.
+  DELTA_UPDATE_BASELINE_DEFAULT: 'false'
+  DELTA_FAIL_ON_FIXED_DEFAULT: 'true'
   DELTA_SCALA_VERSION: '2.13.16'
   # Number of shards in the delta-spark-test matrix. Must equal the length of
   # the `shard` matrix below.
@@ -212,13 +227,19 @@ jobs:
           delta_ref='${{ github.event.inputs.delta_ref }}'
           spark_version='${{ github.event.inputs.spark_version }}'
           test_parallelism='${{ github.event.inputs.test_parallelism }}'
+          update_baseline='${{ github.event.inputs.update_baseline }}'
+          fail_on_fixed='${{ github.event.inputs.fail_on_fixed }}'
           : "${delta_ref:=${DELTA_REF_DEFAULT}}"
           : "${spark_version:=${DELTA_SPARK_VERSION_DEFAULT}}"
           : "${test_parallelism:=${DELTA_TEST_PARALLELISM_DEFAULT}}"
+          : "${update_baseline:=${DELTA_UPDATE_BASELINE_DEFAULT}}"
+          : "${fail_on_fixed:=${DELTA_FAIL_ON_FIXED_DEFAULT}}"
           {
             echo "delta_ref=${delta_ref}"
             echo "spark_version=${spark_version}"
             echo "test_parallelism=${test_parallelism}"
+            echo "update_baseline=${update_baseline}"
+            echo "fail_on_fixed=${fail_on_fixed}"
           } | tee -a "$GITHUB_OUTPUT"
 
       - name: Download Gluten bundle jar
@@ -235,7 +256,7 @@ jobs:
           # launcher needs). Install the rest of what Delta's build/sbt and the
           # tests may need. We deliberately do NOT install the full `curl`
           # package -- it conflicts with the pre-installed curl-minimal.
-          yum install -y java-17-openjdk-devel which findutils gzip
+          yum install -y java-17-openjdk-devel which findutils gzip python3
           export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
           export PATH=$JAVA_HOME/bin:$PATH
           java -version
@@ -341,13 +362,54 @@ jobs:
           #   Delta's own Test/javaOptions seq so our `-Xmx6G` comes AFTER
           #   `-Xmx1024m` and wins (last `-Xmx` wins). We also turn on heap
           #   dump on OOM so if it happens again we can analyze the dump.
+          # `-u target/test-reports` enables ScalaTest's JUnit XML reporter so
+          # every suite writes per-test results. Delta itself only configures
+          # the console reporter (-oDF), so without this we'd have no machine-
+          # readable results to gate on. The path is relative to the forked
+          # test JVM's working dir (Test / baseDirectory = spark/), i.e.
+          # delta/spark/target/test-reports/TEST-*.xml.
+          #
+          # We deliberately do NOT let an sbt non-zero exit (which fires on the
+          # MANY expected Delta-on-Gluten failures) fail this step directly.
+          # Instead the known-failures gate below decides pass/fail: the build
+          # is green when the only failures are ones already recorded in the
+          # baseline, and red on a genuine regression.
+          set +e
           ./build/sbt \
             -DsparkVersion=${{ steps.resolve.outputs.spark_version }} \
             -v \
             -J-XX:+UseG1GC -J-Xmx4G \
             "++ ${DELTA_SCALA_VERSION}" \
             'set spark / Test / javaOptions ++= Seq("-Xmx6G", "-XX:+HeapDumpOnOutOfMemoryError", "-XX:HeapDumpPath=/tmp/")' \
+            'set spark / Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-u", "target/test-reports")' \
             "spark/test"
+          SBT_EXIT=$?
+          set -e
+          echo "sbt spark/test exited with ${SBT_EXIT}"
+
+          # A compile/launch failure leaves no reports at all. In that case the
+          # gate would see zero failures and pass spuriously, so fail loudly.
+          REPORT_COUNT=$(find . -path '*/target/test-reports/*.xml' 2>/dev/null | wc -l || true)
+          echo "Found ${REPORT_COUNT} JUnit XML report file(s)."
+          if [ "${REPORT_COUNT}" -eq 0 ]; then
+            echo "::error::sbt produced no test reports (exit ${SBT_EXIT}) -- likely a compile or launch failure, not test failures."
+            exit 1
+          fi
+
+          # update_baseline=true -> SEED mode (record failures, never fail) so the
+          # baseline can be (re)generated. Otherwise ENFORCE against the baseline.
+          GATE_MODE=enforce
+          if [ "${{ steps.resolve.outputs.update_baseline }}" = "true" ]; then
+            GATE_MODE=seed
+          fi
+          mkdir -p "$GITHUB_WORKSPACE/gate-out"
+          python3 "$GITHUB_WORKSPACE/.github/workflows/util/delta-spark-ut/compare-test-results.py" \
+            --mode "${GATE_MODE}" \
+            --reports-dir "$GITHUB_WORKSPACE/delta" \
+            --known-failures "$GITHUB_WORKSPACE/.github/workflows/util/delta-spark-ut/known-failures.txt" \
+            --failures-out "$GITHUB_WORKSPACE/gate-out/failures-shard-${{ matrix.shard }}.txt" \
+            --ran-out "$GITHUB_WORKSPACE/gate-out/ran-shard-${{ matrix.shard }}.txt" \
+            --fail-on-fixed "${{ steps.resolve.outputs.fail_on_fixed }}"
 
       - name: Compress heap dumps (if any)
         if: ${{ failure() }}
@@ -364,6 +426,14 @@ jobs:
             echo "No heap dumps found in /tmp/."
           fi
 
+      # Publish whatever the gate step wrote to gate-out/ for this shard, even
+      # when an earlier step failed; a missing file only produces a warning.
+      - name: Upload per-shard gate lists
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: warn
+          path: gate-out/*.txt
+          name: delta-spark-ut-gate-lists-shard-${{ matrix.shard }}
+
       - name: Upload test reports
         if: always()
         uses: actions/upload-artifact@v4
@@ -386,3 +456,37 @@ jobs:
             /tmp/*.hprof
             /tmp/*.hprof.gz
           if-no-files-found: ignore
+
+  # Merges every shard's failure/ran lists into a single, sorted, ready-to-commit
+  # known-failures.txt and reports global regressions / now-passing / stale
+  # entries. Runs even when some shards went red (but not when the run itself
+  # was cancelled) so the refreshed baseline artifact is available after every
+  # completed run -- this is what you download and commit to bootstrap or
+  # refresh the baseline (see util/delta-spark-ut/README.md).
+  delta-spark-aggregate:
+    needs: delta-spark-test
+    # `!cancelled()` instead of `always()`: still runs after failed shards, but
+    # does not start a fresh runner for a workflow run that was cancelled.
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download per-shard gate lists
+        uses: actions/download-artifact@v4
+        # Best-effort: a download error must not stop the aggregate step from
+        # running; the empty-input case is surfaced as a warning below.
+        continue-on-error: true
+        with:
+          pattern: delta-spark-ut-gate-lists-shard-*
+          path: gate-lists
+          merge-multiple: true
+      - name: Aggregate known failures
+        run: |
+          set -euo pipefail
+          # If no shard uploaded any list (e.g. every shard died before the
+          # gate ran) there is nothing to merge. Say so explicitly so nobody
+          # commits the resulting artifact as a refreshed baseline.
+          if [ -z "$(find gate-lists -type f -name '*.txt' 2>/dev/null)" ]; then
+            echo "::warning::No per-shard gate lists were downloaded; the aggregated known-failures.txt will not reflect this run -- do not commit it."
+          fi
+          python3 .github/workflows/util/delta-spark-ut/compare-test-results.py \
+            --mode aggregate \
+            --inputs-dir gate-lists \
+            --known-failures .github/workflows/util/delta-spark-ut/known-failures.txt \
+            --baseline-out aggregated/known-failures.txt
+      - name: Upload refreshed baseline
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: delta-spark-ut-known-failures
+          path: aggregated/known-failures.txt
+          if-no-files-found: warn
@@ -0,0 +1,112 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Delta Spark UT (Gluten) — managing expected failures
+
+Running delta-io/delta's `spark` ScalaTest suite against the Gluten Velox
+bundle produces **many expected failures**: Gluten does not yet offload every
+Delta code path, and falls back or behaves differently in places. If CI simply
+went red on any failure, the signal would be useless and we could never tell a
+*new* breakage from the hundreds of already-known ones.
+
+To make this manageable we keep a **baseline of known failures** and gate each
+run against it. The build is green when the only failing tests are ones already
+recorded in the baseline; it goes red the moment a **previously-passing test
+starts failing** (a regression).
+
+## Files
+
+| File | Purpose |
+|---|---|
+| `known-failures.txt` | Committed baseline: the tests currently expected to fail. One `<suite>#<test>` per line. |
+| `compare-test-results.py` | Parses the JUnit XML from `sbt spark/test` and gates / seeds / aggregates against the baseline. Standard-library only. |
+| `setup-delta.sh` | Clones Delta, drops in the Gluten bundle, and patches `DeltaSQLCommandTest`. |
+
+## How the gate works
+
+Each test shard:
+
+1. Runs `sbt spark/test` with ScalaTest's JUnit XML reporter enabled
+   (`-u target/test-reports`), so every suite writes per-test results. (Delta
+   itself only configures the console reporter, so the workflow injects this.)
+2. Runs `compare-test-results.py --mode enforce`, which classifies every test:
+   - **regression** — failed, but not in the baseline → **fails the shard**.
+   - **expected** — failed and in the baseline → ignored.
+   - **now-passing** — in the baseline but passed this run → fails the shard
+     (so the baseline is kept honest), unless `fail_on_fixed=false`.
+
+A final `aggregate` job merges every shard's results into a single, sorted,
+ready-to-commit `known-failures.txt` artifact and reports **stale** baseline
+entries (tests no longer present in any shard, e.g. after a Delta version bump).
+
+Because Delta shards **by suite**, every suite (and therefore every test) runs
+in exactly one shard, so per-shard enforcement sees complete suites and never
+double-counts.
+
+## Bootstrapping the baseline (first time)
+
+While `known-failures.txt` has no entries the gate auto-runs in **seed mode**
+(it never fails — it only records failures). To create the initial baseline:
+
+1. Trigger **Actions → Delta Spark UT (Gluten) → Run workflow** with
+   `update_baseline = true`.
+2. When it finishes, download the **`delta-spark-ut-known-failures`** artifact.
+3. Replace `known-failures.txt` with the file from that artifact and commit it.
+
+From the next run onward the gate enforces the baseline.
+
+## Day-to-day: fixing tests incrementally
+
+- **You fixed Gluten and some Delta tests now pass.** CI will flag them as
+  *now-passing*. Delete those lines from `known-failures.txt` in your PR. That
+  is the whole point — the baseline should steadily shrink as coverage improves.
+- **You intentionally added a new expected failure** (e.g. a Delta path Gluten
+  can't offload yet). Add the exact `Suite#test` line(s) the gate prints under
+  *Regressions* to `known-failures.txt`, ideally with a comment explaining why.
+- **A genuine regression.** Fix it; do **not** add it to the baseline.
+
+The error log prints copy-pasteable `Suite#test` lines for both regressions and
+now-passing tests, and each run's job summary shows the full breakdown.
+
+## Regenerating / refreshing the whole baseline
+
+After a Delta version bump or a large Gluten change, regenerate from scratch the
+same way as bootstrapping: run the workflow with `update_baseline=true`, download
+the `delta-spark-ut-known-failures` artifact, and commit it. The aggregate job
+also lists **stale** entries you can prune.
+
+## Caveats
+
+- **Flaky tests.** A flaky test that usually passes will be flagged as a
+  regression when it flakes; one that usually fails (and is in the baseline)
+  may be flagged as now-passing when it happens to pass. Re-run, or set
+  `fail_on_fixed=false` for that run, and keep genuinely flaky tests out of the
+  enforced set.
+- **Known failures still execute** (and fail) — they are gated *after* the run,
+  not skipped — so they still consume CI time. This keeps our patches to
+  Delta's sources minimal; skipping them at runtime would require further
+  patching of Delta.
+
+## Running the comparison locally
+
+```bash
+# after an sbt spark/test run that wrote delta/**/target/test-reports/*.xml
+python3 .github/workflows/util/delta-spark-ut/compare-test-results.py \
+  --mode enforce \
+  --reports-dir delta \
+  --known-failures .github/workflows/util/delta-spark-ut/known-failures.txt \
+  --failures-out /tmp/failures.txt --ran-out /tmp/ran.txt
+```