4646 description : ' Forked test JVMs per shard (TEST_PARALLELISM_COUNT)'
4747 required : true
4848 default : ' 1'
# Manual-dispatch switch. The resolve step reads it from
# github.event.inputs.update_baseline; when it resolves to "true" the gate
# runs with GATE_MODE=seed instead of GATE_MODE=enforce.
update_baseline:
  type: boolean
  description: ' Seed/refresh the known-failures baseline instead of enforcing it'
  default: false
  required: false
# Manual-dispatch switch. The resolve step reads it from
# github.event.inputs.fail_on_fixed and the resolved value is forwarded
# unchanged to compare-test-results.py as --fail-on-fixed.
fail_on_fixed:
  type: boolean
  description: ' Fail when a baseline test now passes (keeps the baseline honest)'
  default: true
  required: false
4959 pull_request :
5060 paths :
5161 - ' .github/workflows/delta_spark_ut.yml'
7181 DELTA_REF_DEFAULT : ' v4.2.0'
7282 DELTA_SPARK_VERSION_DEFAULT : ' 4.1'
7383 DELTA_TEST_PARALLELISM_DEFAULT : ' 1'
84+ # Default mode for pull_request runs (where inputs.* is empty): enforce the
85+ # committed baseline and fail when a baseline test starts passing. Override
86+ # via the workflow_dispatch inputs above.
87+ DELTA_UPDATE_BASELINE_DEFAULT : ' false'
88+ DELTA_FAIL_ON_FIXED_DEFAULT : ' true'
7489 DELTA_SCALA_VERSION : ' 2.13.16'
7590 # Number of shards in the delta-spark-test matrix. Must equal the length of
7691 # the `shard` matrix below.
@@ -212,13 +227,19 @@ jobs:
212227 delta_ref='${{ github.event.inputs.delta_ref }}'
213228 spark_version='${{ github.event.inputs.spark_version }}'
214229 test_parallelism='${{ github.event.inputs.test_parallelism }}'
230+ update_baseline='${{ github.event.inputs.update_baseline }}'
231+ fail_on_fixed='${{ github.event.inputs.fail_on_fixed }}'
215232 : "${delta_ref:=${DELTA_REF_DEFAULT}}"
216233 : "${spark_version:=${DELTA_SPARK_VERSION_DEFAULT}}"
217234 : "${test_parallelism:=${DELTA_TEST_PARALLELISM_DEFAULT}}"
235+ : "${update_baseline:=${DELTA_UPDATE_BASELINE_DEFAULT}}"
236+ : "${fail_on_fixed:=${DELTA_FAIL_ON_FIXED_DEFAULT}}"
218237 {
219238 echo "delta_ref=${delta_ref}"
220239 echo "spark_version=${spark_version}"
221240 echo "test_parallelism=${test_parallelism}"
241+ echo "update_baseline=${update_baseline}"
242+ echo "fail_on_fixed=${fail_on_fixed}"
222243 } | tee -a "$GITHUB_OUTPUT"
223244
224245 - name : Download Gluten bundle jar
@@ -235,7 +256,7 @@ jobs:
235256 # launcher needs). Install the rest of what Delta's build/sbt and the
236257 # tests may need. We deliberately do NOT install the full `curl`
237258 # package -- it conflicts with the pre-installed curl-minimal.
238- yum install -y java-17-openjdk-devel which findutils gzip
259+ yum install -y java-17-openjdk-devel which findutils gzip python3
239260 export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
240261 export PATH=$JAVA_HOME/bin:$PATH
241262 java -version
@@ -341,13 +362,54 @@ jobs:
341362 # Delta's own Test/javaOptions seq so our `-Xmx6G` comes AFTER
342363 # `-Xmx1024m` and wins (last `-Xmx` wins). We also turn on heap
343364 # dump on OOM so if it happens again we can analyze the dump.
365+ # `-u target/test-reports` enables ScalaTest's JUnit XML reporter so
366+ # every suite writes per-test results. Delta itself only configures
367+ # the console reporter (-oDF), so without this we'd have no machine-
368+ # readable results to gate on. The path is relative to the forked
369+ # test JVM's working dir (Test / baseDirectory = spark/), i.e.
370+ # delta/spark/target/test-reports/TEST-*.xml.
371+ #
372+ # We deliberately do NOT let an sbt non-zero exit (which fires on the
373+ # MANY expected Delta-on-Gluten failures) fail this step directly.
374+ # Instead the known-failures gate below decides pass/fail: the build
375+ # is green when the only failures are ones already recorded in the
376+ # baseline, and red on a genuine regression.
377+ set +e
344378 ./build/sbt \
345379 -DsparkVersion=${{ steps.resolve.outputs.spark_version }} \
346380 -v \
347381 -J-XX:+UseG1GC -J-Xmx4G \
348382 "++ ${DELTA_SCALA_VERSION}" \
349383 'set spark / Test / javaOptions ++= Seq("-Xmx6G", "-XX:+HeapDumpOnOutOfMemoryError", "-XX:HeapDumpPath=/tmp/")' \
384+ 'set spark / Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-u", "target/test-reports")' \
350385 "spark/test"
386+ SBT_EXIT=$?
387+ set -e
388+ echo "sbt spark/test exited with ${SBT_EXIT}"
389+
390+ # A compile/launch failure leaves no reports at all. In that case the
391+ # gate would see zero failures and pass spuriously, so fail loudly.
392+ REPORT_COUNT=$(find . -path '*/target/test-reports/*.xml' 2>/dev/null | wc -l || true)
393+ echo "Found ${REPORT_COUNT} JUnit XML report file(s)."
394+ if [ "${REPORT_COUNT}" -eq 0 ]; then
395+ echo "::error::sbt produced no test reports (exit ${SBT_EXIT}) -- likely a compile or launch failure, not test failures."
396+ exit 1
397+ fi
398+
399+ # update_baseline=true -> SEED mode (record failures, never fail) so the
400+ # baseline can be (re)generated. Otherwise ENFORCE against the baseline.
401+ GATE_MODE=enforce
402+ if [ "${{ steps.resolve.outputs.update_baseline }}" = "true" ]; then
403+ GATE_MODE=seed
404+ fi
405+ mkdir -p "$GITHUB_WORKSPACE/gate-out"
406+ python3 "$GITHUB_WORKSPACE/.github/workflows/util/delta-spark-ut/compare-test-results.py" \
407+ --mode "${GATE_MODE}" \
408+ --reports-dir "$GITHUB_WORKSPACE/delta" \
409+ --known-failures "$GITHUB_WORKSPACE/.github/workflows/util/delta-spark-ut/known-failures.txt" \
410+ --failures-out "$GITHUB_WORKSPACE/gate-out/failures-shard-${{ matrix.shard }}.txt" \
411+ --ran-out "$GITHUB_WORKSPACE/gate-out/ran-shard-${{ matrix.shard }}.txt" \
412+ --fail-on-fixed "${{ steps.resolve.outputs.fail_on_fixed }}"
351413
352414 - name : Compress heap dumps (if any)
353415 if : ${{ failure() }}
@@ -364,6 +426,14 @@ jobs:
364426 echo "No heap dumps found in /tmp/."
365427 fi
366428
# Publishes this shard's gate output (the failures-shard-N.txt and
# ran-shard-N.txt files written into gate-out/) under a shard-specific artifact
# name; the aggregate job collects them with the
# delta-spark-ut-gate-lists-shard-* pattern. Runs regardless of earlier step
# results so the lists are still uploaded when the gate itself went red.
- name: Upload per-shard gate lists
  uses: actions/upload-artifact@v4
  if: ${{ always() }}
  with:
    if-no-files-found: warn
    name: delta-spark-ut-gate-lists-shard-${{ matrix.shard }}
    path: gate-out/*.txt
436+
367437 - name : Upload test reports
368438 if : always()
369439 uses : actions/upload-artifact@v4
@@ -386,3 +456,37 @@ jobs:
386456 /tmp/*.hprof
387457 /tmp/*.hprof.gz
388458 if-no-files-found : ignore
459+
# Merges every shard's failure/ran lists into a single, sorted, ready-to-commit
# known-failures.txt and reports global regressions / now-passing / stale
# entries. This is the artifact you download and commit to bootstrap or refresh
# the baseline (see util/delta-spark-ut/README.md).
#
# The job condition is `!cancelled()` rather than `always()`: it still runs when
# some shards went red, so the refreshed baseline is available after failures,
# but a cancelled run does not go on to check out sources and publish a
# baseline merged from an incomplete set of shards.
delta-spark-aggregate:
  needs: delta-spark-test
  if: ${{ !cancelled() }}
  runs-on: ubuntu-22.04
  steps:
    - uses: actions/checkout@v4
    # Best-effort download: a failure here must not stop the aggregate step
    # from running. merge-multiple flattens all shard artifacts into one
    # gate-lists/ directory.
    - name: Download per-shard gate lists
      uses: actions/download-artifact@v4
      continue-on-error: true
      with:
        pattern: delta-spark-ut-gate-lists-shard-*
        path: gate-lists
        merge-multiple: true
    - name: Aggregate known failures
      run: |
        set -euo pipefail
        # Create the output directory up front, mirroring what the per-shard
        # gate does for gate-out/, so writing --baseline-out cannot fail on a
        # missing parent directory. Idempotent if the script creates it too.
        mkdir -p aggregated
        python3 .github/workflows/util/delta-spark-ut/compare-test-results.py \
          --mode aggregate \
          --inputs-dir gate-lists \
          --known-failures .github/workflows/util/delta-spark-ut/known-failures.txt \
          --baseline-out aggregated/known-failures.txt
    # Uploaded even when the aggregate step fails; a missing file only warns.
    - name: Upload refreshed baseline
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: delta-spark-ut-known-failures
        path: aggregated/known-failures.txt
        if-no-files-found: warn
0 commit comments