Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,31 @@ var buildkiteAgentCommandFlag = &cli.StringFlag{
Hidden: true,
}

// promiseFailureFlag is the opt-in boolean switch for declaring an early
// failure to the Buildkite Agent API. It defaults to false, can be set through
// the BUILDKITE_TEST_ENGINE_PROMISE_FAILURE environment variable, and is stored
// in cfg.PromiseFailure.
var promiseFailureFlag = &cli.BoolFlag{
	Name:        "promise-failure",
	Category:    "TEST ENGINE",
	Sources:     cli.EnvVars("BUILDKITE_TEST_ENGINE_PROMISE_FAILURE"),
	Destination: &cfg.PromiseFailure,
	Value:       false,
	Usage:       "Declare an early failure to the Buildkite Agent API once retries are exhausted and hard (non-muted) failures remain. Opt-in.",
}

// agentEndpointFlag is a hidden string flag holding the Buildkite Agent API
// base URL. It is sourced from BUILDKITE_AGENT_ENDPOINT and stored in
// cfg.AgentEndpoint.
var agentEndpointFlag = &cli.StringFlag{
	Name:        "agent-endpoint",
	Hidden:      true,
	Sources:     cli.EnvVars("BUILDKITE_AGENT_ENDPOINT"),
	Destination: &cfg.AgentEndpoint,
	Usage:       "Base URL of the Buildkite Agent API, used for promise_failure. Defaults to the job's BUILDKITE_AGENT_ENDPOINT.",
}

// agentAccessTokenFlag is a hidden string flag holding the Buildkite Agent API
// access token. It is sourced from BUILDKITE_AGENT_ACCESS_TOKEN and stored in
// cfg.AgentAccessToken.
var agentAccessTokenFlag = &cli.StringFlag{
	Name:        "agent-access-token",
	Hidden:      true,
	Sources:     cli.EnvVars("BUILDKITE_AGENT_ACCESS_TOKEN"),
	Destination: &cfg.AgentAccessToken,
	Usage:       "Buildkite Agent API access token, used for promise_failure. Defaults to the job's BUILDKITE_AGENT_ACCESS_TOKEN.",
}

// `run` command flags
var planIdentifierFlag = &cli.StringFlag{
Name: "plan-identifier",
Expand Down Expand Up @@ -587,6 +612,7 @@ func runCommandFlags() []cli.Flag {
flags = append(flags, runnerEnvironmentFlags...)
flags = append(flags, parallelismFlag)
flags = append(flags, failOnNoTestsFlag)
flags = append(flags, promiseFailureFlag, agentEndpointFlag, agentAccessTokenFlag)
flags = append(flags, previewSelectionFlags()...)
return flags
}
Expand Down
90 changes: 90 additions & 0 deletions internal/agent/agent.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Package agent talks to the Buildkite Agent API (agent.buildkite.com).
//
// This is deliberately separate from internal/api, which talks to the Test
// Engine API with a different base URL and a different access token. The Agent
// API is the service that owns a running job, so it is the only place that can
// accept a "promised failure" for that job.
package agent

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"runtime"
	"strings"

	"github.com/buildkite/test-engine-client/v2/internal/version"
)

// promiseFailureRequest models the JSON payload of a promise_failure call.
//
// Its JSON keys follow the Buildkite Agent API contract exercised by the
// promised-failure-cascade-tests harness (PB-1665). On the wire it looks like:
//
//	{"exit_status": 1, "reason": "test_failure"}
type promiseFailureRequest struct {
	// ExitStatus is serialized under the "exit_status" key.
	ExitStatus int `json:"exit_status"`
	// Reason is serialized under the "reason" key.
	Reason string `json:"reason"`
}

// PromiseFailure tells the Buildkite Agent API that the current job is going to
// finish with a non-zero exit status, before the job actually exits. This lets
// the build "cascade" to failing early.
//
// It mirrors the curl the cascade-test pipelines use:
//
//	PUT {endpoint}/jobs/{jobID}/promise_failure
//	Authorization: Token {accessToken}
//	Content-Type: application/json
//	{"exit_status": 1, "reason": "..."}
//
// endpoint and accessToken come from the job environment (BUILDKITE_AGENT_ENDPOINT
// and BUILDKITE_AGENT_ACCESS_TOKEN), which the agent injects into every job.
// Trailing slashes on endpoint are tolerated, and jobID is path-escaped before
// being placed in the URL.
//
// A nil httpClient falls back to http.DefaultClient, which has no timeout of
// its own; callers that need a bound should put a deadline on ctx.
//
// It returns an error when endpoint, accessToken or jobID is blank, when the
// request cannot be built or sent, or when the response status is not 2xx.
//
// This call is best-effort by contract: callers should log a failure and carry
// on, never changing the test run's real exit status because of a promise error.
func PromiseFailure(ctx context.Context, httpClient *http.Client, endpoint string, accessToken string, jobID string, exitStatus int, reason string) error {
	if endpoint == "" {
		return fmt.Errorf("agent endpoint is blank (is BUILDKITE_AGENT_ENDPOINT set?)")
	}
	if accessToken == "" {
		return fmt.Errorf("agent access token is blank (is BUILDKITE_AGENT_ACCESS_TOKEN set?)")
	}
	if jobID == "" {
		return fmt.Errorf("job ID is blank (is BUILDKITE_JOB_ID set?)")
	}

	if httpClient == nil {
		httpClient = http.DefaultClient
	}

	// Normalize the base so "https://host/v3" and "https://host/v3/" both yield
	// ".../v3/jobs/{id}/promise_failure" rather than a double slash, and escape
	// the job ID so it can never alter the path structure.
	target := fmt.Sprintf("%s/jobs/%s/promise_failure", strings.TrimRight(endpoint, "/"), url.PathEscape(jobID))

	body, err := json.Marshal(promiseFailureRequest{ExitStatus: exitStatus, Reason: reason})
	if err != nil {
		return fmt.Errorf("encoding promise_failure body: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPut, target, bytes.NewReader(body))
	if err != nil {
		return fmt.Errorf("building promise_failure request: %w", err)
	}
	req.Header.Set("Authorization", "Token "+accessToken)
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("User-Agent", fmt.Sprintf(
		"Buildkite Test Engine Client/%s (%s/%s)",
		version.Version, runtime.GOOS, runtime.GOARCH,
	))

	resp, err := httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("sending promise_failure request: %w", err)
	}
	defer func() {
		// Drain a bounded amount of the body before closing so the transport can
		// reuse the connection. Errors here are deliberately ignored: the status
		// code has already been received and nothing below depends on the body.
		const maxDrainBytes = 64 << 10
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxDrainBytes))
		_ = resp.Body.Close()
	}()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("promise_failure returned HTTP %d", resp.StatusCode)
	}

	return nil
}
96 changes: 96 additions & 0 deletions internal/agent/agent_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package agent

import (
"context"
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"strings"
"testing"

"github.com/google/go-cmp/cmp"
)

// TestPromiseFailure_sendsCorrectRequest runs PromiseFailure against a local
// test server and checks the method, path, Authorization and Content-Type
// headers, and the decoded JSON body that the server received.
func TestPromiseFailure_sendsCorrectRequest(t *testing.T) {
	// seen records what the fake server observed for the single request.
	var seen struct {
		method      string
		path        string
		authHeader  string
		contentType string
		payload     map[string]any
	}

	record := func(w http.ResponseWriter, r *http.Request) {
		seen.method = r.Method
		seen.path = r.URL.Path
		seen.authHeader = r.Header.Get("Authorization")
		seen.contentType = r.Header.Get("Content-Type")

		// Read/decode errors are ignored here: a failure leaves payload nil,
		// which the body comparison below reports.
		data, _ := io.ReadAll(r.Body)
		_ = json.Unmarshal(data, &seen.payload)

		w.WriteHeader(http.StatusOK)
	}
	srv := httptest.NewServer(http.HandlerFunc(record))
	defer srv.Close()

	if err := PromiseFailure(context.Background(), srv.Client(), srv.URL, "secret-token", "job-uuid-123", 1, "test_failure"); err != nil {
		t.Fatalf("PromiseFailure returned error: %v", err)
	}

	if seen.method != http.MethodPut {
		t.Errorf("method = %q, want PUT", seen.method)
	}

	const (
		wantPath = "/jobs/job-uuid-123/promise_failure"
		wantAuth = "Token secret-token"
		wantType = "application/json"
	)
	if seen.path != wantPath {
		t.Errorf("path = %q, want %q", seen.path, wantPath)
	}
	if seen.authHeader != wantAuth {
		t.Errorf("Authorization = %q, want %q", seen.authHeader, wantAuth)
	}
	if seen.contentType != wantType {
		t.Errorf("Content-Type = %q, want %q", seen.contentType, wantType)
	}

	// JSON numbers decode into float64 when the target is map[string]any.
	wantPayload := map[string]any{"exit_status": float64(1), "reason": "test_failure"}
	if diff := cmp.Diff(wantPayload, seen.payload); diff != "" {
		t.Errorf("request body diff (-want +got):\n%s", diff)
	}
}

// TestPromiseFailure_returnsErrorOnNon2xx checks that an HTTP 500 from the
// server is surfaced as an error whose message includes the status code.
func TestPromiseFailure_returnsErrorOnNon2xx(t *testing.T) {
	alwaysFail := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
	})
	srv := httptest.NewServer(alwaysFail)
	defer srv.Close()

	err := PromiseFailure(context.Background(), srv.Client(), srv.URL, "tok", "job-1", 1, "test_failure")
	if err == nil {
		t.Fatal("expected an error on HTTP 500, got nil")
	}

	if msg := err.Error(); !strings.Contains(msg, "500") {
		t.Errorf("error = %q, want it to mention 500", msg)
	}
}

// TestPromiseFailure_validatesRequiredArgs checks that PromiseFailure returns
// an error when the endpoint, access token, or job ID is blank.
func TestPromiseFailure_validatesRequiredArgs(t *testing.T) {
	const (
		someEndpoint = "http://example.com"
		someToken    = "tok"
		someJobID    = "job-1"
	)

	cases := []struct {
		name        string
		endpoint    string
		accessToken string
		jobID       string
	}{
		{name: "blank endpoint", endpoint: "", accessToken: someToken, jobID: someJobID},
		{name: "blank token", endpoint: someEndpoint, accessToken: "", jobID: someJobID},
		{name: "blank job ID", endpoint: someEndpoint, accessToken: someToken, jobID: ""},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			if err := PromiseFailure(context.Background(), http.DefaultClient, c.endpoint, c.accessToken, c.jobID, 1, "test_failure"); err == nil {
				t.Fatalf("expected an error for %s, got nil", c.name)
			}
		})
	}
}
40 changes: 40 additions & 0 deletions internal/command/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ import (
"context"
"errors"
"fmt"
"net/http"
"os"
"os/exec"
"strconv"
"syscall"
"time"

"github.com/buildkite/test-engine-client/v2/internal/agent"
"github.com/buildkite/test-engine-client/v2/internal/api"
"github.com/buildkite/test-engine-client/v2/internal/config"
"github.com/buildkite/test-engine-client/v2/internal/debug"
Expand Down Expand Up @@ -83,6 +85,11 @@ func Run(ctx context.Context, cfg *config.Config, testListFilename string) error
logSignalAndExit(testRunner.Name(), ProcessSignaledError.Signal)
}

// Retries are now exhausted. If hard (non-muted) failures remain and the
// opt-in flag is set, declare an early failure to the Buildkite Agent API so
// the build can cascade to failing before this job actually exits.
promiseFailureIfNeeded(ctx, cfg, runResult)

printReport(runResult, testPlan.SkippedTests, testRunner.Name())
if !testPlan.Fallback {
sendMetadata(ctx, apiClient, cfg, timeline, runResult.Statistics())
Expand All @@ -102,6 +109,39 @@ func Run(ctx context.Context, cfg *config.Config, testListFilename string) error
return runErr
}

// promiseFailureIfNeeded declares an early ("promised") failure to the Buildkite
// Agent API when the run finished with hard (non-muted) failures after retries
// and the opt-in flag is enabled.
//
// This is the single point that knows both that retries are exhausted and which
// failures are hard vs muted, so it's the only correct place to promise. Muted
// failures are excluded by FailedTests(), so a muted-only run never promises.
//
// It is best-effort: any error is logged and swallowed so a promise problem
// never changes the test run's real exit status. The request is also bounded by
// a short timeout, because http.DefaultClient has none of its own and an
// unresponsive Agent API must not stall the job before it reports its results.
func promiseFailureIfNeeded(ctx context.Context, cfg *config.Config, runResult runner.RunResult) {
	if !cfg.PromiseFailure {
		return
	}

	failedTests := runResult.FailedTests()
	if len(failedTests) == 0 {
		return
	}

	const (
		promisedExitStatus = 1
		// promiseTimeout caps how long the best-effort call may delay the run.
		promiseTimeout = 10 * time.Second
	)
	reason := fmt.Sprintf("test_failure (%d failed after retries)", len(failedTests))

	fmt.Printf("+++ Buildkite Test Engine Client: ⚠️ Declaring early failure: %d hard test failure(s) remain after retries\n", len(failedTests))

	// Derive from ctx so the caller's cancellation still applies, with the
	// added deadline on top.
	promiseCtx, cancel := context.WithTimeout(ctx, promiseTimeout)
	defer cancel()

	if err := agent.PromiseFailure(promiseCtx, http.DefaultClient, cfg.AgentEndpoint, cfg.AgentAccessToken, cfg.JobID, promisedExitStatus, reason); err != nil {
		fmt.Printf("Buildkite Test Engine Client: Warning: failed to declare early failure: %v\n", err)
		return
	}

	fmt.Println("Buildkite Test Engine Client: Early failure declared to the Buildkite Agent API.")
}

func printStartUpMessage() {
const green = "\033[32m"
const reset = "\033[0m"
Expand Down
9 changes: 9 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,15 @@ import "time"
type Config struct {
// AccessToken is the access token for the API.
AccessToken string `json:"-"`
// AgentAccessToken is the Buildkite Agent API access token, used to authenticate
// promise_failure calls. Injected into the job env as BUILDKITE_AGENT_ACCESS_TOKEN.
AgentAccessToken string `json:"-"`
// AgentEndpoint is the base URL of the Buildkite Agent API (e.g. https://agent.buildkite.com/v3).
// Injected into the job env as BUILDKITE_AGENT_ENDPOINT.
AgentEndpoint string `json:"BUILDKITE_AGENT_ENDPOINT"`
// PromiseFailure, when true, makes bktec declare an early failure to the
// Buildkite Agent API once retries are exhausted and hard failures remain.
PromiseFailure bool `json:"BUILDKITE_TEST_ENGINE_PROMISE_FAILURE"`
// UploadBaseURL is the base URL for the Test Engine analytics API.
UploadBaseURL string `json:"-"`
// Branch is the string value of the git branch name, used by Buildkite only.
Expand Down