Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 88 additions & 1 deletion auth/oauth/oauth.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,21 @@ package oauth

import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"strings"
"time"

"github.com/coreos/go-oidc/v3/oidc"
"golang.org/x/oauth2"
)

// hostConfigTimeout bounds the /.well-known/databricks-config lookup so it cannot
// stall connection setup; on any failure we fall back to bare-host OIDC discovery.
const hostConfigTimeout = 30 * time.Second

var azureTenants = map[string]string{
".dev.azuredatabricks.net": "62a912ac-b58e-4c1d-89ea-b2dbfc7358fc",
".staging.azuredatabricks.net": "4a67d088-db5c-48f1-9ff2-0aace800ae68",
Expand All @@ -35,7 +42,12 @@ func GetEndpoint(ctx context.Context, hostName string) (oauth2.Endpoint, error)
return oauth2.Endpoint{AuthURL: authURL, TokenURL: tokenURL}, nil
}

issuerURL := fmt.Sprintf("https://%s/oidc", hostName)
// AWS / GCP. Resolve the OIDC issuer via /.well-known/databricks-config so that
// unified / SPOG hosts (one host fronting workspaces across multiple accounts)
// use their account-rooted endpoint instead of the account-agnostic console login.
// For normal workspace hosts this resolves to https://<host>/oidc, identical to
// the previous behavior.
issuerURL := resolveOIDCIssuer(ctx, hostName)
ctx = oidc.InsecureIssuerURLContext(ctx, issuerURL)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] oidc.InsecureIssuerURLContext disables the check that the discovered config's issuer field matches the requested issuer. Pre-PR this was benign because issuerURL was always pinned to https://<host>/oidc. Post-PR the issuer is taken from a host-supplied JSON document (line 50) and may point at a different host. Not exploitable without controlling the target host's HTTPS response, but it widens the trust surface and deserves an explicit decision:

  • validate the resolved issuer host against an allowlist of Databricks suffixes before passing to NewProvider, or
  • drop the InsecureIssuerURLContext override on the AWS/GCP path now that the issuer is resolved dynamically, so go-oidc re-enforces issuer matching.

Posted by code-review-squad · /full-review · feedback: #code-review-squad-feedback

provider, err := oidc.NewProvider(ctx, issuerURL)
if err != nil {
Expand All @@ -47,6 +59,71 @@ func GetEndpoint(ctx context.Context, hostName string) (oauth2.Endpoint, error)
return endpoint, err
}

// hostMetadata is the subset of /.well-known/databricks-config we consume.
type hostMetadata struct {
OIDCEndpoint string `json:"oidc_endpoint"`
AccountID string `json:"account_id"`
}

// resolveOIDCIssuer returns the OIDC issuer URL to use for AWS/GCP OAuth discovery.
//
// On a unified / SPOG host, the bare-host OIDC discovery doc points at the
// account-agnostic account-console login. That mints a token for the caller's
// default account, which the target workspace rejects ("Invalid Token") when the
// workspace belongs to a different account. Such hosts advertise the correct,
// account-rooted OIDC endpoint via /.well-known/databricks-config (with an
// {account_id} placeholder); we consult it and substitute the account id.
//
// For a normal workspace host the advertised endpoint is just https://<host>/oidc,
// so the result is identical to the historical bare-host issuer. Any failure
// (endpoint absent, non-200, unparseable, missing field, timeout) falls back to
// the bare-host issuer, preserving existing behavior.
func resolveOIDCIssuer(ctx context.Context, hostName string) string {
fallback := fmt.Sprintf("https://%s/oidc", hostName)

url := fmt.Sprintf("https://%s/.well-known/databricks-config", hostName)
client := &http.Client{Timeout: hostConfigTimeout}

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] &http.Client{Timeout: hostConfigTimeout} has a nil Transport, so it uses http.DefaultTransport — bypassing WithTransport, WithSkipTLSHostVerify (connector.go:378-399), and the driver's PooledTransport. A user behind a corporate proxy (programmatic transport), with a custom CA / mTLS, or with InsecureSkipVerify set in dev will have this fetch fail TLS/connect and silently fall back to bare-host discovery — the exact broken path this PR fixes — with no diagnostic. (Env-var proxies still work; http.DefaultTransport honors HTTPS_PROXY.)

Not a regression: the pre-existing oidc.NewProvider(ctx, …) already uses the default client (no oidc.ClientContext installed). But it does mean the fix is a no-op in exactly the enterprise/dev environments where SPOG is most likely exercised. fetchHostMetadata already takes an injected *http.Client; thread the driver's configured client/transport (or at least cfg.TLSConfig) down through GetEndpoint → resolveOIDCIssuer. This also closes the testability gap on resolveOIDCIssuer.


Posted by code-review-squad · /full-review · feedback: #code-review-squad-feedback

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[LOW] Every AWS/GCP GetEndpoint now makes a synchronous /.well-known/databricks-config call before the existing OIDC discovery, on the connection-setup path. It is bounded only by the fixed 30s hostConfigTimeout — both production callers pass context.Background(), so the caller's deadline doesn't tighten it (though fetchHostMetadata does honor ctx cancellation via NewRequestWithContext). The result is immutable per host but is re-fetched on every authenticator construction with no caching.

Consider: deriving the timeout from the incoming ctx, a tighter default, and memoizing per host. databricks-sdk-go resolves host metadata once per Config and reuses it. (architecture + security + devil's-advocate)


Posted by code-review-squad · /full-review · feedback: #code-review-squad-feedback


meta, ok := fetchHostMetadata(ctx, client, url)
if !ok || meta.OIDCEndpoint == "" {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[HIGH] resolveOIDCIssuer guards only !ok || meta.OIDCEndpoint == ""; it never checks meta.AccountID. If metadata returns oidc_endpoint: ".../accounts/{account_id}" with account_id: "", substituteAccountID (line 98) yields .../accounts/ — an account-less, malformed issuer. Execution has already committed past the fallback here, so oidc.NewProvider fails and GetEndpoint returns a hard error rather than degrading to the bare host.

This directly contradicts this function's own doc comment: "Any failure (… missing field …) falls back to the bare-host issuer."

Fix: when the post-substitution string still contains {account_id} (or OIDCEndpoint has the placeholder while AccountID == ""), return fallback. Flagged independently by the devil's-advocate, test, language, and architecture reviewers.


Posted by code-review-squad · /full-review · feedback: #code-review-squad-feedback

return fallback

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] Every non-200, redirect-to-HTML, malformed JSON, or wrong-account_id response collapses to the same silent bare-host fallback. A genuinely misconfigured SPOG host then re-produces the opaque 400 Invalid Token this PR exists to eliminate, with zero breadcrumb as to why.

The repo already uses zerolog/log in these packages — a log.Debug()/log.Warn() on the fallback path would pay for itself in support time. (security + devil's-advocate)


Posted by code-review-squad · /full-review · feedback: #code-review-squad-feedback

}

return substituteAccountID(meta)
}

// substituteAccountID resolves the {account_id} placeholder in the advertised
// oidc_endpoint. Workspace hosts have no placeholder and are returned unchanged.
func substituteAccountID(meta hostMetadata) string {
return strings.ReplaceAll(meta.OIDCEndpoint, "{account_id}", meta.AccountID)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[LOW] substituteAccountID returns meta.OIDCEndpoint verbatim with no scheme check. If the metadata doc returns oidc_endpoint: "http://.../oidc", oidc.NewProvider issues a plaintext discovery GET and the derived authorize/token URLs are HTTP — OAuth traffic in cleartext. The hard-coded fallback is always HTTPS, so this only appears on the new metadata-driven path.

Suggested fix

a one-line strings.HasPrefix(resolved, "https://") guard (fall back otherwise) closes it and matches the stated "fall back on anything unexpected" intent. Gated on controlling the (HTTPS-served) metadata response.

--- Posted by code-review-squad · /full-review · feedback: #code-review-squad-feedback

}

// fetchHostMetadata GETs /.well-known/databricks-config and decodes it. The bool
// is false on any failure (request error, non-200, unparseable body) so callers
// fall back to bare-host discovery.
func fetchHostMetadata(ctx context.Context, client *http.Client, url string) (hostMetadata, bool) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return hostMetadata{}, false
}

resp, err := client.Do(req)
if err != nil {
return hostMetadata{}, false
}
defer resp.Body.Close() //nolint:errcheck

if resp.StatusCode != http.StatusOK {
return hostMetadata{}, false
}

var meta hostMetadata
if err := json.NewDecoder(resp.Body).Decode(&meta); err != nil {
return hostMetadata{}, false
}
return meta, true
}

func GetScopes(hostName string, scopes []string) []string {
for _, s := range []string{oidc.ScopeOfflineAccess} {
if !HasScope(scopes, s) {
Expand Down Expand Up @@ -135,6 +212,16 @@ func InferCloudFromHost(hostname string) CloudType {
return GCP
}
}

// Unified / SPOG (Single Pane of Glass) AWS hosts use bare *.databricks.com
// custom URLs (e.g. <name>.databricks.com, <name>.staging.databricks.com) that
// match none of the lists above. Treat them as AWS. This is checked last so the
// more specific Azure (.azuredatabricks.net) and GCP (.gcp.databricks.com) hosts
// are classified first.
if strings.Contains(hostname, "databricks.com") {
return AWS
}

return Unknown
}

Expand Down
137 changes: 137 additions & 0 deletions auth/oauth/oauth_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
package oauth

import (
"context"
"net/http"
"net/http/httptest"
"testing"
)

func TestInferCloudFromHost(t *testing.T) {
cases := []struct {
host string
want CloudType
}{
// Standard per-workspace hosts.
{"dbc-1234.cloud.databricks.com", AWS},
{"example.cloud.databricks.us", AWS},
{"foo.dev.databricks.com", AWS},
{"adb-123.azuredatabricks.net", Azure},
{"x.databricks.azure.us", Azure},
{"y.databricks.azure.cn", Azure},
{"ws.gcp.databricks.com", GCP},
// SPOG / unified custom-URL AWS hosts (the fix): must classify as AWS,
// not Unknown, and must NOT be swallowed by the GCP/Azure checks.
{"pecoaws.databricks.com", AWS},
{"dogfood.staging.databricks.com", AWS},
// Azure SPOG stays Azure.
{"dogfood-spog.staging.azuredatabricks.net", Azure},
// GCP custom host must remain GCP even though it contains "databricks.com".
{"foo.gcp.databricks.com", GCP},
// Truly unrelated host stays Unknown.
{"example.com", Unknown},
}

for _, tc := range cases {
t.Run(tc.host, func(t *testing.T) {
if got := InferCloudFromHost(tc.host); got != tc.want {
t.Fatalf("InferCloudFromHost(%q) = %v, want %v", tc.host, got, tc.want)
}
})
}
}

func TestGetAzureDnsZone(t *testing.T) {
// Documents current behavior: the generic suffix is matched first, so staging
// and dev Azure hosts resolve to the prod tenant. (Separate fix tracked.)
cases := []struct {
host string
want string
}{
{"adb-123.azuredatabricks.net", ".azuredatabricks.net"},
{"x.databricks.azure.us", ".databricks.azure.us"},
{"nope.example.com", ""},
}
for _, tc := range cases {
t.Run(tc.host, func(t *testing.T) {
if got := GetAzureDnsZone(tc.host); got != tc.want {
t.Fatalf("GetAzureDnsZone(%q) = %q, want %q", tc.host, got, tc.want)
}
})
}
}

func TestResolveOIDCIssuer_substitutesAccountID(t *testing.T) {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] TestResolveOIDCIssuer_substitutesAccountID (and _workspaceHostUnchanged at line 90) are misnamed: they call fetchHostMetadata + substituteAccountID directly and never call resolveOIDCIssuer. So the actual fallback wiring (!ok || OIDCEndpoint == "" → bare host), the URL-format construction, the oidc_endpoint == "" guard, and the empty-account_id case (see the High finding) are all uncovered. TestFetchHostMetadata_failuresFallBack proves failure detection (ok=false), not the fallback string mapping. And GetEndpoint end-to-end — the M2M TokenURL actually coming out account-rooted, the whole point of the PR — is untested.

Suggested fix

add a table-driven resolveOIDCIssuer test against an httptest server covering substitute / unchanged / empty-endpoint-fallback / failure-fallback / empty-account_id. (test + devil's-advocate)

--- Posted by code-review-squad · /full-review · feedback: #code-review-squad-feedback

// Unified / SPOG host advertises an account-rooted endpoint with a placeholder.
srv := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/.well-known/databricks-config" {
w.WriteHeader(http.StatusNotFound)
return
}
_, _ = w.Write([]byte(`{
"oidc_endpoint": "https://spog.example.com/oidc/accounts/{account_id}",
"account_id": "7a99b43c-b46c-432b-b0a7-814217701909",
"host_type": "unified"
}`))
}))
defer srv.Close()

meta, ok := fetchHostMetadata(context.Background(), srv.Client(), srv.URL+"/.well-known/databricks-config")
if !ok {
t.Fatal("fetchHostMetadata returned ok=false, want true")
}
got := substituteAccountID(meta)
want := "https://spog.example.com/oidc/accounts/7a99b43c-b46c-432b-b0a7-814217701909"
if got != want {
t.Fatalf("issuer = %q, want %q", got, want)
}
}

func TestResolveOIDCIssuer_workspaceHostUnchanged(t *testing.T) {
// Normal workspace host: endpoint has no placeholder, so it is returned as-is
// (equivalent to the historical https://<host>/oidc).
srv := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(`{
"oidc_endpoint": "https://ws.cloud.databricks.com/oidc",
"account_id": "7a99b43c-b46c-432b-b0a7-814217701909",
"host_type": "workspace"
}`))
}))
defer srv.Close()

meta, ok := fetchHostMetadata(context.Background(), srv.Client(), srv.URL+"/.well-known/databricks-config")
if !ok {
t.Fatal("fetchHostMetadata returned ok=false, want true")
}
if got := substituteAccountID(meta); got != "https://ws.cloud.databricks.com/oidc" {
t.Fatalf("issuer = %q, want unchanged workspace endpoint", got)
}
}

func TestFetchHostMetadata_failuresFallBack(t *testing.T) {
t.Run("404", func(t *testing.T) {
srv := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
if _, ok := fetchHostMetadata(context.Background(), srv.Client(), srv.URL); ok {
t.Fatal("ok=true on 404, want false (fallback)")
}
})

t.Run("garbage body", func(t *testing.T) {
srv := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("not json"))
}))
defer srv.Close()
if _, ok := fetchHostMetadata(context.Background(), srv.Client(), srv.URL); ok {
t.Fatal("ok=true on garbage body, want false (fallback)")
}
})

t.Run("unreachable", func(t *testing.T) {
if _, ok := fetchHostMetadata(context.Background(), &http.Client{}, "https://127.0.0.1:1/nope"); ok {
t.Fatal("ok=true on unreachable host, want false (fallback)")
}
})
}
Loading