Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
c82083f
feat(sync): [DC-216839]: use HTTP/2 with automatic HTTP/1.1 fallback …
mrudnoru Apr 29, 2026
ee96f23
refactor(sync): extract http2 fallback transport into its own file
mrudnoru May 18, 2026
b64851d
refactor(sync): [DC-216839]: detect HTTP/2 framing errors via typed e…
mrudnoru May 18, 2026
b960dfb
refactor(sync): [DC-216839]: factor shared transport-clone helper
mrudnoru May 18, 2026
740df0d
feat(sync): [DC-216839]: remember HTTP/2 fallback decision per host
mrudnoru May 18, 2026
9b09416
docs(sync): [DC-216839]: document HTTP/2 transport with HTTP/1.1 fall…
mrudnoru May 18, 2026
61a0e0d
test(sync): add http2 fallback blackbox smoke test
mrudnoru May 18, 2026
517aa7a
fix(sync): [DC-216839]: detect typed HTTP/2 framing errors via Config…
mrudnoru May 21, 2026
d4cb051
docs(sync): [DC-216839]: restore upstream timeout-rationale comments
mrudnoru May 22, 2026
662b2d2
docs(sync): [DC-216839]: drop orphaned timeout comment from service.g…
mrudnoru May 22, 2026
91e2007
test(sync): fix golangci-lint issues in http2 fallback tests
mrudnoru May 25, 2026
3564832
fix(sync): close primary response and guard unrewindable HTTP/2 fallb…
mrudnoru May 29, 2026
0008ca0
test(sync): restore http2 fallback blackbox smoke test
mrudnoru May 29, 2026
b2fa7a4
test(sync): implement retry logic for manifest retrieval in TestTLS
rchincha Jun 5, 2026
f113f27
fix(sync): remove unused error variable in TestTLS function
rchincha Jun 5, 2026
03658d4
fix(sync): add missing newline in TestTLS function for readability
rchincha Jun 5, 2026
542d8c5
fix(sync): implement timeout for server readiness and index synchroni…
rchincha Jun 6, 2026
b30c98b
fix: rebase go.mod
rchincha Jun 6, 2026
8a52637
fix(sync): add missing newlines in TestConfigReloader and TestTLS for…
rchincha Jun 6, 2026
602bfdc
fix(sync): extend index deadline in TestTLS to 2 minutes for improved…
rchincha Jun 6, 2026
120c96d
fix(sync): enhance getCertificates function to read certificate files…
rchincha Jun 6, 2026
e7da7b5
fix(sync): implement TLS configuration for HTTP transport and enhance…
rchincha Jun 6, 2026
882fb5d
fix: linter errors
rchincha Jun 6, 2026
072bb58
fix(sync): enhance HTTP/2 fallback transport and improve test coverage
rchincha Jun 10, 2026
ca5e796
fix(sync): preserve cert-dir TLS in HTTP/2 fallback transport
rchincha Jun 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1314,6 +1314,18 @@ sync can also read the certificates directly under certDir:

Besides sync-auth.json file, zot also reads and uses docker credentials by default: https://docs.docker.com/reference/cli/docker/login/#description

### Sync's HTTP/2 transport with HTTP/1.1 fallback

Sync requests prefer HTTP/2 and fall back to HTTP/1.1 automatically when an upstream breaks HTTP/2 framing. Some upstream load balancers (notably Docker Hub's) occasionally send raw HTTP/2 SETTINGS frames on a connection that Go's `net/http` opened as HTTP/1.1, which surfaces as a `malformed HTTP response` error or as an HTTP/2 `INTERNAL_ERROR` / `PROTOCOL_ERROR`. The sync transport catches those framing errors at the RoundTrip layer and transparently retries the request over HTTP/1.1, so neither the regclient pipeline nor the on-demand caller sees the failure.

After a host triggers the fallback once, sync routes subsequent requests for the same host straight to HTTP/1.1 for 15 minutes; the host gets another HTTP/2 attempt once that window elapses. There is no configuration knob — the behavior is automatic and applies to every registry under `extensions.sync.registries[*].urls[*]`.

Operator upgrade notes:

- HTTPS upstreams already negotiated HTTP/2 before this change (the sync transport cloned `http.DefaultTransport`, which attempts HTTP/2 by default). The only behavioral change is the transparent HTTP/1.1 retry on framing errors. No config change is required.
- Healthy upstreams continue to serve sync over HTTP/2; only hosts that emit framing errors are downgraded, and only until the sticky window expires.
- The warning line `HTTP/2 framing error from upstream, retrying with HTTP/1.1` is logged on the first occurrence per host. Repeated downgrades within the sticky window are silent to avoid log spam.

## Search and CVE scanning (Trivy)

The `search` extension can include a `cve` section so zot downloads the [Trivy](https://github.com/aquasecurity/trivy) vulnerability database and exposes CVE data via the search API (for example GraphQL).
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ require (
github.com/zitadel/oidc/v3 v3.47.5
go.etcd.io/bbolt v1.4.3
golang.org/x/crypto v0.52.0
golang.org/x/net v0.55.0
golang.org/x/oauth2 v0.36.0
golang.org/x/sys v0.45.0
google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af
Expand Down Expand Up @@ -528,7 +529,6 @@ require (
go.yaml.in/yaml/v4 v4.0.0-rc.3 // indirect
golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f // indirect
golang.org/x/mod v0.36.0 // indirect
golang.org/x/net v0.55.0 // indirect
golang.org/x/sync v0.20.0 // indirect
golang.org/x/term v0.43.0 // indirect
golang.org/x/text v0.37.0 // indirect
Expand Down
350 changes: 350 additions & 0 deletions pkg/extensions/sync/http2_fallback.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,350 @@
//go:build sync

package sync

import (
cryptotls "crypto/tls"
"crypto/x509"
"errors"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"time"

"github.com/regclient/regclient"
"golang.org/x/net/http2"

syncconf "zotregistry.dev/zot/v2/pkg/extensions/config/sync"
"zotregistry.dev/zot/v2/pkg/log"
)

// http2FallbackStickyTTL is how long a host stays pinned to the HTTP/1.1 fallback after
// it has produced an HTTP/2 framing error. Picked to ride out a single LB rollout without
// permanently giving up HTTP/2 for a host that may recover.
const http2FallbackStickyTTL = 15 * time.Minute

// http2FallbackTransport tries HTTP/2 first and falls back to HTTP/1.1 on framing errors.
// Docker Hub's LB occasionally sends raw HTTP/2 SETTINGS frames on connections that Go's
// net/http opened as HTTP/1.1, causing "malformed HTTP response" errors. This transport
// catches those errors at the RoundTrip level and retries transparently with HTTP/1.1,
// so regclient never sees the failure and never enters its backoff cycle.
//
// Once a host has produced a framing error, the transport remembers that for stickyTTL
// and routes subsequent requests for that host straight to the fallback. After the TTL
// expires the host gets another HTTP/2 attempt, so a temporary upstream issue does not
// pin the host to HTTP/1.1 forever.
type http2FallbackTransport struct {
// primary is tried first for hosts that are not currently pinned to the fallback.
primary http.RoundTripper
// fallback serves pinned hosts and retries after a framing error on primary.
fallback http.RoundTripper
// log receives the warning emitted when a framing error is detected.
log log.Logger
// stickyTTL is how long a host stays pinned after a framing error.
stickyTTL time.Duration
// now returns the current time; it is a field rather than a direct time.Now call.
now func() time.Time
stickyHost sync.Map // host string -> time.Time when entry expires
}

// hostAwareTransport is an http.RoundTripper that sends each request through a transport
// dedicated to the request's URL host. A per-host transport is cloned from base on first
// use, gets the *.crt files found under <certDir>/<host> added to its root CAs, is
// optionally configured for HTTP/2, and is then cached for later requests to that host.
type hostAwareTransport struct {
// base is the template every per-host transport is cloned from.
base *http.Transport
// certDirs lists the directories searched for a <host> sub-directory of certificates.
certDirs []string
// configureHTTP2 makes each per-host transport go through http2.ConfigureTransport.
configureHTTP2 bool
// log receives the warning emitted when HTTP/2 configuration fails.
log log.Logger
transports sync.Map // host string -> *http.Transport
}

// RoundTrip implements http.RoundTripper.
//
// Requests for a host that is currently pinned to the fallback go straight to the
// fallback transport. Otherwise the primary transport is tried; when it fails with an
// error recognised by isHTTP2FramingError the host is pinned and the request is retried
// once on the fallback. Any other result of the primary is returned unchanged.
//
// A request whose body cannot be replayed (a real body without GetBody, or a GetBody that
// fails) is not retried: the primary error is returned, and the log line says so instead
// of claiming a retry that never happens.
func (t *http2FallbackTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	host := req.URL.Host
	if t.hostStuckOnFallback(host) {
		return t.fallback.RoundTrip(req)
	}

	resp, err := t.primary.RoundTrip(req)
	if err == nil || !isHTTP2FramingError(err) {
		return resp, err
	}

	// A RoundTripper should not return a response alongside an error; close it
	// defensively so a misbehaving primary cannot leak a connection.
	if resp != nil && resp.Body != nil {
		_ = resp.Body.Close()
	}

	// Pin the host even when this particular request cannot be replayed, so the next
	// request for it skips the broken HTTP/2 path.
	t.markHostStuck(host)

	retryReq, ok := replayableRequest(req)
	if !ok {
		t.log.Warn().Str("method", req.Method).Str("url", req.URL.String()).
			Err(err).Msg("HTTP/2 framing error from upstream, request body cannot be replayed, not retrying")

		return nil, err
	}

	t.log.Warn().Str("method", req.Method).Str("url", req.URL.String()).
		Err(err).Msg("HTTP/2 framing error from upstream, retrying with HTTP/1.1")

	return t.fallback.RoundTrip(retryReq)
}

// replayableRequest returns a request that can safely be sent again after a failed
// attempt, or false when the body cannot be rewound. Body-less requests are returned
// as-is; requests with a body are cloned around a fresh body obtained from GetBody,
// because the failed attempt may already have consumed the original one.
func replayableRequest(req *http.Request) (*http.Request, bool) {
	if req.Body == nil || req.Body == http.NoBody {
		return req, true
	}

	if req.GetBody == nil {
		return nil, false
	}

	body, err := req.GetBody()
	if err != nil {
		return nil, false
	}

	retryReq := req.Clone(req.Context())
	retryReq.Body = body
	retryReq.GetBody = req.GetBody

	return retryReq, true
}

// hostStuckOnFallback reports whether host is currently pinned to the HTTP/1.1 fallback.
// It returns false when the host has no entry, when the stored value is not a time.Time,
// or when the entry has expired; an expired entry is removed on the way out.
func (t *http2FallbackTransport) hostStuckOnFallback(host string) bool {
	raw, ok := t.stickyHost.Load(host)
	if !ok {
		return false
	}

	expiresAt, ok := raw.(time.Time)
	if !ok {
		return false
	}

	if t.now().Before(expiresAt) {
		return true
	}

	// Remove only the entry that was just inspected. A concurrent markHostStuck may have
	// stored a fresh expiry between the Load above and this point; an unconditional
	// Delete would drop that new entry and send the host back to HTTP/2 too early.
	t.stickyHost.CompareAndDelete(host, raw)

	return false
}

// markHostStuck pins host to the fallback transport until stickyTTL from now has passed,
// replacing any expiry previously recorded for that host.
func (t *http2FallbackTransport) markHostStuck(host string) {
	expiresAt := t.now().Add(t.stickyTTL)
	t.stickyHost.Store(host, expiresAt)
}

func isHTTP2FramingError(err error) bool {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This matches all StreamError / GoAwayError codes, not just protocol/framing mismatches (e.g. REFUSED_STREAM, ENHANCE_YOUR_CALM). For Docker Hub that’s probably fine, but it could mask genuine HTTP/2 issues on other registries by silently downgrading to HTTP/1.1.

I suggest this is refactored in 3 separate functions

func shouldRetryOnHTTP11Fallback(err error) bool {
    return isHTTP2SessionError(err) || isHTTP1HTTP2Mismatch(err)
}

func isHTTP2SessionError(err error) bool { /* StreamError, GoAwayError */ }

func isHTTP1HTTP2Mismatch(err error) bool {
    return strings.Contains(err.Error(), "malformed HTTP response")
}

var streamErr http2.StreamError
if errors.As(err, &streamErr) {
return true
}

var goAwayErr *http2.GoAwayError
if errors.As(err, &goAwayErr) {
return true
}

// The "malformed HTTP response" case is produced by net/http when an HTTP/1.1 connection
// receives raw HTTP/2 SETTINGS frames. Go's stdlib does not expose a typed error for this
// path (see https://github.com/golang/go/issues/40926), so we keep a substring match.
return strings.Contains(err.Error(), "malformed HTTP response")
}

// RoundTrip implements http.RoundTripper by delegating to the transport dedicated to the
// request's URL host.
func (t *hostAwareTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	perHost := t.transportForHost(req.URL.Host)

	return perHost.RoundTrip(req)
}

// transportForHost returns the transport dedicated to host, building and caching it on
// first use. A new transport is a clone of base with the host's cert-dir certificates
// applied and, when configureHTTP2 is set, HTTP/2 configured through x/net/http2.
//
// When two goroutines build a transport for the same host at once, LoadOrStore keeps a
// single one and both callers get that stored instance.
func (t *hostAwareTransport) transportForHost(host string) *http.Transport {
	if existing, found := t.transports.Load(host); found {
		if known, isTransport := existing.(*http.Transport); isTransport {
			return known
		}
	}

	fresh := t.base.Clone()
	configureHostCertDirTLS(fresh, host, t.certDirs)

	if t.configureHTTP2 {
		// A configuration failure is not fatal: the transport stays usable, only the
		// typed-error detection may be unavailable.
		err := http2.ConfigureTransport(fresh)
		if err != nil {
			t.log.Warn().Err(err).
				Msg("failed to configure http2 on sync transport, framing-error fallback may be limited to substring detection")
		}
	}

	stored, _ := t.transports.LoadOrStore(host, fresh)

	winner, isTransport := stored.(*http.Transport)
	if !isTransport {
		return fresh
	}

	return winner
}

func cloneOrCreateTLSConfig(transport *http.Transport) *cryptotls.Config {
if transport.TLSClientConfig != nil {
return transport.TLSClientConfig.Clone()
}

return &cryptotls.Config{}
}

func appendCertsToPool(pool *x509.CertPool, certsPEM []byte) (*x509.CertPool, bool) {
if pool == nil {
var err error
pool, err = x509.SystemCertPool()
if err != nil || pool == nil {
pool = x509.NewCertPool()
}
} else {
pool = pool.Clone()
}

if !pool.AppendCertsFromPEM(certsPEM) {
return nil, false
}

return pool, true
}

func configureHostCertDirTLS(transport *http.Transport, host string, certDirs []string) {
if len(certDirs) == 0 {
return
}

tlsConfig := cloneOrCreateTLSConfig(transport)
rootPool := tlsConfig.RootCAs
updated := false

for _, certDir := range certDirs {
if certDir == "" {
continue
}

hostDir := filepath.Join(certDir, host)
files, err := os.ReadDir(hostDir)

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.
if err != nil {
continue
}

for _, file := range files {
if file.IsDir() || !strings.HasSuffix(file.Name(), ".crt") {
continue
}

certPEM, err := os.ReadFile(filepath.Join(hostDir, file.Name())) //nolint:gosec // hostDir is derived from configured cert directories

Check failure on line 218 in pkg/extensions/sync/http2_fallback.go

View workflow job for this annotation

GitHub Actions / lint

The line is 137 characters long, which exceeds the maximum of 120 characters. (lll)

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.
if err != nil {
continue
}

pool, ok := appendCertsToPool(rootPool, certPEM)
if !ok {
continue
}

rootPool = pool
updated = true
}
}

if !updated {
return
}

tlsConfig.RootCAs = rootPool
transport.TLSClientConfig = tlsConfig
}

// clonedTransport returns a clone of http.DefaultTransport with the registry's
// ResponseHeaderTimeout and TLS options applied.
//
// Cloning DefaultTransport keeps its proxy/TLS settings and existing timeouts
// (DialContext: 30s, TLSHandshakeTimeout: 10s). regclient uses DefaultTransport
// internally if no custom transport is provided, so this ensures compatibility.
// See https://blog.cloudflare.com/the-complete-guide-to-golang-net-http-timeouts/
func clonedTransport(opts syncconf.RegistryConfig) *http.Transport {
	defaultTransport := http.DefaultTransport.(*http.Transport) //nolint: forcetypeassert
	transport := defaultTransport.Clone()

	// ResponseHeaderTimeout prevents hanging when the server accepts the connection but
	// never sends headers. It is set programmatically in root.go and covers only the wait
	// for response headers after the request is sent: it does NOT include DialContext (30s)
	// or TLSHandshakeTimeout (10s), and it does not cover body transfer time, which is
	// expected to be slow for large images.
	transport.ResponseHeaderTimeout = opts.ResponseHeaderTimeout

	configureTransportTLS(transport, opts)

	return transport
}

// configureTransportTLS applies the registry's TLS options to transport: it disables
// certificate verification when TLSVerify is explicitly false, and, when CertDir is set,
// adds the registry CA and the client key pair returned by getCertificates.
//
// All changes are made on a clone of the transport's TLS config, and the clone is
// installed only when at least one option actually changed it. Certificate problems are
// not reported here; the transport is simply left without the affected setting.
func configureTransportTLS(transport *http.Transport, opts syncconf.RegistryConfig) {
	tlsConfig := cloneOrCreateTLSConfig(transport)
	modified := false

	// install publishes the working copy, but only if something was changed.
	install := func() {
		if modified {
			transport.TLSClientConfig = tlsConfig
		}
	}

	if skipVerify := opts.TLSVerify != nil && !*opts.TLSVerify; skipVerify {
		tlsConfig.InsecureSkipVerify = true //nolint:gosec // this is an explicit sync configuration option
		modified = true
	}

	if opts.CertDir == "" {
		install()

		return
	}

	clientCert, clientKey, regCert, err := getCertificates(opts.CertDir)
	if err != nil {
		// Keep the transport usable; the sync path will surface the failure if
		// the cert files are actually required.
		install()

		return
	}

	if regCert != "" {
		if pool, ok := appendCertsToPool(tlsConfig.RootCAs, []byte(regCert)); ok {
			tlsConfig.RootCAs = pool
			modified = true
		}
	}

	if clientCert != "" && clientKey != "" {
		if keyPair, pairErr := cryptotls.X509KeyPair([]byte(clientCert), []byte(clientKey)); pairErr == nil {
			tlsConfig.Certificates = []cryptotls.Certificate{keyPair}
			modified = true
		}
	}

	install()
}

// buildTransportCertDirs lists the directories searched for per-host certificates:
// regclient's Docker cert directory first, followed by the registry's CertDir when one
// is configured.
func buildTransportCertDirs(opts syncconf.RegistryConfig) []string {
	if opts.CertDir == "" {
		return []string{regclient.DockerCertDir}
	}

	return []string{regclient.DockerCertDir, opts.CertDir}
}

// newHTTP2FallbackTransport builds a RoundTripper that prefers HTTP/2 for upstream sync
// and falls back to HTTP/1.1 on the errors recognised by isHTTP2FramingError.
// Both sides are built from the same registry options and cert directories; the fallback
// differs only in that HTTP/2 negotiation is switched off, so an upstream that breaks
// HTTP/2 can still be reached.
func newHTTP2FallbackTransport(opts syncconf.RegistryConfig, logger log.Logger) http.RoundTripper {
	certDirs := buildTransportCertDirs(opts)

	http2Base := clonedTransport(opts)

	// A non-nil, empty TLSNextProto map together with ForceAttemptHTTP2=false keeps the
	// fallback on HTTP/1.1.
	http1Base := clonedTransport(opts)
	http1Base.ForceAttemptHTTP2 = false
	http1Base.TLSNextProto = map[string]func(string, *cryptotls.Conn) http.RoundTripper{}

	return &http2FallbackTransport{
		primary: &hostAwareTransport{
			base:           http2Base,
			certDirs:       certDirs,
			configureHTTP2: true,
			log:            logger,
		},
		fallback: &hostAwareTransport{
			base:     http1Base,
			certDirs: certDirs,
			log:      logger,
		},
		log:       logger,
		stickyTTL: http2FallbackStickyTTL,
		now:       time.Now,
	}
}
Loading
Loading