# config/config.yaml — go-llm-interactive-proxy (matdev83/go-llm-interactive-proxy, main branch)

# HTTP listener for the proxy: bind address plus optional body-size, timeout, and keep-alive tuning.
server:
  # Default bind is loopback-only so single-user mode never listens on all interfaces by accident.
  address: "127.0.0.1:8080"
  # Legacy server.auth_mode (no_auth | external) still merges into effective auth for audit; prefer access + auth below.
  # max_request_body_bytes: 10485760  # optional override (this example is 10 MiB); omit for handler default (8 MiB)
  # Optional [http.Server] timeouts (Go duration strings). Omitted fields use stdhttp defaults:
  # read_header_timeout: 10s
  # read_timeout: 30s
  # write_timeout: 120s
  # idle_timeout: 120s
  # max_pending_wire_events: 0   # cap backend adapter pending-event queues per stream; 0 = unlimited
  # Optional standards-compliant hold-alive for streaming clients while pre-request admission runs.
  # Emits HTTP 102 Processing informational responses only; final response status/body stay protocol-native.
  # All three lines below must be uncommented together; pre_request_keepalive is a nested mapping.
  # pre_request_keepalive:
  #   enabled: false
  #   interval: 15s

# --- Access and authentication (see internal/core/config/access_auth_model.go) ---
# Omitted access.mode defaults to single_user. Omitted server.address defaults to explicit loopback (127.0.0.1:8080).
# single_user allows only loopback binds; all-interfaces (0.0.0.0, ::, or bare :port) requires multi_user plus strong auth.
#
# (A) Default OSS single-user (effective when access/auth are omitted): loopback + local_noop + structured auth/session events.
# access:
#   mode: single_user
# auth:
#   handler: local_noop
#   required_level: none
#   event_failure_policy: best_effort
#   # Auth/session audit events may include PII (principal display name, ids, client refs). Route logs to
#   # retention-compliant storage; the proxy does not redact those fields beyond challenge-summary sanitization.
#   # event_delivery: default   # structured log sink (default) | disabled | custom (custom needs BuildOptions.AuthEventSink)
#   # event_failure_policy values — fail_closed (see (B) below): deny the request if the auth-event sink errors;
#   # best_effort: ignore sink errors.
#
# (B) Multi-user + local API keys (shared host); listen on loopback or non-loopback, but only with server.auth_mode: external.
# access:
#   mode: multi_user
# server:
#   address: "0.0.0.0:8080"
#   auth_mode: external
# auth:
#   handler: local_api_key
#   required_level: api_key
#   event_failure_policy: fail_closed
#   local_api_keys:
#     - key_id: example-device
#       principal_id: example-user
#       # Each key must be >= 16 characters (Unicode code points). For multi_user on a reachable bind,
#       # add perimeter rate limiting or WAF brute-force protection; the proxy does not throttle auth attempts.
#       key: "replace-with-operator-secret-not-committed"
#
# (C) Remote auth (interface-only in OSS): select handler remote; standard binary does not create remote transport clients.
# You must inject RemoteDecider at the composition root (see internal/infra/runtimebundle BuildOptions).
# access:
#   mode: multi_user
# server:
#   address: "127.0.0.1:8080"
#   auth_mode: external
# auth:
#   handler: remote
#   required_level: api_key_sso
#   remote:
#     endpoint: "https://auth.example.invalid/v1"
#     handler: enterprise

# Outbound LLM HTTP client: when trust_environment_proxy is false, HTTP_PROXY/HTTPS_PROXY are ignored.
# Optional pool/timeouts (defaults match internal/infra/httpclient.DefaultTransportTune):
# http_client:
#   trust_environment_proxy: false
#   max_idle_conns: 100
#   max_idle_conns_per_host: 64
#   idle_conn_timeout: 90s
#   response_header_timeout: 60s
#   client_timeout: 120s

# Optional connection pool for managed PostgreSQL (store: postgres for continuity and/or secure_session).
# Omitted or zero values leave driver defaults. Durations are Go duration strings (e.g. 30m, 90s).
# database:
#   max_open_conns: 8
#   max_idle_conns: 2
#   conn_max_lifetime: 30m
#   conn_max_idle_time: 2m

# Logging output settings for the proxy process.
logging:
  level: info
  format: json
  # NOTE(review): presumably adds the caller's source location to each record when true — confirm against the logger setup.
  add_source: false
  # NOTE(review): presumably toggles per-request access logging — confirm.
  access_log: false
  # Optional list of request paths to leave out of the access log; the example matches diagnostics.health_path.
  # access_log_skip_paths:
  #   - "/healthz"

# Operator diagnostics endpoints. Other sections of this file (observability metrics path, model_catalog
# diagnostics_path, pprof_path) state that their paths must not overlap the paths configured here.
diagnostics:
  enabled: true
  health_path: "/healthz"
  attempts_path: "/admin/attempts"
  inventory_path: "/debug/inventory"
  route_trace_path: "/debug/route_trace"
  # Optional: require header X-LIP-Diagnostics-Secret (min 12 chars) on attempts/inventory/route_trace/pprof.
  # The health path is not in that list. While shared_secret stays commented out, no secret is configured here,
  # so keep the listener on loopback or put authentication in front of these routes.
  # shared_secret: "change-me-please-12+"
  # Optional: net/http/pprof (only when diagnostics.enabled). Use localhost binding or auth in front; paths must not overlap.
  # pprof_path: "/debug/pprof"

# Prometheus / OpenTelemetry (optional). Metrics path must not overlap diagnostic routes above.
# observability:
#   metrics:
#     enabled: false
#     path: "/metrics"
#   tracing:
#     enabled: false
#     # service_name: "lipstd"

# Request routing: attempt budget, affinity behaviour, and the default route selector.
routing:
  # NOTE(review): presumably the maximum number of backend attempts per request across failover branches — confirm.
  max_attempts: 3
  affinity:
    store: memory
    # Explicit route affinity ({affinity=session} / {affinity=client}) requires an identity by default.
    # Use ignore only for local/single-user experiments where missing identity should disable stickiness.
    missing_identity: fail_closed
  # Route selectors support failover (`|`), weighted branches (`^`), query defaults, and TTFT budgets.
  # TTFT budget values are integer seconds. A leading global block caps the whole client-side A-leg wait
  # for first output across all pre-output attempts, while per-leaf annotations cap one backend attempt:
  #   "{ttft_timeout=60}[ttft_timeout=30]openai-responses:gpt-4o-mini^[ttft_timeout=20]gemini:gemini-2.0-flash"
  # Route-wide affinity lives in the same leading global block:
  #   "{affinity=session}[weight=1]openai-a:gpt-4o-mini^[weight=1]openai-b:gpt-4o-mini"
  # Aliases are also accepted: {session_sticky} and {client_sticky}.
  # The selector below has the form <backend id>:<model>. The openai-responses backend is listed with
  # enabled: false under plugins.backends in this file, so enable it (or point default_route at a backend
  # you have enabled) before relying on the default route.
  default_route: "openai-responses:gpt-4o-mini"

# Optional regexp rewrites of the full route selector string (incoming X-LIP-Route or default_route).
# First matching rule wins; the replacement template must parse as a valid selector (see routing.Parse). After a
# match, regexp capture expansion runs; the rewritten string is parsed again at request/default-route time, so a
# capture can still yield an invalid selector even when the template alone was valid. Not recursive.
# model_aliases:
#   - pattern: '^gpt-4$'
#     replacement: 'openai-responses:gpt-4o-mini'

# Hook bus tuning (see docs/capability-catalogs.md and internal/plugins/features/REFERENCE_PLUGINS.md).
hooks:
  # NOTE(review): fail_open presumably lets the request proceed when a tool-reactor hook errors — confirm in the
  # documents referenced above before changing it.
  tool_reactor_error_policy: fail_open

# Optional models.dev snapshot catalog (disabled by default). Operator guide: docs/capability-catalogs.md
# model_catalog:
#   enabled: false
#   external_updates_enabled: false
#   # Local JSON snapshot path (required when enabled or when external_updates_enabled needs a cache).
#   cache_path: ./data/model_catalog.json
#   # Background refresh (only when external_updates_enabled: true):
#   # update_interval: 1h
#   # source_url: "https://example.com/models.json"   # HTTPS recommended; see doc "Trust and exposure"
#   # fetch_timeout: 30s   # optional extra bound when fetch context has no deadline (defense in depth)
#   # diagnostics_path: /debug/model_catalog   # must start with /; must not overlap diagnostics.* paths
#   # model_overrides:
#   #   - model: "gpt-4o"
#   #     tools: true
#   #     context_limit_tokens: 128000
#   # backend_model_overrides:
#   #   - backend: "openai-responses"
#   #     model: "gpt-4o-mini"
#   #     reasoning: false

# Backend model inventory cache and refresh for routing-time vendor/model lookups.
# This is separate from model_catalog: it records which configured backend instances expose which models.
# model_inventory:
#   cache_path: ./data/backend_model_inventory.json
#   refresh_enabled: true
#   refresh_interval: 1h  # minimum 1h; failed refresh keeps the last successful inventory
#   fetch_timeout: 30s   # per-backend startup/refresh inventory fetch timeout

# Continuity store selection. The active settings use the in-memory store; durable alternatives are shown
# commented out below.
continuity:
  # NOTE(review): in_memory: true and store: memory both appear to select the in-memory store — confirm whether
  # in_memory must also be changed when switching store to sqlite or postgres.
  in_memory: true
  store: memory
  # For local durable continuity (file SQLite, existing adapter):
  # store: sqlite
  # sqlite_path: ./data/continuity.db
  # For managed PostgreSQL (Bun-backed store; set postgres_dsn, optional top-level database pool):
  # The example DSN is a placeholder: it embeds credentials and uses sslmode=disable (no TLS). Supply the real
  # DSN from a secret source and pick an sslmode that enforces TLS for non-local databases.
  # store: postgres
  # postgres_dsn: "postgres://user:pass@host:5432/continuity?sslmode=disable"

# Token accounting records provider-billable and client-visible usage through runtime preflight,
# stream reconstruction, ledger writes, admin dry-run counting, and metrics-safe observations.
# accounting:
#   enabled: true
#   mode: provider_first          # provider_first | local_only | provider_required
#   count_timeout: 750ms          # bounds provider/local CountCall and stream reconstruction counts
#   tokenizer:
#     default_encoding: cl100k_base
#     model_mappings:
#       gpt-4o-mini: o200k_base
#   preflight:
#     mode: advisory              # advisory | required
#     max_context_tokens: 128000
#     clamp_max_output_tokens: true
#   ledger:
#     store: sqlite               # memory | sqlite | postgres
#     sqlite_path: ./data/token-accounting.db
#     # store: postgres
#     # postgres_dsn: "postgres://user:pass@host:5432/token_accounting?sslmode=disable"
#     write_policy: required      # required fail-closes; best_effort observes/logs and continues
#   admin:
#     enabled: true
#     path: "/admin/token-count"
#     max_body_bytes: 1048576
#   observability:
#     enabled: true

# Secure sessions (proxy-owned session id + resume proofs) are always on; omit secure_session to use defaults
# (memory store, loopback-friendly). With store: memory, session evidence is non-durable (lost on restart) and
# token_fingerprint_key may be omitted (an ephemeral process-local key is used). For durable evidence, switch to
# store: sqlite or store: postgres as described below. Continuity keys and provider conversation ids are not proof of
# session ownership; only proxy-issued resume material authorizes a turn.
# Durable local mode (store: sqlite) needs sqlite_path and token_fingerprint_key (>=32 chars).
# Managed durable (store: postgres) needs postgres_dsn and the same long token_fingerprint_key.
# Durable audit (audit_durability: durable) requires a durable store (sqlite or postgres), not memory.
# secure_session:
#   store: memory
#   # sqlite_path: ./data/secure_session.db
#   # postgres_dsn: "postgres://user:pass@host:5432/secure_session?sslmode=disable"  # when store: postgres
#   resume_window: 168h
#   token_fingerprint_key: "replace-with-32+byte-secret----------------"
#   audit_durability: best_effort   # or durable (requires store: sqlite or postgres, plus key; sqlite also needs sqlite_path)
#   redaction_default: standard
#   non_durable_warning: log      # silent | log | strict (memory store: operator log when "log" at startup)
#   diagnostics_expose_summaries: false
#   diagnostics_path_prefix: "/debug/sessions"
#   # When diagnostics_expose_summaries is true, set diagnostics.shared_secret (>=12 chars) so operator routes are not world-readable.
#   require_workspace_id: false   # when true, reject turns with no resolved workspace id (secure-session BeginTurn)
#   workspace_resolve_on_error: fail_open   # fail_closed: workspace resolver errors deny the request (Req 11.6)
#   resume_token_bind_principal_only: false # when true, resume fingerprints use only principal id (stable across client hint drift)
#   # Optional: process-local TTL cache for durable SQL stores (sqlite/postgres) reducing repeated sessionExists /
#   # transcript_enabled reads. Empty sql_query_cache_ttl disables caching (default). Short TTL trades fewer DB round
#   # trips for brief staleness if transcript policy or session rows are mutated out-of-band; future admin mutation
#   # APIs must invalidate this cache when added.
#   # sql_query_cache_ttl: 30s
#   # sql_query_cache_max_entries: 4096   # optional cap per logical cache; omit or zero uses 4096 when ttl is set

# Plugin registry. Each list entry carries an id, an enabled flag, and a config mapping ({} = no settings).
plugins:
  # NOTE(review): frontends are presumably the client-facing API protocols the proxy serves — confirm.
  frontends:
    - id: openai-responses
      enabled: true
      config: {}
    - id: openai-legacy
      enabled: true
      config: {}
    - id: anthropic
      enabled: true
      config: {}
    - id: gemini
      enabled: true
      config: {}
  # Every backend below ships with enabled: false; enable the ones you use. routing.default_route in this file
  # points at openai-responses.
  # NOTE(review): the commented option hints under each backend sit at the same level as config:, whereas the
  # custom backend examples further down nest keys such as base_url under config:. Presumably the hints belong
  # inside config (replace {} with a block mapping) — confirm against the config loader.
  backends:
    - id: openai-responses
      enabled: false
      config: {}
    - id: openai-legacy
      enabled: false
      config: {}
    - id: anthropic
      enabled: false
      config: {}
    - id: gemini
      enabled: false
      config: {}
    - id: bedrock
      enabled: false
      config: {}
    - id: acp
      enabled: false
      config: {}
    - id: openrouter
      enabled: false
      config: {}
      # base_url: https://openrouter.ai/api/v1  # default
      # api_key: ""          # or use OPENROUTER_API_KEY / OPENROUTER_API_KEY_N env vars
      # static_referer: ""   # optional static HTTP-Referer for attribution
      # static_title: ""     # optional static X-Title for app name
    - id: nvidia
      enabled: false
      config: {}
      # base_url: https://integrate.api.nvidia.com/v1  # default
      # api_key: ""          # or use NVIDIA_API_KEY / NVIDIA_API_KEY_N env vars
    - id: huggingface
      enabled: false
      config: {}
      # base_url: https://router.huggingface.co/v1  # default
      # api_key: ""          # or use HUGGINGFACE_API_KEY / HUGGINGFACE_API_KEY_N env vars
    - id: opencode-go
      enabled: false
      config: {}
      # base_url: https://opencode.ai/zen/go/v1  # default
      # api_key: ""          # or use OPENCODE_GO_API_KEY / OPENCODE_GO_API_KEY_N env vars
    - id: opencode-zen
      enabled: false
      config: {}
      # base_url: https://opencode.ai/zen/v1  # default
      # api_key: ""          # or use OPENCODE_API_KEY / OPENCODE_ZEN_API_KEY (+ _N variants) env vars
    - id: openai-codex
      enabled: false
      config: {}
      # base_url: https://chatgpt.com/backend-api/codex  # default
      # access_token: ""     # or api_key for operator consistency
      # auth_json_path: ""   # optional explicit Codex CLI auth.json; default discovers ~/.codex/auth.json
      # account_id: ""       # optional ChatGPT account id header
      # refresh_token: ""    # optional OAuth refresh token
      # default_reasoning_effort: ""  # e.g. medium, high
      # managed_oauth_enabled: false
      # managed_oauth_storage_path: var/openai_codex_oauth_accounts
      # managed_oauth_selection_strategy: first-available  # first-available | round-robin | session-affinity
      # managed_oauth_allow_auth_json_fallback: true
      # transport: https  # default; websocket/auto require experimental_websocket: true
      # experimental_websocket: false
      # gpt55_downgrade_disabled: false
      # Credentials may instead come from the OPENAI_CODEX_ACCESS_TOKEN / OPENAI_CODEX_API_KEY (+ _N variants) env vars.
    - id: ollama
      enabled: false
      config: {}
      # base_url: http://localhost:11434/v1  # default local Ollama OpenAI-compatible root
      # responses_api: auto  # auto | enabled | disabled (auto probes /api/version for >= 0.13.3)
      # api_key: ""          # optional reverse-proxy auth; default installs use dummy credential
      # discovery:
      #   enabled: true
      #   local_models: true
      #   capabilities: true
      #   timeout: 15s
    - id: ollama-cloud
      enabled: false
      config: {}
      # base_url: http://localhost:11434/v1  # local Ollama app proxying cloud models
      # responses_api: auto
      # discovery:
      #   enabled: true
      #   cloud_models: true
      #   capabilities: true
      #   cloud_models_url: https://ollama.com/api/tags
      #   timeout: 15s
    - id: llamacpp
      enabled: false
      config: {}
      # The default below uses port 8080, which is also the port of the default server.address in this file
      # (127.0.0.1:8080). When llama.cpp and the proxy share a host, move one of them to another port.
      # base_url: http://localhost:8080/v1  # default local llama.cpp OpenAI-compatible root
      # api_key: ""          # optional reverse-proxy auth; default installs use dummy credential
      # discovery:
      #   catalog: true
      #   catalog_url: https://models.dev/api.json
      #   timeout: 15s
    - id: lmstudio
      enabled: false
      config: {}
      # base_url: http://localhost:1234/v1  # default local LM Studio OpenAI-compatible root
      # api_key: ""          # optional reverse-proxy auth; default installs use dummy credential
      # discovery:
      #   catalog: true
      #   catalog_url: https://models.dev/api.json
      #   timeout: 15s
    - id: vllm
      enabled: false
      config: {}
      # base_url: http://localhost:8000/v1  # default local vLLM OpenAI-compatible root
      # api_key: ""          # optional reverse-proxy auth; default installs use dummy credential
      # discovery:
      #   catalog: true
      #   catalog_url: https://models.dev/api.json
      #   timeout: 15s
    # Custom compatible backends let operators add API-compatible providers without code.
    # Use kind to select the generic factory; id remains the runtime route backend instance.
    # backend_prefix must be unique, cannot contain / or :, and cannot use standard connector prefixes such as nvidia/openrouter/huggingface/anthropic.
    # api_key_env_var_root reads ROOT, ROOT_2, ROOT_3, ... using the standard static key convention.
    # - id: provider123
    #   kind: custom-openai-legacy-compatible
    #   enabled: false
    #   config:
    #     backend_prefix: provider123
    #     base_url: https://api.provider123.example/v1
    #     api_key_env_var_root: PROVIDER123_API_KEY
    # - id: provider123-responses
    #   kind: custom-openai-responses-compatible
    #   enabled: false
    #   config:
    #     backend_prefix: provider123-responses
    #     base_url: https://api.provider123.example/v1
    #     api_key_env_var_root: PROVIDER123_RESPONSES_API_KEY
    # - id: provider-anthropic
    #   kind: custom-anthropic-compatible
    #   enabled: false
    #   config:
    #     backend_prefix: provider-anthropic
    #     base_url: https://api.provider-anthropic.example
    #     api_key_env_var_root: PROVIDER_ANTHROPIC_API_KEY
  # Feature plugins. The three enabled entries are the *-noop variants; non-noop reference examples follow, commented out.
  features:
    - id: submit-noop
      enabled: true
      config: {}
    - id: parts-noop
      enabled: true
      config: {}
    - id: tool-reactor-noop
      enabled: true
      config: {}
    # Reference (non-noop) examples — enable as needed:
    # - id: ref-submit-annotate
    #   enabled: true
    #   config: { marker: "staging" }
    # - id: ref-request-suffix
    #   enabled: true
    #   config: { suffix: " [ref]" }
    # - id: ref-tool-prefix
    #   enabled: true
    #   config: { prefix: ">>" }