theopenco · steebchen · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · chatgpt-codex-connector
diff --git a/apps/docs/content/features/reasoning.mdx b/apps/docs/content/features/reasoning.mdx
@@ -16,7 +16,7 @@ You can find all reasoning-enabled models on our [models page with reasoning fil
 
 - OpenAI's GPT-5 series (e.g., `gpt-5`, `gpt-5-mini`)
   - Note: GPT-5 models use reasoning but currently do not return the reasoning content in the response.
-- Anthropic's Claude 3.7 Sonnet
+- Anthropic's Claude Sonnet and Claude Opus models
 - Google's Gemini 2.0 Flash Thinking and Gemini 2.5 Pro
 - GPT OSS models such as `gpt-oss-120b` and `gpt-oss-20b`
 - Z.AI's reasoning models
@@ -162,12 +162,14 @@ curl -X POST "https://api.llmgateway.io/v1/chat/completions" \
 
 ### Supported Models
 
-The `reasoning.max_tokens` parameter is supported by:
+`reasoning.max_tokens` is supported by Anthropic Claude and Google Gemini thinking models. To see exactly which models support reasoning, filter the [models page by the reasoning capability](https://llmgateway.io/models?filters=1&reasoning=true). When using auto-routing or root models with `reasoning.max_tokens`, only providers that support this feature will be considered.
 
-- **Anthropic Claude**: Claude 3.7 Sonnet, Claude Sonnet 4, Claude Opus 4, Claude Opus 4.5
-- **Google Gemini**: Gemini 2.5 Pro, Gemini 2.5 Flash, Gemini 3 Pro Preview
-
-When using auto-routing or root models with `reasoning.max_tokens`, only providers that support this feature will be considered.
+<Callout type="warning">
+	Anthropic's newer Claude Opus models use [adaptive
+	thinking](#adaptive-thinking-claude-opus) and do **not** honor an exact
+	`reasoning.max_tokens` budget — the value is mapped onto an effort level
+	instead. Prefer `reasoning_effort` / `reasoning.effort` for those models.
+</Callout>
 
 ### Provider-Specific Constraints
 
@@ -188,6 +190,31 @@ If you specify `reasoning.max_tokens` for a model that doesn't support it, you'l
 }
 ```
 
+## Adaptive Thinking (Claude Opus)
+
+Anthropic's newer Claude Opus models — **Claude Opus 4.6 and later** — use **adaptive thinking** instead of a fixed reasoning budget. With adaptive thinking, the model itself decides how much to reason based on the difficulty of each prompt. This changes how the reasoning parameters behave:
+
+- **No exact reasoning-token budget.** These models do not accept an explicit thinking budget, so `reasoning.max_tokens` is **not** enforced as a hard limit. For backward compatibility the gateway still accepts the parameter and maps the requested budget onto an effort level (below 2,000 tokens → `low`, below 8,000 → `medium`, below 24,000 → `high`, otherwise `xhigh`) so it still influences depth — but the exact token count is not guaranteed. Prefer `reasoning_effort` / `reasoning.effort` for these models.
+- **Effort is a hint, not a fixed depth.** Even when you specify a `reasoning_effort` such as `high`, the model can dynamically decide to reason only briefly — or skip visible reasoning entirely — for simple prompts and begin its answer almost immediately, while thinking more on harder ones. This is expected adaptive behavior, not reasoning being disabled.
+
+Control the depth with `reasoning_effort` or `reasoning.effort` (`low`, `medium`, `high`, `xhigh`, `max`; `minimal` is treated as `low`); the gateway translates this into Anthropic's adaptive `output_config.effort`.
+
+```bash
+curl -X POST "https://api.llmgateway.io/v1/chat/completions" \
+  -H "Authorization: Bearer $LLM_GATEWAY_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "claude-opus-4-6",
+    "messages": [
+      {
+        "role": "user",
+        "content": "Explain the P vs NP problem and why it matters."
+      }
+    ],
+    "reasoning_effort": "high"
+  }'
+```
+
 ## Streaming Reasoning Content
 
 When streaming is enabled, reasoning content will be streamed as part of the response chunks:

diff --git a/packages/actions/src/prepare-request-body.adaptive.spec.ts b/packages/actions/src/prepare-request-body.adaptive.spec.ts
@@ -87,4 +87,41 @@ describe("prepareRequestBody - adaptive thinking (Opus 4.6/4.7/4.8)", () => {
 			display: "summarized",
 		});
 	});
+
+	// A bare reasoning.max_tokens budget can't be enforced on adaptive models
+	// (they reject budget_tokens), so it is bucketed into an effort level instead
+	// of being dropped: <2000 -> low, <8000 -> medium, <24000 -> high, else xhigh.
+	for (const model of [
+		"claude-opus-4-6",
+		"claude-opus-4-7",
+		"claude-opus-4-8",
+	]) {
+		test.each([
+			[1024, "low"],
+			[4096, "medium"],
+			[8000, "high"],
+			[16000, "high"],
+			[32000, "xhigh"],
+		] as const)(
+			`${model} buckets reasoning.max_tokens=%i to effort %s`,
+			async (maxTokens, expected) => {
+				const body = await buildAnthropicBody(model, {
+					reasoning_max_tokens: maxTokens,
+				});
+				expect(body.thinking).toEqual({
+					type: "adaptive",
+					display: "summarized",
+				});
+				expect(body.output_config?.effort).toBe(expected);
+			},
+		);
+	}
+
+	test("explicit reasoning_effort wins over a max_tokens budget", async () => {
+		const body = await buildAnthropicBody("claude-opus-4-6", {
+			reasoning_effort: "low",
+			reasoning_max_tokens: 32000,
+		});
+		expect(body.output_config?.effort).toBe("low");
+	});
 });
diff --git a/packages/actions/src/prepare-request-body.ts b/packages/actions/src/prepare-request-body.ts
@@ -26,6 +26,63 @@ import { transformGoogleMessages } from "./transform-google-messages.js";
 
 type OpenAIImageQuality = "low" | "medium" | "high" | "auto";
 
+type AdaptiveEffort = "low" | "medium" | "high" | "xhigh" | "max";
+
+/**
+ * Resolve `output_config.effort` for adaptive-thinking Anthropic models
+ * (Opus 4.6+). Precedence: explicit `effort` (returned unchanged), then
+ * `reasoning_effort` mapped onto the adaptive scale, then `reasoning_max_tokens`
+ * bucketed as <2000 low, <8000 medium, <24000 high, else xhigh. Adaptive models
+ * reject `budget_tokens`, so a budget becomes a depth hint, not a hard limit.
+ * Returns undefined when none of the three controls is set.
+ */
+function resolveAdaptiveEffort(
+	effort: "low" | "medium" | "high" | undefined,
+	reasoning_effort:
+		| "none"
+		| "minimal"
+		| "low"
+		| "medium"
+		| "high"
+		| "xhigh"
+		| "max"
+		| undefined,
+	reasoning_max_tokens: number | undefined,
+): AdaptiveEffort | undefined {
+	if (effort !== undefined) {
+		return effort;
+	}
+	if (reasoning_effort) {
+		switch (reasoning_effort) {
+			case "minimal": // AdaptiveEffort has no "minimal"; use its lowest level
+			case "low":
+				return "low";
+			case "medium":
+				return "medium";
+			case "xhigh":
+				return "xhigh";
+			case "max":
+				return "max";
+			case "high":
+			default: // also reached by "none", which has no case of its own
+				return "high";
+		}
+	}
+	if (reasoning_max_tokens !== undefined) {
+		if (reasoning_max_tokens < 2000) {
+			return "low";
+		}
+		if (reasoning_max_tokens < 8000) {
+			return "medium";
+		}
+		if (reasoning_max_tokens < 24000) {
+			return "high";
+		}
+		return "xhigh"; // 24000 and above; a budget never maps to "max"
+	}
+	return undefined;
+}
+
 function getProviderMapping(
 	modelDef: ModelDefinition | undefined,
 	usedProvider: ProviderId,
@@ -1908,28 +1965,21 @@ export async function prepareRequestBody(
 					// thinking text — their default flipped to "omitted" (empty thinking,
 					// signature only), unlike Opus 4.6 which defaults to "summarized".
 					requestBody.thinking = { type: "adaptive", display: "summarized" };
-					if (effort === undefined && reasoning_effort) {
-						const mapEffort = (
-							e: typeof reasoning_effort,
-						): "low" | "medium" | "high" | "xhigh" | "max" => {
-							switch (e) {
-								case "minimal":
-								case "low":
-									return "low";
-								case "medium":
-									return "medium";
-								case "high":
-									return "high";
-								case "xhigh":
-									return "xhigh";
-								case "max":
-									return "max";
-								default:
-									return "high";
-							}
-						};
-						requestBody.output_config ??= {};
-						requestBody.output_config.effort = mapEffort(reasoning_effort);
+					// Explicit `effort` is applied below alongside the other optional
+					// parameters; here we derive the adaptive depth from
+					// `reasoning_effort` or, failing that, bucket a requested
+					// `reasoning.max_tokens` into an effort level so the budget still
+					// influences depth instead of being silently dropped.
+					if (effort === undefined) {
+						const adaptiveEffort = resolveAdaptiveEffort(
+							undefined,
+							reasoning_effort,
+							reasoning_max_tokens,
+						);
+						if (adaptiveEffort !== undefined) {
+							requestBody.output_config ??= {};
+							requestBody.output_config.effort = adaptiveEffort;
+						}
 					}
 				} else {
 					requestBody.thinking = {
@@ -2380,28 +2430,11 @@ export async function prepareRequestBody(
 						type: "adaptive",
 						display: "summarized",
 					};
-					const mapEffort = (
-						e: typeof reasoning_effort,
-					): "low" | "medium" | "high" | "xhigh" | "max" => {
-						switch (e) {
-							case "minimal":
-							case "low":
-								return "low";
-							case "medium":
-								return "medium";
-							case "high":
-								return "high";
-							case "xhigh":
-								return "xhigh";
-							case "max":
-								return "max";
-							default:
-								return "high";
-						}
-					};
-					const adaptiveEffort =
-						effort ??
-						(reasoning_effort ? mapEffort(reasoning_effort) : undefined);
+					const adaptiveEffort = resolveAdaptiveEffort(
+						effort,
+						reasoning_effort,
+						reasoning_max_tokens,
+					);
 					if (adaptiveEffort !== undefined) {
 						requestBody.additionalModelRequestFields.output_config = {
 							effort: adaptiveEffort,