DaviRain-Su · pull · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/.github/APPROVED_CONTRIBUTORS b/.github/APPROVED_CONTRIBUTORS
@@ -237,3 +237,5 @@ davidlifschitz pr
 vdxz pr
 
 dangooddd pr
+
+Mearman pr
diff --git a/README.md b/README.md
@@ -62,9 +62,9 @@ Pi does not include a built-in permission system for restricting filesystem, pro
 
 If you need stronger boundaries, containerize or sandbox Pi. See [packages/coding-agent/docs/containerization.md](packages/coding-agent/docs/containerization.md) for three patterns:
 
-- **OpenShell**: run the whole `pi` process in a policy-controlled sandbox.
 - **Gondolin extension**: keep `pi` and provider auth on the host while routing built-in tools and `!` commands into a local Linux micro-VM.
 - **Plain Docker**: run the whole `pi` process in a local container for simple isolation.
+- **OpenShell**: run the whole `pi` process in a policy-controlled sandbox.
 
 ## Contributing
 

diff --git a/packages/ai/CHANGELOG.md b/packages/ai/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Fixed
 
+- Fixed GitHub Copilot Claude adaptive-thinking effort metadata to match manually checked Copilot model capabilities ([#4637](https://github.com/earendil-works/pi/issues/4637)).
 - Fixed OpenCode/OpenCode Go completion models that reject `prompt_cache_retention` to omit long-retention cache fields when `cacheRetention` is `long` ([#5702](https://github.com/earendil-works/pi/issues/5702)).
 
 ## [0.79.3] - 2026-06-13

diff --git a/packages/ai/scripts/generate-models.ts b/packages/ai/scripts/generate-models.ts
@@ -196,6 +196,14 @@ const OPENCODE_OPENAI_COMPLETIONS_LONG_CACHE_RETENTION_UNSUPPORTED_MODELS = new
 	"opencode-go:kimi-k2.6",
 ]);
 
+// Checked manually against the authenticated GitHub Copilot /models endpoint on 2026-06-15.
+// Keep this to narrow corrections over models.dev metadata instead of snapshotting Copilot's catalog.
+const GITHUB_COPILOT_THINKING_LEVEL_OVERRIDES: Record<string, NonNullable<Model<Api>["thinkingLevelMap"]>> = {
+	"claude-opus-4.7": { minimal: "low" },
+	"claude-opus-4.8": { minimal: "low" },
+	"claude-sonnet-4.6": { minimal: "low", xhigh: "max" },
+}; // Annotated rather than `satisfies` so the table can be indexed by an arbitrary `model.id` string.
+
 function mergeThinkingLevelMap(model: Model<any>, map: NonNullable<Model<any>["thinkingLevelMap"]>): void {
 	model.thinkingLevelMap = { ...model.thinkingLevelMap, ...map };
 }
@@ -358,6 +366,12 @@ function applyThinkingLevelMetadata(model: Model<any>): void {
 		// Ring reasons by default. Only high/xhigh have documented explicit effort controls.
 		mergeThinkingLevelMap(model, ANT_LING_RING_THINKING_LEVEL_MAP);
 	}
+	if (model.provider === "github-copilot") {
+		const override = GITHUB_COPILOT_THINKING_LEVEL_OVERRIDES[model.id];
+		if (override) {
+			mergeThinkingLevelMap(model, override);
+		}
+	}
 }
 
 function getAnthropicMessagesCompat(provider: string, modelId: string): AnthropicMessagesCompat | undefined {

diff --git a/packages/ai/src/models.generated.ts b/packages/ai/src/models.generated.ts
@@ -4194,7 +4194,7 @@ export const MODELS = {
 			headers: {"User-Agent":"GitHubCopilotChat/0.35.0","Editor-Version":"vscode/1.107.0","Editor-Plugin-Version":"copilot-chat/0.35.0","Copilot-Integration-Id":"vscode-chat"},
 			compat: {"forceAdaptiveThinking":true,"supportsTemperature":false},
 			reasoning: true,
-			thinkingLevelMap: {"xhigh":"xhigh"},
+			thinkingLevelMap: {"xhigh":"xhigh","minimal":"low"},
 			input: ["text", "image"],
 			cost: {
 				input: 5,
@@ -4214,7 +4214,7 @@ export const MODELS = {
 			headers: {"User-Agent":"GitHubCopilotChat/0.35.0","Editor-Version":"vscode/1.107.0","Editor-Plugin-Version":"copilot-chat/0.35.0","Copilot-Integration-Id":"vscode-chat"},
 			compat: {"forceAdaptiveThinking":true,"supportsTemperature":false},
 			reasoning: true,
-			thinkingLevelMap: {"xhigh":"xhigh"},
+			thinkingLevelMap: {"xhigh":"xhigh","minimal":"low"},
 			input: ["text", "image"],
 			cost: {
 				input: 5,
@@ -4272,6 +4272,7 @@ export const MODELS = {
 			headers: {"User-Agent":"GitHubCopilotChat/0.35.0","Editor-Version":"vscode/1.107.0","Editor-Plugin-Version":"copilot-chat/0.35.0","Copilot-Integration-Id":"vscode-chat"},
 			compat: {"forceAdaptiveThinking":true},
 			reasoning: true,
+			thinkingLevelMap: {"minimal":"low","xhigh":"max"},
 			input: ["text", "image"],
 			cost: {
 				input: 3,
@@ -4831,6 +4832,42 @@ export const MODELS = {
 			contextWindow: 262144,
 			maxTokens: 32768,
 		} satisfies Model<"google-generative-ai">,
+		"gemma-4-E2B-it": {
+			id: "gemma-4-E2B-it",
+			name: "Gemma 4 E2B IT",
+			api: "google-generative-ai",
+			provider: "google",
+			baseUrl: "https://generativelanguage.googleapis.com/v1beta",
+			reasoning: true,
+			thinkingLevelMap: {"off":null,"minimal":"MINIMAL","low":null,"medium":null,"high":"HIGH"},
+			input: ["text", "image"],
+			cost: {
+				input: 0,
+				output: 0,
+				cacheRead: 0,
+				cacheWrite: 0,
+			},
+			contextWindow: 131072,
+			maxTokens: 8192,
+		} satisfies Model<"google-generative-ai">,
+		"gemma-4-E4B-it": {
+			id: "gemma-4-E4B-it",
+			name: "Gemma 4 E4B IT",
+			api: "google-generative-ai",
+			provider: "google",
+			baseUrl: "https://generativelanguage.googleapis.com/v1beta",
+			reasoning: true,
+			thinkingLevelMap: {"off":null,"minimal":"MINIMAL","low":null,"medium":null,"high":"HIGH"},
+			input: ["text", "image"],
+			cost: {
+				input: 0,
+				output: 0,
+				cacheRead: 0,
+				cacheWrite: 0,
+			},
+			contextWindow: 131072,
+			maxTokens: 8192,
+		} satisfies Model<"google-generative-ai">,
 	},
 	"google-vertex": {
 		"gemini-1.5-flash": {
@@ -9432,13 +9469,13 @@ export const MODELS = {
 			thinkingLevelMap: {"minimal":null,"low":null,"medium":null,"high":"high","xhigh":"xhigh"},
 			input: ["text"],
 			cost: {
-				input: 0.098,
-				output: 0.196,
+				input: 0.09,
+				output: 0.18,
 				cacheRead: 0.02,
 				cacheWrite: 0,
 			},
 			contextWindow: 1048576,
-			maxTokens: 4096,
+			maxTokens: 65536,
 		} satisfies Model<"openai-completions">,
 		"deepseek/deepseek-v4-pro": {
 			id: "deepseek/deepseek-v4-pro",
@@ -13304,6 +13341,25 @@ export const MODELS = {
 			contextWindow: 262144,
 			maxTokens: 131000,
 		} satisfies Model<"openai-completions">,
+		"moonshotai/Kimi-K2.7-Code": {
+			id: "moonshotai/Kimi-K2.7-Code",
+			name: "Kimi K2.7 Code",
+			api: "openai-completions",
+			provider: "together",
+			baseUrl: "https://api.together.ai/v1",
+			compat: {"supportsStore":false,"supportsDeveloperRole":false,"supportsReasoningEffort":false,"maxTokensField":"max_tokens","supportsStrictMode":false,"supportsLongCacheRetention":false,"thinkingFormat":"together"},
+			reasoning: true,
+			thinkingLevelMap: {"minimal":null,"low":null,"medium":null},
+			input: ["text"],
+			cost: {
+				input: 0.95,
+				output: 4,
+				cacheRead: 0.19,
+				cacheWrite: 0,
+			},
+			contextWindow: 262144,
+			maxTokens: 131072,
+		} satisfies Model<"openai-completions">,
 		"nvidia/nemotron-3-ultra-550b-a55b": {
 			id: "nvidia/nemotron-3-ultra-550b-a55b",
 			name: "Nemotron 3 Ultra 550B A55B",

diff --git a/packages/ai/src/models.ts b/packages/ai/src/models.ts
@@ -37,10 +37,13 @@ export function getModels<TProvider extends KnownProvider>(
 }
 
 export function calculateCost<TApi extends Api>(model: Model<TApi>, usage: Usage): Usage["cost"] {
+	// Anthropic charges 2x base input for 1h cache writes; the 1h share is clamped to [0, cacheWrite].
+	const longWrite = Math.min(Math.max(usage.cacheWrite1h ?? 0, 0), usage.cacheWrite);
+	const shortWrite = usage.cacheWrite - longWrite; // remainder is billed at the model's regular cacheWrite rate
 	usage.cost.input = (model.cost.input / 1000000) * usage.input;
 	usage.cost.output = (model.cost.output / 1000000) * usage.output;
 	usage.cost.cacheRead = (model.cost.cacheRead / 1000000) * usage.cacheRead;
-	usage.cost.cacheWrite = (model.cost.cacheWrite / 1000000) * usage.cacheWrite;
+	usage.cost.cacheWrite = (model.cost.cacheWrite * shortWrite + model.cost.input * 2 * longWrite) / 1000000;
 	usage.cost.total = usage.cost.input + usage.cost.output + usage.cost.cacheRead + usage.cost.cacheWrite;
 	return usage.cost;
 }

diff --git a/packages/ai/src/providers/anthropic.ts b/packages/ai/src/providers/anthropic.ts
@@ -535,6 +535,7 @@ export const streamAnthropic: StreamFunction<"anthropic-messages", AnthropicOpti
 					output.usage.output = event.message.usage.output_tokens || 0;
 					output.usage.cacheRead = event.message.usage.cache_read_input_tokens || 0;
 					output.usage.cacheWrite = event.message.usage.cache_creation_input_tokens || 0;
+					output.usage.cacheWrite1h = event.message.usage.cache_creation?.ephemeral_1h_input_tokens || 0;
 					// Anthropic doesn't provide total_tokens, compute from components
 					output.usage.totalTokens =
 						output.usage.input + output.usage.output + output.usage.cacheRead + output.usage.cacheWrite;

diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts
@@ -267,6 +267,8 @@ export interface Usage {
 	output: number;
 	cacheRead: number;
 	cacheWrite: number;
+	/** Subset of `cacheWrite` written with 1h retention. Only Anthropic reports this split. */
+	cacheWrite1h?: number;
 	totalTokens: number;
 	cost: {
 		input: number;

diff --git a/packages/ai/test/anthropic-cache-write-1h-cost.test.ts b/packages/ai/test/anthropic-cache-write-1h-cost.test.ts
@@ -0,0 +1,86 @@
+import type Anthropic from "@anthropic-ai/sdk";
+import { describe, expect, it } from "vitest";
+import { getModel } from "../src/models.ts";
+import { streamAnthropic } from "../src/providers/anthropic.ts";
+import type { Context } from "../src/types.ts";
+
+function createSseResponse(events: Array<{ event: string; data: string }>): Response {
+	const body = events.map(({ event, data }) => `event: ${event}\ndata: ${data}\n\n`).join(""); // a blank line ends every event, including the last
+	return new Response(body, { status: 200, headers: { "content-type": "text/event-stream" } });
+}
+
+function createFakeAnthropicClient(response: Response): Anthropic {
+	// Minimal stand-in: only `messages.create(...).asResponse()` exists and it resolves to the canned response.
+	const create = () => ({ asResponse: () => Promise.resolve(response) });
+	return { messages: { create } } as unknown as Anthropic;
+}
+
+function eventsWithCacheCreation(
+	cacheCreation: Record<string, number> | undefined, // optional breakdown attached to message_start usage as `cache_creation`
+): Array<{ event: string; data: string }> { // returns the SSE events of one short text reply
+	const startUsage: Record<string, unknown> = {
+		input_tokens: 100,
+		output_tokens: 0,
+		cache_read_input_tokens: 0,
+		cache_creation_input_tokens: 1_000_000,
+	};
+	if (cacheCreation) startUsage.cache_creation = cacheCreation; // key is left out entirely when no breakdown is given
+	return [
+		{
+			event: "message_start",
+			data: JSON.stringify({ type: "message_start", message: { id: "msg_test", usage: startUsage } }),
+		},
+		{
+			event: "content_block_start",
+			data: JSON.stringify({ type: "content_block_start", index: 0, content_block: { type: "text", text: "" } }),
+		},
+		{
+			event: "content_block_delta",
+			data: JSON.stringify({ type: "content_block_delta", index: 0, delta: { type: "text_delta", text: "Hi" } }),
+		},
+		{ event: "content_block_stop", data: JSON.stringify({ type: "content_block_stop", index: 0 }) },
+		{
+			event: "message_delta",
+			data: JSON.stringify({
+				type: "message_delta",
+				delta: { stop_reason: "end_turn" },
+				usage: { // final usage carries no `cache_creation` breakdown
+					input_tokens: 100,
+					output_tokens: 5,
+					cache_read_input_tokens: 0,
+					cache_creation_input_tokens: 1_000_000,
+				},
+			}),
+		},
+		{ event: "message_stop", data: JSON.stringify({ type: "message_stop" }) },
+	];
+}
+
+// claude-opus-4-8: input 5, cacheWrite (5m) 6.25 per Mtok. 1h write = 2x input = 10.
+const context: Context = { messages: [{ role: "user", content: "hi", timestamp: Date.now() }] };
+
+describe("Anthropic 1h cache write cost", () => {
+	it("prices the 1h portion at 2x input and the rest at the 5m rate", async () => {
+		const model = getModel("anthropic", "claude-opus-4-8");
+		const response = createSseResponse(
+			eventsWithCacheCreation({ ephemeral_5m_input_tokens: 600_000, ephemeral_1h_input_tokens: 400_000 }),
+		);
+		const result = await streamAnthropic(model, context, { client: createFakeAnthropicClient(response) }).result(); // fake client serves the canned SSE body
+
+		expect(result.usage.cacheWrite).toBe(1_000_000);
+		expect(result.usage.cacheWrite1h).toBe(400_000);
+		// 600k * 6.25/Mtok + 400k * 10/Mtok = 3.75 + 4.0 = 7.75
+		expect(result.usage.cost.cacheWrite).toBeCloseTo(7.75, 10);
+	});
+
+	it("falls back to the 5m rate when no breakdown is reported", async () => {
+		const model = getModel("anthropic", "claude-opus-4-8");
+		const response = createSseResponse(eventsWithCacheCreation(undefined)); // no `cache_creation` object in the stream
+		const result = await streamAnthropic(model, context, { client: createFakeAnthropicClient(response) }).result();
+
+		expect(result.usage.cacheWrite).toBe(1_000_000);
+		expect(result.usage.cacheWrite1h ?? 0).toBe(0); // accepts either an absent field or an explicit 0
+		// 1M * 6.25/Mtok = 6.25
+		expect(result.usage.cost.cacheWrite).toBeCloseTo(6.25, 10);
+	});
+});
diff --git a/packages/ai/test/github-copilot-anthropic.test.ts b/packages/ai/test/github-copilot-anthropic.test.ts
@@ -1,5 +1,5 @@
 import { describe, expect, it, vi } from "vitest";
-import { getModel } from "../src/models.ts";
+import { getModel, getSupportedThinkingLevels } from "../src/models.ts";
 import { streamAnthropic } from "../src/providers/anthropic.ts";
 import type { Context } from "../src/types.ts";
 
@@ -54,6 +54,16 @@ describe("Copilot Claude via Anthropic Messages", () => {
 		messages: [{ role: "user", content: "Hello", timestamp: Date.now() }],
 	};
 
+	it("applies Copilot-specific adaptive thinking effort overrides", () => {
+		const opus47 = getModel("github-copilot", "claude-opus-4.7");
+		expect(opus47.thinkingLevelMap).toMatchObject({ minimal: "low", xhigh: "xhigh" }); // xhigh is not in the override, so it must survive the merge
+		expect(getSupportedThinkingLevels(opus47)).toContain("xhigh");
+
+		const sonnet46 = getModel("github-copilot", "claude-sonnet-4.6");
+		expect(sonnet46.thinkingLevelMap).toMatchObject({ minimal: "low", xhigh: "max" }); // here xhigh is mapped to the "max" effort value
+		expect(getSupportedThinkingLevels(sonnet46)).toContain("xhigh");
+	});
+
 	it("uses Bearer auth, Copilot headers, and valid Anthropic Messages payload", async () => {
 		const model = getModel("github-copilot", "claude-sonnet-4.6");
 		expect(model.api).toBe("anthropic-messages");

diff --git a/packages/coding-agent/docs/containerization.md b/packages/coding-agent/docs/containerization.md
@@ -10,46 +10,12 @@ There are two general options. You can either
 
 | Pattern | What is isolated | Best for | Notes |
 | --- | --- | --- | --- |
-| OpenShell | Whole `pi` process in a policy-controlled sandbox | Local or remote managed sandbox | Requires an OpenShell gateway |
 | Gondolin extension | Built-in tools and `!` commands | Local micro-VM isolation while keeping auth on host | See [`examples/extensions/gondolin/`](../examples/extensions/gondolin/). |
 | Plain Docker | Whole `pi` process in a local container | Simple local isolation | Provider API keys enter the container. |
+| OpenShell | Whole `pi` process in a policy-controlled sandbox | Local or remote managed sandbox | Requires an OpenShell gateway |
 
 Extensions run wherever the `pi` process runs. If you run host `pi` with a tool-routing extension, other custom extension tools still run on the host unless they also delegate their operations.
 
-## OpenShell
-
-Use [NVIDIA OpenShell](https://docs.nvidia.com/openshell/about/overview) when you want a policy-controlled sandbox with filesystem, process, network, credential, and inference controls.
-OpenShell can run sandboxes through a local gateway backed by Docker, Podman, or a VM runtime, or through a remote Kubernetes gateway.
-
-Every sandbox requires an active gateway.
-Register and select one before creating a sandbox:
-
-```bash
-openshell gateway add <gateway-url> --name <name>
-openshell gateway select <name>
-```
-
-Launch `pi` inside an OpenShell sandbox:
-
-```bash
-openshell sandbox create --name pi-sandbox --from pi -- pi
-```
-
-In this pattern, the whole `pi` process runs inside the sandbox.
-Built-in tools, `!` commands, and extension tools execute inside the OpenShell boundary.
-
-If the gateway is remote, project files are not bind-mounted from the host, meaning writes in the sandbox are not reflected on your machine.
-Clone the repository inside the sandbox or use OpenShell file transfer commands:
-
-```bash
-openshell sandbox upload pi-sandbox ./repo /workspace
-openshell sandbox download pi-sandbox /workspace/repo ./repo-out
-```
-
-OpenShell providers can keep raw model API keys outside the sandbox.
-When inference routing is configured, code inside the sandbox can call `https://inference.local`, and the gateway injects the configured provider credentials upstream.
-Configure Pi to use the corresponding OpenAI-compatible or Anthropic-compatible endpoint if you want model traffic to use this route.
-
 ## Gondolin
 
 [Gondolin](https://github.com/earendil-works/gondolin) is a local Linux micro-VM.
@@ -109,3 +75,37 @@ docker run --rm -it \
 The `-v "$PWD:/workspace"` mounts your current directory into the container at /workspace such that reads and writes in `/workspace` inside Docker directly affect your host files, like in the Gondolin example.
 
 Use a named volume for `/root/.pi/agent` if you want container-local settings and sessions. Mounting your host `~/.pi/agent` exposes host auth and session files to the container.
+
+## OpenShell
+
+Use [NVIDIA OpenShell](https://docs.nvidia.com/openshell/about/overview) when you want a policy-controlled sandbox with filesystem, process, network, credential, and inference controls.
+OpenShell can run sandboxes through a local gateway backed by Docker, Podman, or a VM runtime, or through a remote Kubernetes gateway.
+
+Every sandbox requires an active gateway.
+Register and select one before creating a sandbox:
+
+```bash
+openshell gateway add <gateway-url> --name <name>
+openshell gateway select <name>
+```
+
+Launch `pi` inside an OpenShell sandbox:
+
+```bash
+openshell sandbox create --name pi-sandbox --from pi -- pi
+```
+
+In this pattern, the whole `pi` process runs inside the sandbox.
+Built-in tools, `!` commands, and extension tools execute inside the OpenShell boundary.
+
+If the gateway is remote, project files are not bind-mounted from the host, meaning writes in the sandbox are not reflected on your machine.
+Clone the repository inside the sandbox or use OpenShell file transfer commands:
+
+```bash
+openshell sandbox upload pi-sandbox ./repo /workspace
+openshell sandbox download pi-sandbox /workspace/repo ./repo-out
+```
+
+OpenShell providers can keep raw model API keys outside the sandbox.
+When inference routing is configured, code inside the sandbox can call `https://inference.local`, and the gateway injects the configured provider credentials upstream.
+Configure Pi to use the corresponding OpenAI-compatible or Anthropic-compatible endpoint if you want model traffic to use this route.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -237,3 +237,5 @@ davidlifschitz pr
		vdxz pr

		dangooddd pr

		Mearman pr