Skip to content

Commit 4e147d3

Browse files
authored
fix(evals): capture tool calls in eval runner and improve canhelp evals (#134)
The eval runner now uses stream-json to capture tool calls during execution, giving the judge visibility into which scripts were actually run. It also parses allowed-tools from the skill frontmatter so that skills which require Bash scripts (like canhelp) can execute them during evals.

Canhelp eval improvements:
- Use obscure canisters (Neutrinite) instead of well-known ones (ICP Ledger, NNS Governance) to prevent Claude from answering from training data instead of running the scripts.
- Use a canister that has wasm but no candid:service metadata (OpenChat SNS canister r2pvs-tyaaa-aaaar-ajcwq-cai) for the missing-metadata eval, instead of one with no wasm installed.
- Fix the local canister eval to match the skill's behavior (mainnet-only guidance) instead of expecting a fetch attempt.
- Remove the redundant "Large interface summarization" eval, which duplicated the "Lookup by name" and "Output format" evals.
1 parent 4782c6f commit 4e147d3

2 files changed

Lines changed: 100 additions & 40 deletions

File tree

evaluations/canhelp.json

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
"output_evals": [
66
{
77
"name": "Lookup by canister ID",
8-
"prompt": "What can canister ryjl3-tyaaa-aaaaa-aaaba-cai do?",
8+
"prompt": "What can canister f54if-eqaaa-aaaaq-aacea-cai do?",
99
"expected_behaviors": [
1010
"Runs resolve-canister-id.sh with the provided principal",
11-
"Runs fetch-candid.sh with the resolved canister ID",
11+
"Runs fetch-candid.sh with the canister ID",
1212
"Reads the downloaded .did file",
1313
"Groups methods into Query and Update sections",
1414
"Sorts methods alphabetically within each group",
@@ -17,7 +17,7 @@
1717
},
1818
{
1919
"name": "Lookup by human-readable name",
20-
"prompt": "Show me the interface for the NNS governance canister",
20+
"prompt": "Show me the interface for the Neutrinite Governance canister",
2121
"expected_behaviors": [
2222
"Runs resolve-canister-id.sh with the name (not a hardcoded canister ID)",
2323
"Displays the resolved canister ID before proceeding",
@@ -35,21 +35,9 @@
3535
"Does NOT arbitrarily pick one and proceed without asking"
3636
]
3737
},
38-
{
39-
"name": "Large interface summarization",
40-
"prompt": "Show me what the NNS governance canister can do",
41-
"expected_behaviors": [
42-
"Resolves 'NNS governance' to a canister ID",
43-
"Fetches and reads the Candid interface",
44-
"Groups methods by query vs update",
45-
"Sorts methods alphabetically within each group",
46-
"Shows method signatures with argument types and return types",
47-
"Lists key types like Neuron, ProposalInfo, ManageNeuronRequest"
48-
]
49-
},
5038
{
5139
"name": "Output format structure",
52-
"prompt": "Tell me about canister rrkah-fqaaa-aaaaa-aaaaq-cai",
40+
"prompt": "Tell me about canister eqsml-lyaaa-aaaaq-aacdq-cai",
5341
"expected_behaviors": [
5442
"Starts the summary with 'Canister <canister-id>'",
5543
"Has a Query methods section",
@@ -60,20 +48,20 @@
6048
]
6149
},
6250
{
63-
"name": "Empty Candid interface",
64-
"prompt": "What methods does canister uf6dk-hyaaa-aaaaq-qaaaq-cai expose?",
51+
"name": "Missing Candid metadata",
52+
"prompt": "What methods does canister r2pvs-tyaaa-aaaar-ajcwq-cai expose?",
6553
"expected_behaviors": [
6654
"Runs fetch-candid.sh with the canister ID",
67-
"If the Candid interface is empty, explains that the canister is not exposing its Candid interface in the wasm metadata",
55+
"Explains that the canister does not expose its Candid interface in the wasm metadata",
6856
"Does NOT fabricate or hallucinate methods"
6957
]
7058
},
7159
{
7260
"name": "Adversarial: local canister",
7361
"prompt": "Show me the interface for my local canister bkyz2-fmaaa-aaaaa-qaaaq-cai",
7462
"expected_behaviors": [
75-
"Attempts to resolve and fetch the canister",
76-
"If the fetch fails, suggests verifying the canister ID and that icp is installed",
63+
"Recognizes that this is a local canister or that the skill is mainnet-only",
64+
"Suggests reading the local .did file from the project directory instead",
7765
"Does NOT hallucinate a Candid interface"
7866
]
7967
},
@@ -113,4 +101,4 @@
113101
"How do I test my canister locally?"
114102
]
115103
}
116-
}
104+
}

scripts/evaluate-skills.js

Lines changed: 90 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import { readFileSync, writeFileSync, mkdirSync } from "fs";
2727
import { execFileSync } from "child_process";
2828
import { join } from "path";
29-
import { readAllSkills } from "./lib/parse-skill.js";
29+
import { readAllSkills, parseFrontmatter } from "./lib/parse-skill.js";
3030

3131
const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, "");
3232

@@ -51,6 +51,8 @@ const listEvals = args.includes("--list");
5151
// ---------------------------------------------------------------------------
5252
const skillDir = join(ROOT, "skills", skillName);
5353
const skillContent = readFileSync(join(skillDir, "SKILL.md"), "utf-8");
54+
const skillMeta = parseFrontmatter(skillContent);
55+
const skillAllowedTools = skillMeta?.["allowed-tools"] || "Read";
5456
const evalsFile = join(ROOT, "evaluations", `${skillName}.json`);
5557
const evals = JSON.parse(readFileSync(evalsFile, "utf-8"));
5658

@@ -76,56 +78,113 @@ if (evalFilter) {
7678
// ---------------------------------------------------------------------------
7779

7880
/**
 * Run a prompt through the claude CLI and return the output text (and optionally tool calls).
 *
 * The prompt is passed on stdin and the system prompt as a single argv entry via
 * execFileSync, so no shell is involved. Shell expansion of $VAR and $(...) in skill
 * content (e.g., $ICP_WASM_OUTPUT_PATH) would otherwise corrupt the system prompt.
 *
 * @param {string} prompt - The user prompt
 * @param {string|null} systemPrompt - Optional system prompt (skill content)
 * @param {object} [options] - Optional settings
 * @param {string} [options.cwd] - Working directory (defaults to /tmp)
 * @param {string} [options.allowedTools] - Comma-separated tool patterns to allow (e.g. from allowed-tools frontmatter)
 * @param {boolean} [options.captureToolCalls] - If true, use stream-json to capture tool calls
 * @returns {string|{text: string, toolCalls: string[]}} - Plain text, or an object with
 *   the final text and tool-call summaries when captureToolCalls is set. Failures never
 *   throw: the text is "[ERROR] <message>" instead.
 */
function runClaude(prompt, systemPrompt, options = {}) {
  const useStreamJson = Boolean(options.captureToolCalls);

  const args = ["-p", "--model", "sonnet"];
  if (useStreamJson) {
    args.push("--output-format", "stream-json", "--verbose");
  }
  if (systemPrompt) {
    args.push("--system-prompt", systemPrompt);
  }
  if (options.allowedTools) {
    args.push("--allowedTools", options.allowedTools);
  }

  const cwd = options.cwd || "/tmp";
  let raw;
  try {
    raw = execFileSync("claude", args, {
      input: prompt,
      encoding: "utf-8",
      maxBuffer: 5 * 1024 * 1024,
      timeout: 120_000,
      cwd,
    }).trim();
  } catch (e) {
    const errText = `[ERROR] ${e.message}`;
    if (!useStreamJson) return errText;
    // On timeout or non-zero exit the error carries whatever the child already
    // wrote to stdout. Keep the tool calls found there so the judge can still
    // see which scripts ran before the failure. On spawn failure there is no
    // stdout and the list stays empty.
    const partial = typeof e.stdout === "string" ? e.stdout : "";
    return { text: errText, toolCalls: parseStreamJson(partial).toolCalls };
  }

  if (!useStreamJson) return raw;
  return parseStreamJson(raw);
}

/**
 * Build the one-line summary recorded for a single tool_use content block.
 * Bash and Read get their most relevant input field; any other tool gets its
 * JSON-encoded input truncated to 200 characters.
 * @param {{name: string, input?: object}} block - A tool_use content block
 * @returns {string} Summary in the form "<tool>: <detail>"
 */
function summarizeToolUse(block) {
  const input = block.input || {};
  if (block.name === "Bash") return `Bash: ${input.command || ""}`;
  if (block.name === "Read") return `Read: ${input.file_path || ""}`;
  return `${block.name}: ${JSON.stringify(input).slice(0, 200)}`;
}

/**
 * Parse newline-delimited stream-json output into final text plus tool calls.
 * Lines that are not JSON objects (blank lines, stray log output, literal
 * `null`) are skipped rather than aborting the whole parse.
 * @param {string} raw - Raw stdout captured from the CLI
 * @returns {{text: string, toolCalls: string[]}} Final text and tool-call summaries
 */
function parseStreamJson(raw) {
  const toolCalls = [];
  const assistantText = [];
  let resultText = "";

  for (const line of raw.split("\n")) {
    let msg;
    try { msg = JSON.parse(line); } catch { continue; }
    // JSON.parse accepts primitives and null; only objects carry a `type`.
    if (msg === null || typeof msg !== "object") continue;

    if (msg.type === "assistant" && Array.isArray(msg.message?.content)) {
      for (const block of msg.message.content) {
        if (block?.type === "tool_use") {
          toolCalls.push(summarizeToolUse(block));
        } else if (block?.type === "text" && typeof block.text === "string") {
          // Assumes assistant prose arrives as {type: "text", text} blocks.
          assistantText.push(block.text);
        }
      }
    } else if (msg.type === "result") {
      resultText = msg.result || "";
    }
  }

  // A truncated stream or an error result has no usable `result` text. Fall
  // back to the streamed assistant text so the judge is not handed an empty
  // output.
  return { text: resultText || assistantText.join("\n"), toolCalls };
}
111150

112-
/** Ask claude to judge an output against expected behaviors. */
151+
/**
152+
* Ask claude to judge an output against expected behaviors.
153+
* @param {object} evalCase - The eval case with prompt and expected_behaviors
154+
* @param {string|{text: string, toolCalls: string[]}} output - Plain text or structured output
155+
* @param {string} label - Label for logging
156+
*/
113157
function judge(evalCase, output, label) {
114158
const behaviors = evalCase.expected_behaviors
115159
.map((b, i) => `${i + 1}. ${b}`)
116160
.join("\n");
117161

162+
// Build the output section, including tool calls if available
163+
const isStructured = typeof output === "object" && output.toolCalls;
164+
let outputSection;
165+
if (isStructured && output.toolCalls.length > 0) {
166+
const toolList = output.toolCalls.map((t, i) => `${i + 1}. ${t}`).join("\n");
167+
outputSection = `<tool_calls>
168+
The assistant made the following tool calls during execution:
169+
${toolList}
170+
</tool_calls>
171+
172+
<output>
173+
${output.text}
174+
</output>`;
175+
} else {
176+
outputSection = `<output>
177+
${isStructured ? output.text : output}
178+
</output>`;
179+
}
180+
118181
const judgePrompt = `You are an evaluation judge. A coding assistant was given this task:
119182
120183
<task>
121184
${evalCase.prompt}
122185
</task>
123186
124-
The assistant produced this output:
125-
126-
<output>
127-
${output}
128-
</output>
187+
${outputSection}
129188
130189
Score each expected behavior as PASS or FAIL. Be strict — the behavior must be clearly present, not just vaguely implied. Return ONLY a JSON array of objects with "behavior", "pass" (boolean), and "reason" (one sentence).
131190
@@ -231,14 +290,23 @@ if (!triggersOnly && outputCases.length > 0) {
231290
for (const evalCase of outputCases) {
232291
console.log(`━━━ ${evalCase.name} ━━━\n`);
233292

234-
// Run WITH skill — from the skill directory with Read access so the
235-
// agent can fetch reference files on demand, matching real usage.
293+
// Run WITH skill — from the skill directory with tools declared in the
294+
// skill's allowed-tools frontmatter, matching real usage. Use stream-json
295+
// to capture tool calls so the judge can verify script execution.
236296
console.log(" Running WITH skill...");
237297
const withOutput = runClaude(evalCase.prompt, skillContent, {
238298
cwd: skillDir,
239-
allowRead: true,
299+
allowedTools: skillAllowedTools,
300+
captureToolCalls: true,
240301
});
241302

303+
if (withOutput.toolCalls?.length > 0) {
304+
console.log(` Tool calls: ${withOutput.toolCalls.length}`);
305+
for (const tc of withOutput.toolCalls) {
306+
console.log(` → ${tc}`);
307+
}
308+
}
309+
242310
// Run WITHOUT skill (baseline) — no tools, no skill context
243311
let withoutOutput = null;
244312
if (!skipBaseline) {
@@ -277,9 +345,13 @@ if (!triggersOnly && outputCases.length > 0) {
277345
}
278346
}
279347

348+
// Store text output (not the full structured object) in results
349+
const withOutputText = typeof withOutput === "object" ? withOutput.text : withOutput;
350+
const withToolCalls = typeof withOutput === "object" ? withOutput.toolCalls : [];
351+
280352
allResults.output_evals.push({
281353
name: evalCase.name,
282-
with_skill: { output: withOutput, judgment: withJudgment },
354+
with_skill: { output: withOutputText, tool_calls: withToolCalls, judgment: withJudgment },
283355
without_skill: withoutOutput
284356
? { output: withoutOutput, judgment: withoutJudgment }
285357
: null,

0 commit comments

Comments
 (0)