Skip to content

Commit f7fb96b

Browse files
authored
Include threat-detection credits in forecast totals, expose monthly low/high/stdev, and fix formal-verifier tool denials (#39101)
1 parent 35b9a15 commit f7fb96b

8 files changed

Lines changed: 380 additions & 22 deletions

.github/workflows/daily-formal-spec-verifier.lock.yml

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.github/workflows/daily-formal-spec-verifier.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ tools:
4545
- "cat specs/*.md"
4646
- "find . -name \"*_test.go\" -path \"*/pkg/*\" | head -20"
4747
- "cat pkg/workflow/*.go | head -200"
48+
- "cat pkg/cli/*.go"
4849

4950
safe-outputs:
5051
mentions: false

actions/setup/js/create_forecast_issue.cjs

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,21 @@ function monthlyCost(workflow) {
7070
return Number(workflow?.monthly_monte_carlo?.p50_projected_aic ?? workflow?.monthly_projected_aic ?? 0);
7171
}
7272

73+
/**
74+
* @param {Record<string, any>} workflow
75+
* @returns {{low:number,p50:number,high:number,stddev:number}}
76+
*/
77+
function getMonthlyForecastStats(workflow) {
78+
const monthlyMonteCarlo = workflow?.monthly_monte_carlo;
79+
const monthlyProjected = workflow?.monthly_projected_aic ?? 0;
80+
return {
81+
low: toFiniteNumber(monthlyMonteCarlo?.p10_projected_aic ?? monthlyProjected),
82+
p50: toFiniteNumber(monthlyMonteCarlo?.p50_projected_aic ?? monthlyProjected),
83+
high: toFiniteNumber(monthlyMonteCarlo?.p90_projected_aic ?? monthlyProjected),
84+
stddev: toFiniteNumber(monthlyMonteCarlo?.std_dev_aic ?? 0),
85+
};
86+
}
87+
7388
/**
7489
* @param {Record<string, any>} workflow
7590
* @returns {number}
@@ -89,11 +104,11 @@ function buildForecastIssueBody(report, options) {
89104

90105
const categorized = workflows.map(workflow => {
91106
const p50PerRun = toFiniteNumber(workflow?.p50_aic_per_run);
92-
const monthlyP50 = toFiniteNumber(workflow?.monthly_monte_carlo?.p50_projected_aic ?? workflow?.monthly_projected_aic);
93-
const hasForecastData = [p50PerRun, monthlyP50].some(hasPositiveAIC);
107+
const monthly = getMonthlyForecastStats(workflow);
108+
const hasForecastData = [p50PerRun, monthly.p50, monthly.high, monthly.low].some(hasPositiveAIC);
94109
return {
95110
workflow,
96-
row: [renderWorkflowLink(workflow, options), toFiniteNumber(workflow.sampled_runs), p50PerRun, monthlyP50],
111+
row: [renderWorkflowLink(workflow, options), toFiniteNumber(workflow.sampled_runs), p50PerRun, monthly.low, monthly.p50, monthly.high, monthly.stddev],
97112
hasForecastData,
98113
};
99114
});
@@ -117,7 +132,7 @@ function buildForecastIssueBody(report, options) {
117132
return !hasPositiveAIC(p50);
118133
});
119134

120-
const allMonthlyZero = tableRows.length > 0 && tableRows.every(([, , , monthly]) => Number(monthly) === 0);
135+
const allMonthlyZero = tableRows.length > 0 && tableRows.every(([, , , , monthlyP50]) => Number(monthlyP50) === 0);
121136
const allProjectedZero = legacyRows ? legacyRows.length > 0 && legacyRows.every(([, , p50]) => Number(p50) === 0) : allMonthlyZero;
122137

123138
let reportTable;
@@ -130,12 +145,15 @@ function buildForecastIssueBody(report, options) {
130145
if (tableRows.length === 0) {
131146
reportTable = "_No forecast rows were produced._";
132147
} else {
133-
const totalMonthly = tableRows.reduce((s, [, , , m]) => s + Number(m), 0);
134-
const dataRows = tableRows.map(([workflowID, sampledRuns, p50Run, monthly]) => `| ${workflowID} | ${sampledRuns} | ${formatAIC(p50Run)} | ${formatAIC(monthly)} |`);
148+
const totalMonthly = tableRows.reduce((s, [, , , , monthly]) => s + Number(monthly), 0);
149+
const dataRows = tableRows.map(
150+
([workflowID, sampledRuns, p50Run, monthlyLow, monthlyP50, monthlyHigh, monthlyStdDev]) =>
151+
`| ${workflowID} | ${sampledRuns} | ${formatAIC(p50Run)} | ${formatAIC(monthlyLow)} | ${formatAIC(monthlyP50)} | ${formatAIC(monthlyHigh)} | ${formatAIC(monthlyStdDev)} |`
152+
);
135153
if (tableRows.length > 1) {
136-
dataRows.push(`| **TOTAL** | | | **${formatAIC(totalMonthly)}** |`);
154+
dataRows.push(`| **TOTAL** | | | | **${formatAIC(totalMonthly)}** | | |`);
137155
}
138-
reportTable = ["| Workflow | Runs | P50/Run | Monthly (P50) |", "| --- | ---: | ---: | ---: |", ...dataRows].join("\n");
156+
reportTable = ["| Workflow | Runs | P50/Run | Monthly (Low) | Monthly (P50) | Monthly (High) | Monthly (Stdev) |", "| --- | ---: | ---: | ---: | ---: | ---: | ---: |", ...dataRows].join("\n");
139157
}
140158
}
141159
const withoutDataWorkflows = legacyRows ? legacyNoDataWorkflows : workflowsWithoutData;
@@ -166,8 +184,9 @@ function buildForecastIssueBody(report, options) {
166184
"### How to read this report",
167185
"",
168186
"- **P50/Run** is the median per-run AIC from sampled historical runs.",
169-
"- **Monthly (P50)** is the Monte Carlo median of total AIC over 30 days.",
170-
"- Monthly values are distribution medians, not a direct `P50/Run × runs` multiplication.",
187+
"- **Monthly (Low/P50/High)** are the Monte Carlo P10 / P50 / P90 total-AIC bounds over 30 days.",
188+
"- **Monthly (Stdev)** is the Monte Carlo standard deviation of the 30-day total-AIC distribution.",
189+
"- Monthly values come from the Monte Carlo distribution and are not a direct `P50/Run × runs` multiplication.",
171190
"",
172191
].join("\n");
173192

actions/setup/js/create_forecast_issue.test.cjs

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,12 @@ describe("create_forecast_issue", () => {
6666
p50_aic_per_run: 4000,
6767
p95_aic_per_run: 8000,
6868
weekly_monte_carlo: { p50_projected_aic: 12345.6 },
69-
monthly_monte_carlo: { p50_projected_aic: 52000 },
69+
monthly_monte_carlo: {
70+
p10_projected_aic: 48000,
71+
p50_projected_aic: 52000,
72+
p90_projected_aic: 61000,
73+
std_dev_aic: 3210,
74+
},
7075
},
7176
{
7277
workflow_id: "wf-b",
@@ -89,13 +94,15 @@ describe("create_forecast_issue", () => {
8994
}
9095
);
9196

92-
expect(body).toContain("| Workflow | Runs | P50/Run | Monthly (P50) |");
93-
expect(body).toContain("| [wf\\|a](https://github.com/octo/repo/actions/workflows/.github%2Fworkflows%2Fwf-a.yml) | 3 | 4,000 | 52,000 |");
97+
expect(body).toContain("| Workflow | Runs | P50/Run | Monthly (Low) | Monthly (P50) | Monthly (High) | Monthly (Stdev) |");
98+
expect(body).toContain("| [wf\\|a](https://github.com/octo/repo/actions/workflows/.github%2Fworkflows%2Fwf-a.yml) | 3 | 4,000 | 48,000 | 52,000 | 61,000 | 3,210 |");
9499
expect(body).toContain("### AW without data");
95100
expect(body).toContain("| [wf-b](https://github.com/octo/repo/actions/workflows/.github%2Fworkflows%2Fwf-b.yml) | 0 |");
96101
expect(body).toContain("AIC = 0 is treated as missing data and excluded from forecast computation.");
97102
expect(body).toContain("### How to read this report");
98-
expect(body).toContain("Monthly values are distribution medians");
103+
expect(body).toContain("Monte Carlo P10 / P50 / P90 total-AIC bounds");
104+
expect(body).toContain("Monte Carlo standard deviation");
105+
expect(body).toContain("Monthly values come from the Monte Carlo distribution");
99106
expect(body).toContain("_Forecast source run: [#123456](https://github.com/octo/repo/actions/runs/123456)._");
100107
expect(body).toContain("Consult the billing dashboards for accurate usage and charges.");
101108
expect(body).not.toContain("sampled runs but forecast AIC is 0");
@@ -125,7 +132,7 @@ describe("create_forecast_issue", () => {
125132
}
126133
);
127134

128-
expect(body).toContain("| wf-round | 1 | 2 | 5 |");
135+
expect(body).toContain("| wf-round | 1 | 2 | 5 | 5 | 5 | 0 |");
129136
});
130137

131138
it("lists workflows without data when every projected AIC is zero", async () => {
@@ -279,7 +286,7 @@ describe("create_forecast_issue", () => {
279286
}
280287
);
281288

282-
expect(body).toContain("| **TOTAL** | | | **42,000** |");
289+
expect(body).toContain("| **TOTAL** | | | | **42,000** | | |");
283290
});
284291

285292
it("sorts workflows by monthly cost descending", async () => {
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# ADR-39101: Aggregate All Usage-Artifact JSONL Files for Forecast AIC
2+
3+
**Date**: 2026-06-13
4+
**Status**: Draft
5+
**Deciders**: Unknown (generated from PR #39101)
6+
7+
---
8+
9+
## Part 1 — Narrative (Human-Friendly)
10+
11+
### Context
12+
13+
The cost-forecast pipeline computes per-run AI Credit (AIC) cost from a single token-usage file produced by the main agent. As workflows began spending AIC in threat-detection steps, that spend was recorded in separate usage records inside the compact `usage` artifact and was never read by the forecast loader, so forecast totals silently undercounted real cost. The forecast issue also exposed only a single monthly P50 figure, hiding the spread of the Monte Carlo projection from anyone trying to reason about worst-case monthly spend.
14+
15+
### Decision
16+
17+
We will compute forecast AIC by aggregating **every** `.jsonl` file under a run's `usage` artifact directory rather than reading only the main agent usage file. For each record we prefer an explicit credit value (`ai_credits`/`aic`) and otherwise recompute AIC from raw token counts via `computeModelInferenceAIC`. When no usage-directory files are present we fall back to the existing single-file path, preserving backward compatibility. We will also widen the forecast report from a single `Monthly (P50)` column to `Monthly (Low/P50/High/Stdev)` derived from the Monte Carlo distribution.
18+
19+
### Alternatives Considered
20+
21+
#### Alternative 1: Keep reading only the main agent usage file
22+
23+
The status quo. Rejected because it structurally cannot see detection spend, which lives in sibling records within the `usage` artifact — the very gap that motivated this change. No amount of per-run scaling fixes an input that omits a cost source.
24+
25+
#### Alternative 2: Pre-aggregate AIC upstream into one summed file
26+
27+
Have the artifact producer emit a single pre-summed usage file the forecast loader reads as-is. Rejected for this change because it pushes cost-summation and AIC-recomputation logic into artifact generation, couples the forecast format to the producer, and is a larger blast radius than reading the files that already exist. Reading the directory keeps the forecast loader as the single owner of AIC computation.
28+
29+
### Consequences
30+
31+
#### Positive
32+
- Forecast totals now include threat-detection credits, eliminating the documented undercount.
33+
- Both explicit-credit and token-only usage records are handled, so detection records missing `ai_credits` still contribute cost via recomputation.
34+
- The widened report surfaces low/high/stdev, letting readers gauge projection spread, not just the median.
35+
36+
#### Negative
37+
- The loader now walks the entire `usage` directory per run, adding filesystem I/O and a `filepath.Walk` traversal that scales with artifact file count.
38+
- Per-record precedence logic (`ai_credits` → `aic` → recomputed) adds branching that must stay in sync with the artifact record shape; a renamed field would silently zero a cost source.
39+
- The forecast issue table is wider, consuming more horizontal space in the rendered report.
40+
41+
#### Neutral
42+
- Behavior is unchanged for runs without a `usage` directory; the single-file path remains the fallback.
43+
- Sorting and totals stay centered on monthly P50, so report ranking is unaffected by the added columns.
44+
45+
---
46+
47+
## Part 2 — Normative Specification (RFC 2119)
48+
49+
> The key words **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**, **SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** in this section are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119).
50+
51+
### Forecast AIC Aggregation
52+
53+
1. When a run directory contains a `usage` subdirectory with one or more `.jsonl` files, the AIC-only loader **MUST** compute total AIC from all such files rather than from the single token-usage file.
54+
2. For each usage record, an implementation **MUST** prefer an explicit credit value (`ai_credits`/`aic`) when present and positive, and **MUST NOT** also recompute AIC from token counts for that same record.
55+
3. When no explicit credit value is present, or the value present is not positive, an implementation **SHOULD** recompute AIC from the record's token counts using the shared inference-cost function.
56+
4. When no `usage` directory files are found, an implementation **MUST** fall back to the existing single-file token-usage path.
57+
5. Records that are malformed, empty, or non-AIC **MUST** be skipped without aborting aggregation of the remaining records.
58+
59+
### Forecast Report Shape
60+
61+
1. The forecast table **MUST** present `Monthly (Low)`, `Monthly (P50)`, and `Monthly (High)` as the Monte Carlo P10, P50, and P90 of 30-day total AIC respectively.
62+
2. The forecast table **MUST** present `Monthly (Stdev)` as the Monte Carlo standard deviation of the 30-day total-AIC distribution.
63+
3. Sorting and totals **SHOULD** remain centered on the monthly P50 value.
64+
65+
### Conformance
66+
67+
An implementation is conformant with this ADR if it satisfies all **MUST** and **MUST NOT** requirements above. Failure to meet any **MUST** or **MUST NOT** requirement constitutes non-conformance.
68+
69+
---
70+
71+
*This is a DRAFT ADR generated by the [Design Decision Gate](https://github.com/github/gh-aw/actions/runs/27471799541) workflow. The PR author must review, complete, and finalize this document before the PR can merge.*

0 commit comments

Comments
 (0)