Skip to content

Commit e9499ca

Browse files
hugbubbyclaude
andcommitted
refactor(api): extract InvestigationLease table, fix review issues
Migrates lease state from the InvestigationRun.leaseOwner/leaseExpiresAt fields (nullable, representable-invalid) into a separate InvestigationLease table where the row's *existence* represents "this investigation is PROCESSING with a live lease holder". This eliminates the invalid states that the old schema permitted (leaseOwner without leaseExpiresAt, PENDING/COMPLETE investigations with orphan lease fields, etc.) and removes the now-redundant InvestigationRun model entirely. Key design changes: - InvestigationLease row existence ↔ PROCESSING status structural invariant - progressClaims moves to InvestigationLease (auto-cleaned on row delete) - attemptCount, queuedAt, retryAfter promoted to Investigation directly - Retry backoff now application-managed (not graphile-worker maxAttempts): TRANSIENT failures reclaim to PENDING + re-enqueue with exponential delay (10s × 2^(attempt-1)); NON_RETRYABLE failures mark FAILED immediately - Per-investigation jobKey (investigate:${id}) deduplicate concurrent enqueues Review fixes included in this commit: - Bug: duplicate no-op assertion in interim-source-selection test replaced with an explicit InvestigationLease DB query (the original assertion was checking the same already-verified variable twice, losing coverage) - Coverage: extract markInvestigationFailedInTx and releaseLeaseToRetryInTx from attempt-audit.ts so all three terminal/reclaim helpers can be unit tested with mock tx clients, not just persistCompletedInvestigation - Coverage: add invariant-violation throw tests for all three tx helpers (lease deleted but status not PROCESSING → throw rolls back lease deletion) - Coverage: add investigation-lease.test.ts pinning the retry backoff schedule (BASE_BACKOFF_MS, formula, concrete per-attempt values per spec §3.7) - Fix: TypeScript narrowing bug in investigation-lifecycle.ts (investigation variable closure capture in $transaction callback) - Clean: remove redundant seedInvestigationWithLeaseFields calls in lifecycle fuzz test (startedAt/heartbeatAt are not asserted and were already set by seedInvestigation) - Docs: document in tryClaimLease that retryAfter is intentionally not gated there; investigateNow bypassing the backoff window is by design Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent ec1b158 commit e9499ca

28 files changed

Lines changed: 1412 additions & 1179 deletions

SPEC.md

Lines changed: 60 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -980,12 +980,16 @@ model Investigation {
980980
model InvestigationModel
981981
modelVersion String? // Provider-reported model revision/version when available
982982
checkedAt DateTime? // Null until completion
983-
// Transient progress state during active PROCESSING. Contains
984-
// { pending: ClaimPayload[], confirmed: ClaimPayload[] } while the
985-
// orchestrator is running. Nullified on every lifecycle transition
986-
// (COMPLETE, FAILED, lease release, stale-run recovery, requeue).
987-
progressClaims Json?
988-
run InvestigationRun?
983+
queuedAt DateTime @default(now())
984+
// Monotonically increasing attempt counter. Incremented atomically when a
985+
// worker claims the lease. Gives each retry a distinct attemptNumber for
986+
// the InvestigationAttempt audit trail.
987+
attemptCount Int @default(0)
988+
// INV-LEASE: The InvestigationLease row exists iff the investigation is
989+
// PROCESSING and has an active lease holder. Structurally prevents
990+
// leaseOwner/leaseExpiresAt without PROCESSING, and vice versa.
991+
lease InvestigationLease?
992+
openAiKeySource InvestigationOpenAiKeySource?
989993
attempts InvestigationAttempt[]
990994
updates Investigation[] @relation("InvestigationUpdateLineage")
991995
claims Claim[]
@@ -1010,34 +1014,35 @@ model InvestigationInput {
10101014
createdAt DateTime @default(now())
10111015
}
10121016
1013-
model InvestigationRun {
1014-
id String @id @default(cuid())
1015-
investigationId String @unique
1016-
investigation Investigation @relation(fields: [investigationId], references: [id], onDelete: Cascade)
1017-
leaseOwner String? // Worker identity holding the run
1018-
leaseExpiresAt DateTime? // When the worker's lease expires
1019-
recoverAfterAt DateTime? // Grace period before another worker can recover
1020-
queuedAt DateTime?
1021-
startedAt DateTime?
1022-
heartbeatAt DateTime?
1023-
openAiKeySource InvestigationOpenAiKeySource?
1024-
createdAt DateTime @default(now())
1025-
updatedAt DateTime @updatedAt
1017+
// INV-LEASE: The existence of an InvestigationLease row means "this
1018+
// investigation is PROCESSING and has an active lease holder". All fields
1019+
// are NOT NULL — structurally prevents partial lease states. The row is
1020+
// deleted on every terminal transition (COMPLETE, FAILED) and on lease
1021+
// release (transient failure → PENDING), so progressClaims is automatically
1022+
// cleaned up without needing sentinel values.
1023+
model InvestigationLease {
1024+
investigationId String @id
1025+
investigation Investigation @relation(fields: [investigationId], references: [id], onDelete: Cascade)
1026+
leaseOwner String
1027+
leaseExpiresAt DateTime
1028+
startedAt DateTime
1029+
heartbeatAt DateTime
1030+
progressClaims Json? // { pending: ClaimPayload[], confirmed: ClaimPayload[] }
1031+
createdAt DateTime @default(now())
10261032
10271033
@@index([leaseExpiresAt])
1028-
@@index([recoverAfterAt])
10291034
}
10301035
10311036
model InvestigationOpenAiKeySource {
1032-
runId String @id
1033-
run InvestigationRun @relation(fields: [runId], references: [id], onDelete: Cascade)
1034-
ciphertext String // AES-256-GCM encrypted user API key
1035-
iv String
1036-
authTag String
1037-
keyId String // Identifies the encryption key version
1038-
expiresAt DateTime // Short-lived lease; worker must start before expiry
1039-
createdAt DateTime @default(now())
1040-
updatedAt DateTime @updatedAt
1037+
investigationId String @id
1038+
investigation Investigation @relation(fields: [investigationId], references: [id], onDelete: Cascade)
1039+
ciphertext String // AES-256-GCM encrypted user API key
1040+
iv String
1041+
authTag String
1042+
keyId String // Identifies the encryption key version
1043+
expiresAt DateTime // Short-lived lease; worker must start before expiry
1044+
createdAt DateTime @default(now())
1045+
updatedAt DateTime @updatedAt
10411046
10421047
@@index([expiresAt])
10431048
}
@@ -1597,21 +1602,18 @@ WITH latest_versions AS (
15971602
SELECT
15981603
lv."postVersionId",
15991604
i."id" AS "investigationId",
1600-
i."status" AS "investigationStatus",
1601-
r."id" AS "runId",
1602-
r."leaseOwner",
1603-
r."leaseExpiresAt",
1604-
r."recoverAfterAt"
1605+
i."status" AS "investigationStatus"
16051606
FROM latest_versions lv
16061607
JOIN "Post" p ON p."id" = lv."postId"
16071608
JOIN "ContentBlob" cb ON cb."id" = lv."contentBlobId"
16081609
LEFT JOIN "Investigation" i ON i."postVersionId" = lv."postVersionId"
1609-
LEFT JOIN "InvestigationRun" r ON r."investigationId" = i."id"
1610+
LEFT JOIN "InvestigationLease" il ON il."investigationId" = i."id"
16101611
WHERE cb."wordCount" <= 10000
16111612
AND (
16121613
i."id" IS NULL -- no investigation yet
1613-
OR i."status" = 'PENDING' -- pending (may need run)
1614-
OR (i."status" = 'PROCESSING' AND ...) -- stuck processing (lease/recovery expired)
1614+
OR i."status" = 'PENDING' -- pending, ready for enqueueing
1615+
OR (i."status" = 'PROCESSING' -- stuck processing (lease expired or missing)
1616+
AND (il."investigationId" IS NULL OR il."leaseExpiresAt" <= NOW()))
16151617
)
16161618
ORDER BY p."uniqueViewScore" DESC
16171619
LIMIT :budget;
@@ -1622,21 +1624,32 @@ which handles idempotent creation of the `Investigation` row and job enqueueing.
16221624

16231625
## 3.7 Job Queue
16241626

1625-
Postgres-backed (graphile-worker or `FOR UPDATE SKIP LOCKED`). No Redis.
1627+
Postgres-backed (graphile-worker). No Redis.
16261628
Used by selector work and all `investigateNow` requests.
1627-
User-key requests attach an encrypted short-lived lease for worker-side credential handoff.
1629+
User-key requests attach an encrypted short-lived key source on the Investigation
1630+
for worker-side credential handoff.
1631+
1632+
Each graphile-worker job is enqueued with `maxAttempts: 1` and a per-investigation
1633+
`jobKey` (`investigate:${investigationId}`). Retry control is managed by the
1634+
application, not graphile-worker: transient failures reclaim the investigation to
1635+
PENDING and explicitly re-enqueue with a backoff delay.
16281636

16291637
```
16301638
Investigation selected (by selector or any investigateNow request)
16311639
→ Upsert investigation for postVersionId (idempotent: one investigation per content version)
16321640
→ If already exists: reuse existing investigation row and do not enqueue duplicate work
1633-
→ Worker picks up job → UPDATE status = PROCESSING
1641+
→ Worker picks up job → claim lease (PENDING → PROCESSING, atomically increment attemptCount)
16341642
→ Worker calls Investigator.investigate()
1635-
→ On success: UPDATE status = COMPLETE
1643+
→ On success: delete lease, UPDATE status = COMPLETE
16361644
→ On failure: classify and retry or fail permanently
16371645
1646+
Retry model:
1647+
- Investigation.attemptCount tracks retries (incremented at lease claim).
1648+
- MAX_INVESTIGATION_ATTEMPTS = 4. When exhausted, the investigation is marked FAILED.
1649+
- Transient retries use exponential backoff: delay = 10s × 2^(attempt - 1).
1650+
16381651
Failure classes:
1639-
TRANSIENT (retry up to 3x with exponential backoff):
1652+
TRANSIENT (reclaim to PENDING, re-enqueue with backoff, up to MAX_INVESTIGATION_ATTEMPTS):
16401653
- Provider 5xx errors, rate limits (429), network timeouts
16411654
NON_RETRYABLE (mark FAILED immediately):
16421655
- Structured output fails Zod validation (likely prompt/schema issue, not transient)
@@ -1645,8 +1658,9 @@ Failure classes:
16451658
PARTIAL (mark FAILED, log partial output for debugging):
16461659
- Provider returns truncated or incomplete tool-call trace
16471660
1648-
If a user-key lease is missing/expired when the worker starts, the run fails and
1649-
requires an explicit user re-request.
1661+
If a user-key source is missing/expired when the worker starts, the investigation
1662+
fails and requires an explicit user re-request. Key sources are consumed (deleted)
1663+
on every terminal transition (COMPLETE, FAILED).
16501664
16511665
`FAILED` is terminal for a given `postVersionId` in v1. Re-running that exact content
16521666
version requires an explicit operator/admin action (e.g., reset status or delete/recreate row),
@@ -1985,11 +1999,11 @@ openerrata/
19851999
│ │ │ │ ├── services/
19862000
│ │ │ │ │ ├── orchestrator.ts # Main investigation orchestrator
19872001
│ │ │ │ │ ├── orchestrator-errors.ts # Error classification
1988-
│ │ │ │ │ ├── run-lease.ts # Atomic lease claim/release + heartbeat
2002+
│ │ │ │ │ ├── investigation-lease.ts # Atomic lease claim/release + heartbeat
19892003
│ │ │ │ │ ├── prompt-context.ts # Post metadata extraction for prompts
19902004
│ │ │ │ │ ├── attempt-audit.ts # Audit record persistence
19912005
│ │ │ │ │ ├── markdown-resolution.ts # Trust-policy-based markdown resolution
1992-
│ │ │ │ │ ├── investigation-lifecycle.ts # Status transitions + progressClaims
2006+
│ │ │ │ │ ├── investigation-lifecycle.ts # Status transitions + lease recovery
19932007
│ │ │ │ │ ├── selector.ts # Investigation selection cron
19942008
│ │ │ │ │ ├── queue.ts # graphile-worker integration
19952009
│ │ │ │ │ ├── queue-lifecycle.ts

src/typescript/api/AGENTS.md

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,10 @@ tRPC handler → `postRouter` or `publicRouter` → Prisma → PostgreSQL.
5454
3. Orchestrator (`src/lib/services/orchestrator.ts`) claims a run lease, calls
5555
the investigator, stores claims + sources, marks COMPLETE or FAILED.
5656
Delegates to focused sub-modules:
57-
- `services/run-lease.ts` — Atomic lease claim/release and heartbeat renewal.
58-
Runs keep `PROCESSING` status throughout; lease expiry is the worker
59-
exclusivity mechanism (not a status reset to `PENDING`).
57+
- `services/investigation-lease.ts` — Atomic lease claim/release and heartbeat
58+
renewal via the `InvestigationLease` table. The row's existence represents
59+
a PROCESSING investigation with a lease holder; lease expiry is the worker
60+
exclusivity mechanism.
6061
- `services/prompt-context.ts` — Extracts post metadata (platform, URL,
6162
author, image occurrences, video flag) from the Prisma query result.
6263
- `services/attempt-audit.ts` — Persists `InvestigationAttempt` records and
@@ -87,11 +88,13 @@ tRPC handler → `postRouter` or `publicRouter` → Prisma → PostgreSQL.
8788

8889
### Failure Classification (spec §3.7)
8990

90-
- **TRANSIENT**: Provider 5xx, 429, network timeouts → graphile-worker retries
91-
with backoff. Investigation stays `PROCESSING`; the run lease is released so
92-
the next worker attempt can claim it. Status is **not** reset to `PENDING`
93-
(doing so would allow the selector/investigateNow to re-enqueue a duplicate
94-
graphile-worker job via jobKey replacement).
91+
- **TRANSIENT**: Provider 5xx, 429, network timeouts → delete lease, reclaim
92+
investigation to `PENDING`, and explicitly re-enqueue with exponential backoff
93+
(`10s × 2^(attempt-1)`). `Investigation.attemptCount` tracks retries; after
94+
`MAX_INVESTIGATION_ATTEMPTS` (4) the investigation is marked FAILED.
95+
The per-investigation `jobKey` (`investigate:${investigationId}`) ensures
96+
concurrent enqueue calls from the re-enqueue, selector, or investigateNow
97+
all resolve to exactly one graphile-worker job.
9598
- **NON_RETRYABLE**: Zod validation failure, content-policy refusal, auth errors
9699
→ mark FAILED immediately, don't rethrow.
97100
- **FAILED is terminal** for a given `postVersionId`. No automatic
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
-- Extract InvestigationLease table from InvestigationRun.
2+
--
3+
-- Instead of merging lease fields into Investigation (with CHECK constraints),
4+
-- this migration creates a separate InvestigationLease table where all fields
5+
-- are NOT NULL. The row's existence represents "this investigation is PROCESSING
6+
-- and has a lease holder" — the representable-valid principle is enforced by
7+
-- the schema itself:
8+
-- - Can't have leaseOwner without leaseExpiresAt (both are NOT NULL)
9+
-- - Can't have lease fields on a non-PROCESSING investigation (no row)
10+
-- - progressClaims lives on the lease row — automatically cleaned up on delete
11+
12+
-- ── Phase 1: Create InvestigationLease table ────────────────────────────────
13+
14+
CREATE TABLE "InvestigationLease" (
15+
"investigationId" TEXT NOT NULL,
16+
"leaseOwner" TEXT NOT NULL,
17+
"leaseExpiresAt" TIMESTAMP(3) NOT NULL,
18+
"startedAt" TIMESTAMP(3) NOT NULL,
19+
"heartbeatAt" TIMESTAMP(3) NOT NULL,
20+
"progressClaims" JSONB,
21+
"createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
22+
CONSTRAINT "InvestigationLease_pkey" PRIMARY KEY ("investigationId"),
23+
CONSTRAINT "InvestigationLease_investigationId_fkey"
24+
FOREIGN KEY ("investigationId") REFERENCES "Investigation"("id") ON DELETE CASCADE ON UPDATE CASCADE
25+
);
26+
27+
CREATE INDEX "InvestigationLease_leaseExpiresAt_idx" ON "InvestigationLease"("leaseExpiresAt");
28+
29+
-- ── Phase 2: Backfill from InvestigationRun + Investigation.progressClaims ──
30+
31+
INSERT INTO "InvestigationLease" ("investigationId", "leaseOwner", "leaseExpiresAt", "startedAt", "heartbeatAt", "progressClaims")
32+
SELECT r."investigationId", r."leaseOwner", r."leaseExpiresAt", r."startedAt", r."heartbeatAt", i."progressClaims"
33+
FROM "InvestigationRun" r
34+
JOIN "Investigation" i ON i."id" = r."investigationId"
35+
WHERE r."leaseOwner" IS NOT NULL
36+
AND r."leaseExpiresAt" IS NOT NULL
37+
AND i."status" = 'PROCESSING';
38+
39+
-- ── Phase 3: Add queuedAt to Investigation ──────────────────────────────────
40+
41+
ALTER TABLE "Investigation" ADD COLUMN "queuedAt" TIMESTAMP(3);
42+
UPDATE "Investigation" i SET "queuedAt" = r."queuedAt" FROM "InvestigationRun" r WHERE r."investigationId" = i."id";
43+
UPDATE "Investigation" SET "queuedAt" = "createdAt" WHERE "queuedAt" IS NULL;
44+
ALTER TABLE "Investigation" ALTER COLUMN "queuedAt" SET NOT NULL;
45+
ALTER TABLE "Investigation" ALTER COLUMN "queuedAt" SET DEFAULT CURRENT_TIMESTAMP;
46+
47+
-- ── Phase 3b: Add attemptCount to Investigation ─────────────────────────────
48+
49+
ALTER TABLE "Investigation" ADD COLUMN "attemptCount" INTEGER NOT NULL DEFAULT 0;
50+
51+
-- ── Phase 3c: Add retryAfter to Investigation ───────────────────────────────
52+
53+
ALTER TABLE "Investigation" ADD COLUMN "retryAfter" TIMESTAMP(3);
54+
55+
-- ── Phase 4: Re-point InvestigationOpenAiKeySource FK ───────────────────────
56+
57+
ALTER TABLE "InvestigationOpenAiKeySource"
58+
ADD COLUMN "investigationId" TEXT;
59+
60+
UPDATE "InvestigationOpenAiKeySource" ks
61+
SET "investigationId" = r."investigationId"
62+
FROM "InvestigationRun" r
63+
WHERE r."id" = ks."runId";
64+
65+
ALTER TABLE "InvestigationOpenAiKeySource"
66+
ALTER COLUMN "investigationId" SET NOT NULL;
67+
68+
ALTER TABLE "InvestigationOpenAiKeySource"
69+
ADD CONSTRAINT "InvestigationOpenAiKeySource_investigationId_fkey"
70+
FOREIGN KEY ("investigationId") REFERENCES "Investigation"("id") ON DELETE CASCADE ON UPDATE CASCADE;
71+
72+
-- Drop old FK and PK
73+
ALTER TABLE "InvestigationOpenAiKeySource"
74+
DROP CONSTRAINT "InvestigationOpenAiKeySource_runId_fkey";
75+
ALTER TABLE "InvestigationOpenAiKeySource"
76+
DROP CONSTRAINT "InvestigationOpenAiKeySource_pkey";
77+
ALTER TABLE "InvestigationOpenAiKeySource"
78+
ADD PRIMARY KEY ("investigationId");
79+
ALTER TABLE "InvestigationOpenAiKeySource"
80+
DROP COLUMN "runId";
81+
82+
-- ── Phase 5: Fix zombies ────────────────────────────────────────────────────
83+
84+
-- Delete expired leases
85+
DELETE FROM "InvestigationLease" WHERE "leaseExpiresAt" <= NOW();
86+
87+
-- PROCESSING without lease row → PENDING
88+
UPDATE "Investigation" SET "status" = 'PENDING'
89+
WHERE "status" = 'PROCESSING'
90+
AND "id" NOT IN (SELECT "investigationId" FROM "InvestigationLease");
91+
92+
-- ── Phase 6: Drop progressClaims from Investigation ─────────────────────────
93+
94+
ALTER TABLE "Investigation" DROP COLUMN IF EXISTS "progressClaims";
95+
96+
-- ── Phase 7: Drop InvestigationRun (explicit, no CASCADE) ───────────────────
97+
98+
-- Explicitly drop all known dependencies first so unexpected ones fail loud
99+
ALTER TABLE "InvestigationRun"
100+
DROP CONSTRAINT IF EXISTS "InvestigationRun_investigationId_fkey";
101+
DROP INDEX IF EXISTS "InvestigationRun_investigationId_key";
102+
DROP INDEX IF EXISTS "InvestigationRun_leaseExpiresAt_idx";
103+
DROP INDEX IF EXISTS "InvestigationRun_recoverAfterAt_idx";
104+
ALTER TABLE "InvestigationRun"
105+
DROP CONSTRAINT IF EXISTS "InvestigationRun_lease_pair_consistency_check";
106+
107+
-- Now drop the table — will fail if any unexpected FK or dependency remains
108+
DROP TABLE "InvestigationRun";

0 commit comments

Comments
 (0)