Skip to content

Commit ecc9222

Browse files
hugbubby and claude
committed
feat(api,extension): video-first mediaState, orchestrator race guard, provenance from snapshot
Align all platform adapters to give has_video precedence over has_images, matching the SPEC: any detected video/iframe causes the post to be skipped even when images are present. LessWrong, X, and Substack adapters were previously images-first and are now corrected.

Other changes:
- Orchestrator: atomic updateMany guard prevents processing an investigation that reached a terminal state between lease claim and start
- Post router: derive provenance from InvestigationInput snapshot for all statuses, not just COMPLETE
- parseProgressClaims: fail-fast on malformed data instead of silent fallback
- page-content-decision: only auto-investigate for NOT_INVESTIGATED; interim snapshots are display data
- CI: add .github/actions/** to path filters, drop Firefox XPI artifact
- README: add Wikipedia to supported platforms table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7427f14 commit ecc9222

17 files changed

Lines changed: 118 additions & 69 deletions

File tree

.github/workflows/deploy.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ jobs:
136136
- 'src/typescript/pnpm-workspace.yaml'
137137
- 'src/typescript/api/Dockerfile'
138138
- '.github/workflows/deploy.yml'
139+
- '.github/actions/**'
139140
140141
decide-deploy:
141142
needs: [resolve-target, detect-changes]

.github/workflows/pr-checks.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ jobs:
3333
- 'src/typescript/**'
3434
- 'src/helm/openerrata/**'
3535
- '.github/workflows/**'
36+
- '.github/actions/**'
3637
3738
- name: Check for promote PR
3839
id: gate

.github/workflows/release-extension.yml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,9 @@ jobs:
9090
chrome_zip="$GITHUB_WORKSPACE/openerrata-extension-chrome-${{ steps.version.outputs.version }}.zip"
9191
chrome_crx="$GITHUB_WORKSPACE/openerrata-extension-chrome-${{ steps.version.outputs.version }}.crx"
9292
firefox_zip="$GITHUB_WORKSPACE/openerrata-extension-firefox-${{ steps.version.outputs.version }}.zip"
93-
firefox_xpi="$GITHUB_WORKSPACE/openerrata-extension-firefox-${{ steps.version.outputs.version }}.xpi"
9493
limit_bytes=5242880
9594
96-
for artifact in "$chrome_zip" "$chrome_crx" "$firefox_zip" "$firefox_xpi"; do
95+
for artifact in "$chrome_zip" "$chrome_crx" "$firefox_zip"; do
9796
bytes="$(stat -c%s "$artifact")"
9897
if [ "$bytes" -gt "$limit_bytes" ]; then
9998
echo "$(basename "$artifact") exceeds 5 MiB: ${bytes} bytes"
@@ -104,7 +103,6 @@ jobs:
104103
- name: Verify Firefox package integrity
105104
run: |
106105
unzip -t "$GITHUB_WORKSPACE/openerrata-extension-firefox-${{ steps.version.outputs.version }}.zip"
107-
unzip -t "$GITHUB_WORKSPACE/openerrata-extension-firefox-${{ steps.version.outputs.version }}.xpi"
108106
109107
- name: Lint Firefox extension package
110108
run: npx -y web-ext@8.9.0 lint --source-dir extension/dist/firefox
@@ -117,7 +115,6 @@ jobs:
117115
${{ github.workspace }}/openerrata-extension-chrome-${{ steps.version.outputs.version }}.zip
118116
${{ github.workspace }}/openerrata-extension-chrome-${{ steps.version.outputs.version }}.crx
119117
${{ github.workspace }}/openerrata-extension-firefox-${{ steps.version.outputs.version }}.zip
120-
${{ github.workspace }}/openerrata-extension-firefox-${{ steps.version.outputs.version }}.xpi
121118
122119
release:
123120
needs: [build-and-test]
@@ -140,7 +137,6 @@ jobs:
140137
openerrata-extension-chrome-${{ needs.build-and-test.outputs.version }}.zip
141138
openerrata-extension-chrome-${{ needs.build-and-test.outputs.version }}.crx
142139
openerrata-extension-firefox-${{ needs.build-and-test.outputs.version }}.zip
143-
openerrata-extension-firefox-${{ needs.build-and-test.outputs.version }}.xpi
144140
generate_release_notes: true
145141

146142
publish:

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ page and change the API URL.
6161
| LessWrong | URL match (`lesswrong.com`) |
6262
| X (Twitter) | URL match (`x.com`, `twitter.com`) |
6363
| Substack | URL match (`*.substack.com/p/*`) + DOM fingerprint for custom domains |
64+
| Wikipedia | URL match (`*.wikipedia.org/wiki/*`, `*.wikipedia.org/w/index.php*`) |
6465

6566
## Public API
6667

SPEC.md

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ misleading.
6161

6262
v1 ships fact-checking for posts on LessWrong, X, Substack, and Wikipedia.
6363
Investigations use post text plus attached images when available. Posts classified
64-
as `has_video` (video/iframe detected and no extracted images) are skipped.
64+
as `has_video` (any detected video/iframe embed) are skipped, even when text and
65+
images are also present.
6566

6667
Users can trigger investigations in two ways (both async queued):
6768

@@ -181,7 +182,7 @@ Four key design decisions:
181182
4. **Investigations are multimodal for images.** When image attachments are
182183
available, we include image URLs in the model input (`input_image`)
183184
alongside text. Video is not analyzed in v1; posts classified as
184-
`has_video` (video/iframe detected and no extracted images) are visibly
185+
`has_video` (any detected video/iframe embed) are visibly
185186
skipped.
186187
5. **Edited posts use incremental updates.** If a post is edited and a prior
187188
complete `SERVER_VERIFIED` investigation exists, we run an "update investigation"
@@ -279,9 +280,8 @@ The prompt principles (exact wording TBD):
279280
system and will be selectively highlighted.
280281
- **Claims must remain text-grounded.** Even when images are provided to the investigator, flagged
281282
claims must still be exact verbatim quotes from the post text so DOM matching remains reliable.
282-
- **Video is non-analyzable in v1.** Posts classified as `has_video` (video/iframe detected and no
283-
extracted images) are skipped even when text is present. When extracted images are present, the
284-
post is investigated using text + images, and video remains unanalyzed.
283+
- **Video is non-analyzable in v1.** Posts classified as `has_video` (any detected video/iframe
284+
embed) are skipped even when text is present and even when extracted images are present.
285285

286286
## 2.5 User Interface
287287

@@ -1606,7 +1606,7 @@ interface PlatformContent {
16061606
externalId: string;
16071607
url: string;
16081608
contentText: string; // Client-observed normalized plain text; must be non-empty
1609-
mediaState: "text_only" | "has_images" | "has_video"; // "has_video" means video/iframe detected and imageUrls is empty; text may still be present.
1609+
mediaState: "text_only" | "has_images" | "has_video"; // Precedence: "has_video" if any video/iframe is detected; otherwise "has_images" when imageUrls is non-empty; otherwise "text_only".
16101610
imageUrls: string[];
16111611
imageOccurrences?: ImageOccurrence[]; // Positional image data; sent to API as observedImageOccurrences
16121612
metadata: Record<string, unknown>;
@@ -1653,9 +1653,9 @@ DOM manipulation is reliable.
16531653
5. Extract image URLs (`<img src>`), filter malformed/data URLs, and compute `mediaState`.
16541654
6. Send `{ platform: "LESSWRONG", externalId, url, metadata.htmlContent, observedImageUrls? }` to background worker.
16551655

1656-
**Media behavior:** Posts with images are investigated. Posts detected as private/gated are
1657-
skipped (`reason: "private_or_gated"`). Among public posts, only `has_video` posts
1658-
(video/iframe without images, even when text is present) are skipped.
1656+
**Media behavior:** Posts with images and no video are investigated. Posts detected as private/gated
1657+
are skipped (`reason: "private_or_gated"`). Among public posts, any `has_video` post
1658+
(video/iframe detected, even when images and/or text are present) is skipped.
16591659

16601660
**React reconciliation:** LessWrong uses React 16+. Use `MutationObserver` to detect re-renders and
16611661
re-apply annotations. Store annotations in extension state, not DOM.
@@ -1669,9 +1669,9 @@ X uses a React SPA with aggressive DOM recycling.
16691669
3. Target `[data-testid="tweetText"]`. Acknowledge this selector is fragile and may need
16701670
maintenance.
16711671

1672-
**Media behavior:** Extract image URLs separately from video detection. Investigate image posts.
1673-
Skip private/protected tweets (`reason: "private_or_gated"`). Among accessible tweets, skip
1674-
only `has_video` tweets (video present, no extracted images, even when tweet text exists).
1672+
**Media behavior:** Extract image URLs separately from video detection. Investigate image-only
1673+
tweets. Skip private/protected tweets (`reason: "private_or_gated"`). Among accessible tweets, skip
1674+
any `has_video` tweet (video present, even when extracted images and/or tweet text exist).
16751675

16761676
## 3.11 Substack Content Script
16771677

@@ -1712,7 +1712,7 @@ JavaScript globals (`mw.config`).
17121712
6. Wikipedia articles have no single author; no `Author` row is linked.
17131713

17141714
**Media behavior:** Same rules as other platforms — extract image URLs and
1715-
occurrences from the article body; `has_video` articles are skipped.
1715+
occurrences from the article body; any `has_video` article is skipped.
17161716

17171717
## 3.13 Extension Manifest (v3)
17181718

src/typescript/api/prisma/migrations/0022_breaking_release_schema_tightening/migration.sql

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
--
33
-- 1. Drop unused PostVersionViewCredit table.
44
-- 2. Convert InvestigationInput.provenance and .markdownSource from TEXT to Prisma enums.
5-
-- 3. Add missing HTML blob presence CHECK constraints on SubstackVersionMeta and WikipediaVersionMeta.
5+
-- 3. Document why HTML blob presence CHECK constraints are intentionally not added for
6+
-- SubstackVersionMeta and WikipediaVersionMeta.
67

78
-- ── 1. Drop PostVersionViewCredit ───────────────────────────────────────────
89

src/typescript/api/src/lib/services/orchestrator.ts

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,10 +268,19 @@ export async function orchestrateInvestigation(
268268
// When re-claiming a PROCESSING investigation with an expired lease
269269
// (stale worker takeover), the dead worker's progressClaims would
270270
// otherwise persist and be visible to clients until overwritten.
271-
await prisma.investigation.update({
272-
where: { id: investigation.id },
271+
const transitionedToProcessing = await prisma.investigation.updateMany({
272+
where: {
273+
id: investigation.id,
274+
OR: [{ status: "PENDING" }, { status: "PROCESSING" }],
275+
},
273276
data: { status: "PROCESSING", progressClaims: CLEARED_PROGRESS_CLAIMS },
274277
});
278+
if (transitionedToProcessing.count === 0) {
279+
logger.info(
280+
`Investigation ${investigation.id} moved to terminal state before processing began; skipping`,
281+
);
282+
return;
283+
}
275284

276285
const heartbeat = startRunHeartbeat(run.id, options.workerIdentity, logger);
277286

src/typescript/api/src/lib/trpc/routes/post.ts

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -335,40 +335,41 @@ export const postRouter = router({
335335
},
336336
});
337337

338-
switch (investigation.status) {
339-
case "COMPLETE": {
340-
const completed = await loadInvestigationWithClaims(
341-
prismaInvestigationRepository(ctx.prisma),
342-
investigation.id,
343-
);
344-
if (!completed) {
345-
throw new TRPCError({
346-
code: "INTERNAL_SERVER_ERROR",
347-
message: `Investigation ${investigation.id} disappeared after completion lookup`,
348-
});
349-
}
338+
const loadedInvestigation = await loadInvestigationWithClaims(
339+
prismaInvestigationRepository(ctx.prisma),
340+
investigation.id,
341+
);
342+
if (!loadedInvestigation) {
343+
throw new TRPCError({
344+
code: "INTERNAL_SERVER_ERROR",
345+
message: `Investigation ${investigation.id} disappeared after ensureInvestigationsWithUpdateMetadata`,
346+
});
347+
}
348+
349+
const provenance = parseProvenance({
350+
investigationId: loadedInvestigation.id,
351+
snapshot: loadedInvestigation.input,
352+
});
350353

354+
switch (loadedInvestigation.status) {
355+
case "COMPLETE": {
351356
return {
352-
investigationId: completed.id,
353-
status: completed.status,
354-
provenance: parseProvenance({
355-
investigationId: completed.id,
356-
snapshot: completed.input,
357-
}),
358-
claims: formatClaims(completed.claims),
357+
investigationId: loadedInvestigation.id,
358+
status: loadedInvestigation.status,
359+
provenance,
360+
claims: formatClaims(loadedInvestigation.claims),
359361
};
360362
}
361363
case "PENDING":
362364
case "PROCESSING":
363365
case "FAILED":
364366
return {
365-
investigationId: investigation.id,
366-
status: investigation.status,
367-
provenance:
368-
postVersion.serverVerifiedAt !== null ? "SERVER_VERIFIED" : "CLIENT_FALLBACK",
367+
investigationId: loadedInvestigation.id,
368+
status: loadedInvestigation.status,
369+
provenance,
369370
};
370371
default:
371-
return unreachableInvestigationStatus(investigation.status);
372+
return unreachableInvestigationStatus(loadedInvestigation.status);
372373
}
373374
} catch (error) {
374375
if (error instanceof InvestigationWordLimitError) {

src/typescript/api/src/lib/trpc/routes/post/investigation-queries.ts

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -269,13 +269,10 @@ export function parseProgressClaims(raw: unknown): {
269269
}
270270
const result = progressClaimsDbSchema.safeParse(raw);
271271
if (!result.success) {
272-
console.warn(
273-
"progressClaims failed schema validation:",
274-
result.error.message,
275-
"raw:",
276-
JSON.stringify(raw).slice(0, 200),
277-
);
278-
return { pendingClaims: [], confirmedClaims: [] };
272+
throw new TRPCError({
273+
code: "INTERNAL_SERVER_ERROR",
274+
message: `progressClaims failed schema validation: ${result.error.message}`,
275+
});
279276
}
280277
return { pendingClaims: result.data.pending, confirmedClaims: result.data.confirmed };
281278
}

src/typescript/api/test/unit/post-investigation-queries.test.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
investigationQueriesInternals,
99
loadInvestigationWithClaims,
1010
maybeRecordCorroboration,
11+
parseProgressClaims,
1112
requireCompleteCheckedAtIso,
1213
selectSourceInvestigationForUpdate,
1314
toPriorInvestigationResult,
@@ -139,6 +140,13 @@ test("unreachableInvestigationStatus throws explicit internal error", () => {
139140
);
140141
});
141142

143+
test("parseProgressClaims fails fast on malformed progress payload", () => {
144+
assert.throws(
145+
() => parseProgressClaims({ pending: ["bad"], confirmed: [] }),
146+
/progressClaims failed schema validation/,
147+
);
148+
});
149+
142150
test("load and lookup helpers delegate to repository methods", async () => {
143151
const repo = nullRepo();
144152
assert.equal(await loadInvestigationWithClaims(repo, "inv-1"), null);

0 commit comments

Comments
 (0)