Skip to content

Commit bb48b68

Browse files
authored
feat: respect robots.txt before fetching pages (#8)
Production-grade crawlers check robots.txt. deepdive's per-query fetch volume is low (~12 URLs) but it's still the polite thing; sites with explicit scraper deny rules shouldn't be surprised. Behavior: - Before every agent.fetchOne, we check <scheme>://<host>/robots.txt with User-Agent "deepdive-bot" (configurable via AgentConfig.robotsUserAgent). - On "deny", skip the URL + emit a new fetch.skipped event so --verbose output shows the skip reason. - On "allow" or "unknown", proceed as before. - robots.txt content is cached in-memory per run (one GET per origin). - Network errors fetching robots.txt err on the side of "fetch" rather than "deny" — publishers who care have working robots.txt. Opt-out: --ignore-robots / DEEPDIVE_IGNORE_ROBOTS=1 bypasses the check entirely (for operators with their own relationship to the target). Parser supports: User-agent blocks (case-insensitive substring match, exact agent beats *), Disallow + Allow with longest-prefix wins (ties go to Allow per RFC 9309), empty Disallow = allow everything, wildcard * in paths, $ end-anchor, Crawl-delay field, # comments. Tests: 17 new assertions (12 parser unit, 5 canFetch integration, 2 CLI). 198 total. Co-authored-by: askalf <263217947+askalf@users.noreply.github.com>
1 parent a942e41 commit bb48b68

7 files changed

Lines changed: 482 additions & 0 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
1515
- `retry(fn, opts)` helper (`src/retry.ts`) exported for library reuse — injectable `sleep`/`random` for deterministic tests, `shouldRetry` predicate, `onRetry` hook, abort-signal aware.
1616
- Tests: 18 doctor assertions + 12 retry helper + 9 LLM-retry integration + 14 streaming (6 parseBlocks unit, 4 parseSSE reader, 4 callLLMStream mock-server integration) + 5 error-scrub + new CLI/config cases. 180 total, up from 96.
1717

18+
- **robots.txt respect.** Before every page fetch, deepdive now checks the site's `robots.txt` with User-Agent `deepdive-bot` (configurable via `AgentConfig.robotsUserAgent`). Disallowed URLs are skipped with a `fetch.skipped` event instead of fetched. `--ignore-robots` / `DEEPDIVE_IGNORE_ROBOTS=1` bypasses for cases where the operator has their own relationship with the target. robots.txt content is cached in-memory per run (one GET per origin, i.e. per `<scheme>://<host>`). Network errors on the robots.txt fetch err on the side of "fetch" rather than "deny" — publishers who care have working robots.txt.
19+
- New exports: `canFetch`, `createRobotsCache`, `parseRobotsTxt`, `isPathAllowed`, `DEFAULT_USER_AGENT`. 17 new assertions (15 robots unit + 2 CLI).
20+
1821
### Security
1922
- Addressed 7 CodeQL high-severity alerts: polynomial-ReDoS risks on URL trim/fragment strip regexes replaced with non-regex string walks in a new `src/url-util.ts`; tightened DuckDuckGo hostname match to rule out `evil-duckduckgo.com`-style spoofs; single-pass HTML entity decoder fixes the `&amp;#39;` double-unescape; defensive `stripTags` now also drops stray `<` so malformed partial tags can't leak a tag opener downstream.
2023
- Home-dir scrubbing on all CLI error messages. `safeErrorMessage` (exported for library reuse) runs every user-facing error through `scrubPath` before printing, so a Playwright ENOENT, an LLM 500 echoing back a path from the request body, or any other downstream error can't include `/home/alice/...` or `C:\Users\alice\...` in the output a user would paste into a bug report. Library consumers of `runAgent` still get raw errors for debugging.

src/agent.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ import { buildSourceTable, renderAnswerMarkdown, type Source } from "./citations
1818
import { synthesize, type SourceWithContent } from "./synthesize.js";
1919
import type { PageCache } from "./cache.js";
2020
import { runConcurrent } from "./concurrency.js";
21+
import {
22+
canFetch,
23+
DEFAULT_USER_AGENT,
24+
type RobotsCache,
25+
} from "./robots.js";
2126

2227
// Minimal surface the agent needs from a browser. BrowserSession satisfies it;
2328
// tests pass a mock with the same shape.
@@ -38,6 +43,12 @@ export interface AgentConfig {
3843
concurrency: number;
3944
cache?: PageCache;
4045
browserFactory?: (opts: BrowserOptions) => BrowserLike;
46+
// Respect robots.txt when true (default). Network errors fetching robots
47+
// return "unknown" and we err on the side of fetching. --ignore-robots in
48+
// the CLI flips this to false.
49+
respectRobots?: boolean;
50+
robotsUserAgent?: string;
51+
robotsCache?: RobotsCache;
4152
onEvent?: (event: AgentEvent) => void;
4253
// Fires for each SSE token emitted by the synthesizer. When set, the agent
4354
// uses the streaming LLM path for synthesize() calls. CLI callers enable
@@ -61,6 +72,7 @@ export type AgentEvent =
6172
words: number;
6273
cached: boolean;
6374
}
75+
| { type: "fetch.skipped"; url: string; reason: "robots" }
6476
| { type: "synthesize.start"; sourceCount: number; round: number }
6577
| { type: "synthesize.done"; round: number }
6678
| { type: "critique.start"; round: number }
@@ -284,6 +296,17 @@ async function fetchOne(
284296
config: AgentConfig,
285297
ensureBrowser: () => Promise<BrowserLike>,
286298
): Promise<FetchOutcome | null> {
299+
if (config.respectRobots !== false) {
300+
const ua = config.robotsUserAgent ?? DEFAULT_USER_AGENT;
301+
const result = await canFetch(c.url, {
302+
userAgent: ua,
303+
cache: config.robotsCache,
304+
});
305+
if (result === "deny") {
306+
emit(config, { type: "fetch.skipped", url: c.url, reason: "robots" });
307+
return null;
308+
}
309+
}
287310
if (config.cache) {
288311
const cached = await config.cache.get(c.url);
289312
if (cached) {

src/cli.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ Flags:
5252
--concurrency=<n> Parallel fetches. Default: 4
5353
--no-cache Disable the on-disk page cache (default: enabled)
5454
--cache-ttl-ms=<ms> Page cache TTL. Default: 3600000 (1 hour)
55+
--ignore-robots Bypass robots.txt checks (default: respect them)
5556
--json Emit a JSON result to stdout instead of markdown
5657
--out=<path> Write the output (markdown or json) to a file too
5758
--verbose, -v Stream progress events to stderr
@@ -96,6 +97,10 @@ export function parseArgs(argv: string[]): ParsedArgs {
9697
flags.noCache = true;
9798
continue;
9899
}
100+
if (a === "--ignore-robots") {
101+
flags.ignoreRobots = true;
102+
continue;
103+
}
99104
if (a === "--json") {
100105
flags.json = true;
101106
continue;
@@ -208,6 +213,8 @@ function renderEvent(e: AgentEvent): string {
208213
return ` fetch ${e.cached ? "(cached) " : ""}${e.url}`;
209214
case "fetch.done":
210215
return ` ${e.ok ? "OK " : "!! "}${e.status} · ${e.words} words${e.cached ? " · cache" : ""}`;
216+
case "fetch.skipped":
217+
return ` fetch skipped (${e.reason}) ${e.url}`;
211218
case "synthesize.start":
212219
return ` synth round ${e.round} · ${e.sourceCount} source${e.sourceCount === 1 ? "" : "s"}`;
213220
case "synthesize.done":
@@ -303,6 +310,7 @@ async function main(argv: string[]): Promise<number> {
303310
deepRounds: config.deepRounds,
304311
concurrency: config.concurrency,
305312
cache,
313+
respectRobots: config.respectRobots,
306314
onEvent: (e) => {
307315
if (config.verbose) process.stderr.write(renderEvent(e) + "\n");
308316
},

src/config.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ export interface RuntimeConfig {
1616
deepRounds: number;
1717
concurrency: number;
1818
cache: { enabled: boolean; dir: string; ttlMs: number };
19+
respectRobots: boolean;
1920
jsonOutput: boolean;
2021
streamEnabled: boolean;
2122
verbose: boolean;
@@ -37,6 +38,7 @@ export interface CLIFlags {
3738
concurrency?: number;
3839
noCache?: boolean;
3940
cacheTtlMs?: number;
41+
ignoreRobots?: boolean;
4042
json?: boolean;
4143
noStream?: boolean;
4244
verbose?: boolean;
@@ -130,6 +132,8 @@ export function resolveConfig(
130132
parsePositiveInt(env.DEEPDIVE_CACHE_TTL_MS) ??
131133
DEFAULTS.cacheTtlMs;
132134

135+
const respectRobots =
136+
!(flags.ignoreRobots ?? env.DEEPDIVE_IGNORE_ROBOTS === "1");
133137
const jsonOutput = flags.json ?? env.DEEPDIVE_JSON === "1";
134138
const streamOptOut = flags.noStream ?? env.DEEPDIVE_NO_STREAM === "1";
135139
// Streaming is on by default but gets auto-disabled for:
@@ -161,6 +165,7 @@ export function resolveConfig(
161165
deepRounds,
162166
concurrency,
163167
cache: { enabled: cacheEnabled, dir: cacheDir, ttlMs: cacheTtlMs },
168+
respectRobots,
164169
jsonOutput,
165170
streamEnabled,
166171
verbose,

src/robots.ts

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
// robots.txt support — per-host fetch + parse + per-URL allow/deny check.
2+
//
3+
// Production-grade crawlers respect robots.txt. deepdive's fetch volume is
4+
// low (~12 URLs per query) but it's still the polite thing; sites with
5+
// explicit scraper deny rules shouldn't be surprised. --ignore-robots is
6+
// provided for operators who know what they're doing.
7+
//
8+
// Cache is per-run (in-memory) and keyed by `<scheme>://<host>`. We don't
9+
// persist to disk because the expected hit count per run is small and
10+
// robots.txt content can change rapidly on the publisher's end.
11+
12+
export interface RobotsRule {
13+
allow: boolean;
14+
path: string;
15+
}
16+
17+
export interface ParsedRobots {
18+
// Rules in file order. Path matching picks the longest-matching rule
19+
// (ties broken by Allow winning over Disallow per RFC 9309).
20+
rules: RobotsRule[];
21+
crawlDelaySec?: number;
22+
}
23+
24+
export type RobotsCheckResult = "allow" | "deny" | "unknown";
25+
26+
export interface RobotsCache {
27+
get(origin: string): ParsedRobots | null | undefined;
28+
set(origin: string, parsed: ParsedRobots | null): void;
29+
}
30+
31+
export interface CanFetchOptions {
32+
userAgent: string;
33+
cache?: RobotsCache;
34+
fetchImpl?: typeof fetch;
35+
timeoutMs?: number;
36+
signal?: AbortSignal;
37+
}
38+
39+
export const DEFAULT_USER_AGENT = "deepdive-bot";
40+
41+
// Build an in-memory RobotsCache backed by a Map keyed on origin.
// `get` returns undefined for origins that were never stored, which lets
// callers tell "never looked up" apart from a stored `null`.
export function createRobotsCache(): RobotsCache {
  const entries = new Map<string, ParsedRobots | null>();
  return {
    get(origin) {
      return entries.get(origin);
    },
    set(origin, parsed) {
      entries.set(origin, parsed);
    },
  };
}
48+
49+
// robots.txt lookups currently in progress, grouped by the cache they will
// populate. Concurrent canFetch() calls for the same origin share one request
// instead of each issuing their own GET before the first result is stored.
const inFlightRobots = new WeakMap<
  RobotsCache,
  Map<string, Promise<ParsedRobots | null>>
>();

// Decide whether `url` may be fetched according to its origin's robots.txt.
//
// Returns:
//   "allow"   — the rules permit the path, or the URL is not http(s), or the
//               URL string cannot be parsed at all.
//   "deny"    — a matching rule disallows the path for `opts.userAgent`.
//   "unknown" — robots.txt could not be retrieved (fetchAndParse gave null).
//
// When `opts.cache` is supplied, the parsed result (including a null failure)
// is stored under `<scheme>://<host>` and reused. Without a cache every call
// fetches robots.txt again.
export async function canFetch(
  url: string,
  opts: CanFetchOptions,
): Promise<RobotsCheckResult> {
  let origin: string;
  let path: string;
  try {
    const u = new URL(url);
    if (u.protocol !== "http:" && u.protocol !== "https:") return "allow";
    origin = `${u.protocol}//${u.host}`;
    // Rules are matched against path + query string.
    path = u.pathname + u.search;
  } catch {
    return "allow";
  }

  const parsed = await loadRobots(origin, opts);
  if (parsed === null) return "unknown"; // couldn't reach robots.txt
  return isPathAllowed(parsed, path, opts.userAgent) ? "allow" : "deny";
}

// Resolve the parsed robots.txt for `origin`: cache hit first, then an
// already-running lookup for the same cache + origin, then a fresh fetch.
async function loadRobots(
  origin: string,
  opts: CanFetchOptions,
): Promise<ParsedRobots | null> {
  const cache = opts.cache;
  if (!cache) return fetchAndParse(origin, opts);

  const cached = cache.get(origin);
  if (cached !== undefined) return cached;

  const pending =
    inFlightRobots.get(cache) ??
    new Map<string, Promise<ParsedRobots | null>>();
  inFlightRobots.set(cache, pending);

  const running = pending.get(origin);
  // Followers reuse the leader's request, including its signal and timeout.
  if (running) return running;

  const lookup = (async () => {
    try {
      const parsed = await fetchAndParse(origin, opts);
      cache.set(origin, parsed);
      return parsed;
    } finally {
      // Runs after the await above, i.e. after pending.set() below.
      pending.delete(origin);
    }
  })();
  pending.set(origin, lookup);
  return lookup;
}
74+
75+
// GET `${origin}/robots.txt` and parse it.
//
// Returns:
//   - the parsed rules on a response with status < 400,
//   - an empty rule set (`{ rules: [] }`, i.e. nothing is disallowed) on any
//     status >= 400,
//   - null when the request throws (network error, timeout, abort).
//
// The request is aborted after `opts.timeoutMs` (default 5000 ms) or when
// `opts.signal` fires, whichever comes first.
async function fetchAndParse(
  origin: string,
  opts: CanFetchOptions,
): Promise<ParsedRobots | null> {
  const fetchImpl = opts.fetchImpl ?? fetch;
  const timeout = AbortSignal.timeout(opts.timeoutMs ?? 5_000);
  const signal = opts.signal
    ? AbortSignal.any([opts.signal, timeout])
    : timeout;
  try {
    const res = await fetchImpl(`${origin}/robots.txt`, {
      headers: { "user-agent": opts.userAgent },
      signal,
    });
    // RFC 9309 says 4xx means "no restrictions" and 5xx should be treated as
    // a full disallow. We deliberately stay permissive for 5xx as well: it is
    // often transient and a flaky publisher server should not lock out a run.
    // Callers can pass --ignore-robots to bypass robots entirely.
    if (res.status >= 400) {
      // We never read this body; cancel it so the underlying connection is
      // released promptly instead of waiting for garbage collection.
      try {
        await res.body?.cancel();
      } catch {
        // Best-effort cleanup only; the result does not depend on it.
      }
      return { rules: [] };
    }
    return parseRobotsTxt(await res.text());
  } catch {
    return null;
  }
}
104+
105+
// Exported for unit tests.
//
// Parse robots.txt text into a ParsedRobots.
//
// - Lines are split on CRLF, lone CR, or LF (all valid per RFC 9309).
// - `#` comments are stripped; lines that are not `key: value` are ignored.
// - Consecutive User-agent lines share one group; a User-agent line that
//   follows an Allow/Disallow starts a new group.
// - Allow/Disallow lines seen before any User-agent line are dropped.
// - Crawl-delay is recorded file-wide (last valid value wins); it is not
//   tracked per user-agent group.
//
// The returned object also carries a `_grouped` array (one entry per
// agent/rule pair, agent lower-cased) that isPathAllowed() reads to select
// the rules for a given user-agent. `rules` is the same list without agents.
export function parseRobotsTxt(text: string): ParsedRobots {
  type GroupedRule = { agent: string; allow: boolean; path: string };
  const grouped: GroupedRule[] = [];
  let currentAgents: string[] = [];
  let sawRuleThisGroup = false;
  let crawlDelay: number | undefined;

  for (const rawLine of text.split(/\r\n|\r|\n/)) {
    const line = stripComment(rawLine).trim();
    if (!line) continue;
    const match = /^([a-zA-Z-]+)\s*:\s*(.*)$/.exec(line);
    if (!match) continue;
    const key = (match[1] ?? "").toLowerCase();
    const value = (match[2] ?? "").trim();

    if (key === "user-agent") {
      if (sawRuleThisGroup) {
        // A rule has already been seen, so this User-agent opens a new group.
        currentAgents = [];
        sawRuleThisGroup = false;
      }
      currentAgents.push(value.toLowerCase());
    } else if (key === "disallow" || key === "allow") {
      sawRuleThisGroup = true;
      for (const agent of currentAgents) {
        grouped.push({ agent, allow: key === "allow", path: value });
      }
    } else if (key === "crawl-delay") {
      const n = Number(value);
      if (Number.isFinite(n) && n >= 0) crawlDelay = n;
    }
  }

  // `_grouped` is not part of the exported ParsedRobots shape; it rides along
  // as an extra property so the agent association survives until check time.
  const parsed: ParsedRobots & { _grouped: GroupedRule[] } = {
    rules: grouped.map(({ allow, path }) => ({ allow, path })),
    crawlDelaySec: crawlDelay,
    _grouped: grouped,
  };
  return parsed;
}
157+
158+
// Exported for unit tests.
//
// Return true when `path` may be fetched by `userAgent` under `parsed`.
//
// Rule source: the agent-tagged `_grouped` list that parseRobotsTxt()
// attaches. If it is absent (a ParsedRobots built by hand from the exported
// type), `parsed.rules` is applied as if it belonged to the `*` group rather
// than being ignored.
//
// Group selection: every rule whose agent token is a non-empty substring of
// the lower-cased `userAgent`; if there are none, the `*` rules; if there are
// still none, everything is allowed.
//
// Rule selection: the longest matching pattern wins, and on equal length
// Allow beats Disallow. An empty Disallow grants access unless a non-empty
// pattern matches; an empty Allow is ignored.
export function isPathAllowed(
  parsed: ParsedRobots,
  path: string,
  userAgent: string,
): boolean {
  type AgentRule = { agent: string; allow: boolean; path: string };
  const tagged = (parsed as ParsedRobots & { _grouped?: AgentRule[] })._grouped;
  const grouped: AgentRule[] =
    tagged ??
    (parsed.rules ?? []).map((r) => ({
      agent: "*",
      allow: r.allow,
      path: r.path,
    }));
  if (grouped.length === 0) return true;
  const ua = userAgent.toLowerCase();

  // Prefer rules addressed to this agent; fall back to '*'.
  let applicable = grouped.filter(
    (g) => g.agent !== "" && ua.includes(g.agent),
  );
  if (applicable.length === 0) {
    applicable = grouped.filter((g) => g.agent === "*");
  }
  if (applicable.length === 0) return true;

  // Track the longest matching rule. Tie → allow wins (RFC 9309).
  let bestLen = -1;
  let bestAllow = true;
  for (const rule of applicable) {
    if (!rule.path) {
      // Empty Disallow means "allow everything"; empty Allow is a no-op.
      if (!rule.allow && bestLen < 0) {
        bestLen = 0;
        bestAllow = true;
      }
      continue;
    }
    if (!pathMatches(rule.path, path)) continue;
    const longer = rule.path.length > bestLen;
    const tieGoesToAllow = rule.path.length === bestLen && rule.allow;
    if (longer || tieGoesToAllow) {
      bestLen = rule.path.length;
      bestAllow = rule.allow;
    }
  }
  return bestLen < 0 ? true : bestAllow;
}
195+
196+
// Test a robots.txt path pattern against a URL path.
//
// Supported syntax: `*` matches any run of characters (including none) and a
// trailing `$` anchors the match to the end of the path. A `$` anywhere else
// is a literal character. Without a trailing `$` the pattern only has to
// match a prefix of the path.
//
// Patterns come from remote robots.txt files, so they are matched with a
// plain string walk rather than compiled to a RegExp: a pattern with many
// `*` would otherwise become a chain of `.*` groups that can backtrack
// heavily on a long path.
function pathMatches(pattern: string, path: string): boolean {
  const anchored = pattern.endsWith("$");
  const glob = anchored ? pattern.slice(0, -1) : pattern;

  if (!glob.includes("*")) {
    return anchored ? path === glob : path.startsWith(glob);
  }

  // Literal pieces between wildcards. The first must be a prefix, the middle
  // ones must appear in order (leftmost match is always safe for `*`), and
  // the last must either appear after them or, when anchored, be the suffix.
  const pieces = glob.split("*");
  const head = pieces[0] ?? "";
  const tail = pieces[pieces.length - 1] ?? "";

  if (!path.startsWith(head)) return false;
  let pos = head.length;

  for (const piece of pieces.slice(1, -1)) {
    if (piece === "") continue; // consecutive '*' collapse
    const at = path.indexOf(piece, pos);
    if (at === -1) return false;
    pos = at + piece.length;
  }

  if (!anchored) {
    return tail === "" || path.indexOf(tail, pos) !== -1;
  }
  // Anchored: the tail must end the path without overlapping what the
  // earlier pieces already consumed.
  return path.length - tail.length >= pos && path.endsWith(tail);
}
217+
218+
// Drop everything from the first '#' to the end of the line. Lines without
// a '#' are returned unchanged.
function stripComment(s: string): string {
  const hashAt = s.indexOf("#");
  if (hashAt < 0) return s;
  return s.slice(0, hashAt);
}

test/parse-args.test.mjs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ test("parseArgs: --no-cache flag", () => {
9696
assert.equal(p.flags.noCache, true);
9797
});
9898

99+
test("parseArgs: --ignore-robots flag", () => {
100+
const p = parseArgs(["q", "--ignore-robots"]);
101+
assert.equal(p.flags.ignoreRobots, true);
102+
});
103+
99104
test("parseArgs: --json flag", () => {
100105
const p = parseArgs(["q", "--json"]);
101106
assert.equal(p.flags.json, true);

0 commit comments

Comments
 (0)