|
// robots.txt support — per-host fetch + parse + per-URL allow/deny check.
//
// Production-grade crawlers respect robots.txt. deepdive's fetch volume is
// low (~12 URLs per query) but it's still the polite thing; sites with
// explicit scraper deny rules shouldn't be surprised. --ignore-robots is
// provided for operators who know what they're doing.
//
// Cache is per-run (in-memory) and keyed by `<scheme>://<host>`. We don't
// persist to disk because the expected hit count per run is small and
// robots.txt content can change rapidly on the publisher's end.
| 11 | + |
/** One `Allow:` / `Disallow:` directive read from a robots.txt file. */
export interface RobotsRule {
  /** Path pattern as written after the directive's colon, whitespace-trimmed. */
  path: string;
  /** `true` for an `Allow:` line, `false` for a `Disallow:` line. */
  allow: boolean;
}
| 16 | + |
/**
 * Parsed form of a single robots.txt document.
 *
 * NOTE: objects produced by `parseRobotsTxt` additionally carry an undeclared
 * `_grouped` property (each rule tagged with its owning user-agent).
 * `isPathAllowed` reads that property rather than `rules`, so a hand-built
 * value without it is treated as "no restrictions".
 */
export interface ParsedRobots {
  /**
   * Last valid (finite, non-negative) `Crawl-delay` value found anywhere in
   * the file, in seconds. Not scoped to a particular user-agent group.
   */
  crawlDelaySec?: number;
  /**
   * Rules in file order. Path matching picks the longest-matching rule
   * (ties broken by Allow winning over Disallow per RFC 9309).
   */
  rules: RobotsRule[];
}
| 23 | + |
/**
 * Outcome of a robots.txt check for one URL:
 * - `"allow"`   — no rule forbids the URL (also returned for non-http(s) or
 *                 unparseable URLs);
 * - `"deny"`    — the best-matching rule is a Disallow;
 * - `"unknown"` — robots.txt could not be retrieved.
 */
export type RobotsCheckResult =
  | "allow"
  | "deny"
  | "unknown";
| 25 | + |
/**
 * Storage for parsed robots.txt results, keyed by origin
 * (`<scheme>://<host>`).
 */
export interface RobotsCache {
  /**
   * Looks up an origin. `undefined` means "never stored"; `null` means a
   * previous fetch attempt failed and that failure was stored.
   */
  get(origin: string): ParsedRobots | null | undefined;
  /** Stores the result for an origin; pass `null` to record a failed fetch. */
  set(origin: string, parsed: ParsedRobots | null): void;
}
| 30 | + |
/** Options accepted by `canFetch`. */
export interface CanFetchOptions {
  /**
   * Sent as the `user-agent` header when requesting robots.txt and used to
   * select which user-agent groups apply.
   */
  userAgent: string;
  /** Optional result cache; without it every call re-fetches robots.txt. */
  cache?: RobotsCache;
  /** Replacement for the global `fetch` (e.g. for tests). */
  fetchImpl?: typeof fetch;
  /** Timeout for the robots.txt request in milliseconds (default 5000). */
  timeoutMs?: number;
  /** Caller-supplied abort signal, combined with the timeout signal. */
  signal?: AbortSignal;
}
| 38 | + |
/** User-agent token exported for callers that do not supply their own. */
export const DEFAULT_USER_AGENT = "deepdive-bot";
| 40 | + |
/**
 * Creates an in-memory `RobotsCache` backed by a `Map`.
 *
 * `get` returns `undefined` for origins that were never stored, and whatever
 * was last stored (a parsed result or `null`) otherwise.
 */
export function createRobotsCache(): RobotsCache {
  const entries = new Map<string, ParsedRobots | null>();

  function get(origin: string): ParsedRobots | null | undefined {
    return entries.get(origin);
  }

  function set(origin: string, parsed: ParsedRobots | null): void {
    entries.set(origin, parsed);
  }

  return { get, set };
}
| 48 | + |
/**
 * Checks whether `url` may be fetched according to its host's robots.txt.
 *
 * @param url  Absolute URL to check.
 * @param opts User agent plus optional cache / fetch / timeout / signal.
 * @returns `"allow"` when the URL is unparseable, is not http(s), or no rule
 *          forbids it; `"deny"` when the best-matching rule disallows it;
 *          `"unknown"` when robots.txt could not be retrieved.
 */
export async function canFetch(
  url: string,
  opts: CanFetchOptions,
): Promise<RobotsCheckResult> {
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    // Not a URL we can reason about; robots.txt does not apply.
    return "allow";
  }
  const isHttp = target.protocol === "http:" || target.protocol === "https:";
  if (!isHttp) return "allow";

  const origin = `${target.protocol}//${target.host}`;
  const path = target.pathname + target.search;

  // `undefined` from the cache means "never looked up"; `null` is a stored
  // fetch failure and is reused as-is.
  const { cache } = opts;
  let robots = cache?.get(origin);
  if (robots === undefined) {
    robots = await fetchAndParse(origin, opts);
    cache?.set(origin, robots);
  }

  if (robots === null) return "unknown"; // couldn't reach robots.txt
  return isPathAllowed(robots, path, opts.userAgent) ? "allow" : "deny";
}
| 74 | + |
/**
 * Downloads `<origin>/robots.txt` and parses it.
 *
 * @param origin `<scheme>://<host>` of the site being checked.
 * @param opts   Supplies the user agent, and optionally a fetch
 *               implementation, a timeout (default 5000 ms) and an abort
 *               signal that is combined with the timeout.
 * @returns The parsed file; an empty rule set for any status >= 400; or
 *          `null` when the request or body read throws (network error,
 *          timeout, abort).
 */
async function fetchAndParse(
  origin: string,
  opts: CanFetchOptions,
): Promise<ParsedRobots | null> {
  const fetchImpl = opts.fetchImpl ?? fetch;
  const timeout = AbortSignal.timeout(opts.timeoutMs ?? 5_000);
  const signal = opts.signal
    ? AbortSignal.any([opts.signal, timeout])
    : timeout;
  try {
    const res = await fetchImpl(`${origin}/robots.txt`, {
      headers: { "user-agent": opts.userAgent },
      signal,
    });
    // Per RFC 9309: 4xx → no restrictions (no robots file); 5xx → treat as
    // "full disallow" conservatively. We lean permissive for 5xx too since
    // it's often transient and we don't want to lock out a run because the
    // publisher's server is flaky. Callers can pass --ignore-robots if they
    // want to bypass robots entirely.
    if (res.status >= 400) {
      // We never read this body; cancel it so the underlying connection is
      // released promptly instead of waiting for garbage collection.
      try {
        await res.body?.cancel();
      } catch {
        // Best-effort cleanup only; the verdict below does not depend on it.
      }
      return { rules: [] };
    }
    return parseRobotsTxt(await res.text());
  } catch {
    // Network failure, timeout or abort: report "could not determine".
    return null;
  }
}
| 104 | + |
// Exported for unit tests.
/**
 * Parses robots.txt text into a `ParsedRobots` value.
 *
 * Consecutive `User-agent` lines form one group; the first `Allow` /
 * `Disallow` line closes the agent list, and the next `User-agent` line after
 * that starts a new group. `Crawl-delay` is recorded file-wide (last valid
 * value wins) and does not close an agent list. Lines that are blank,
 * comment-only, or not of the form `key: value` are ignored, as are rules
 * that appear before any `User-agent` line.
 *
 * The returned object also carries an undeclared `_grouped` array (each rule
 * tagged with its lower-cased user-agent), which `isPathAllowed` uses to
 * select the rules for a particular crawler.
 *
 * @param text Raw robots.txt content.
 * @returns Flattened rules in file order plus the optional crawl delay.
 */
export function parseRobotsTxt(text: string): ParsedRobots {
  type GroupedRule = { agent: string; allow: boolean; path: string };
  const grouped: GroupedRule[] = [];
  let currentAgents: string[] = [];
  let sawRuleThisGroup = false;
  let crawlDelay: number | undefined;

  // Compiled once rather than per line.
  const directive = /^([a-zA-Z-]+)\s*:\s*(.*)$/;

  // RFC 9309 allows CR, LF or CRLF as the line terminator; splitting only on
  // `\r?\n` would read a CR-only file as one unparseable line.
  for (const rawLine of text.split(/\r\n|\r|\n/)) {
    const line = stripComment(rawLine).trim();
    if (!line) continue;
    const match = directive.exec(line);
    if (!match) continue;
    const key = (match[1] ?? "").toLowerCase();
    const value = (match[2] ?? "").trim();

    if (key === "user-agent") {
      if (sawRuleThisGroup) {
        // A rule has already been seen, so this line opens a new group.
        currentAgents = [];
        sawRuleThisGroup = false;
      }
      currentAgents.push(value.toLowerCase());
    } else if (key === "disallow" || key === "allow") {
      sawRuleThisGroup = true;
      // Record the rule once per agent named in the current group.
      for (const agent of currentAgents) {
        grouped.push({ agent, allow: key === "allow", path: value });
      }
    } else if (key === "crawl-delay") {
      const n = Number(value);
      if (Number.isFinite(n) && n >= 0) crawlDelay = n;
    }
  }

  // `_grouped` is not part of the exported type; it is kept on the object so
  // that agent selection can happen at check time.
  const parsed: ParsedRobots & { _grouped: GroupedRule[] } = {
    rules: grouped.map((g) => ({ allow: g.allow, path: g.path })),
    crawlDelaySec: crawlDelay,
    _grouped: grouped,
  };
  return parsed;
}
| 157 | + |
// Exported for unit tests.
/**
 * Decides whether `path` is permitted for `userAgent`.
 *
 * Reads the undeclared `_grouped` property that `parseRobotsTxt` attaches;
 * when it is absent or empty the answer is `true`. Rules whose agent is a
 * non-empty substring of the lower-cased `userAgent` are used; if there are
 * none, rules for `*` are used; if there are still none, the answer is `true`.
 * Among the rules whose pattern matches, the longest pattern wins and a tie
 * goes to Allow.
 *
 * @param parsed    Value returned by `parseRobotsTxt`.
 * @param path      URL path (plus query string) to test.
 * @param userAgent Crawler user-agent string; compared case-insensitively.
 * @returns `true` when fetching is permitted.
 */
export function isPathAllowed(
  parsed: ParsedRobots,
  path: string,
  userAgent: string,
): boolean {
  type AgentRule = { agent: string; allow: boolean; path: string };
  const tagged = (parsed as unknown as { _grouped?: AgentRule[] })._grouped ?? [];
  if (tagged.length === 0) return true;

  // Agent-specific rules take precedence; '*' is only a fallback.
  const needle = userAgent.toLowerCase();
  const specific = tagged.filter((r) => r.agent && needle.includes(r.agent));
  const candidates =
    specific.length > 0 ? specific : tagged.filter((r) => r.agent === "*");
  if (candidates.length === 0) return true;

  let winnerLen = -1;
  let verdict = true;
  for (const { allow, path: pattern } of candidates) {
    if (!pattern) {
      // An empty Disallow grants everything, but only if nothing has matched
      // yet; an empty Allow carries no information.
      if (!allow && winnerLen < 0) {
        winnerLen = 0;
        verdict = true;
      }
      continue;
    }
    if (!pathMatches(pattern, path)) continue;

    const isLonger = pattern.length > winnerLen;
    const isAllowTie = pattern.length === winnerLen && allow;
    if (isLonger || isAllowTie) {
      winnerLen = pattern.length;
      verdict = allow;
    }
  }
  // No applicable rule matched at all → permitted.
  return winnerLen < 0 || verdict;
}
| 195 | + |
/**
 * Tests a robots.txt path pattern against a URL path.
 *
 * `*` matches any run of characters (including none) and a `$` in the final
 * position anchors the match to the end of `path`; a `$` anywhere else is a
 * literal. Patterns with neither are plain prefix matches.
 *
 * Patterns come from remote, untrusted robots.txt files, so this scans with
 * `startsWith` / `indexOf` / `endsWith` instead of compiling a regular
 * expression: a pattern with many `*` cannot trigger catastrophic
 * backtracking.
 *
 * @param pattern Rule path from robots.txt.
 * @param path    URL path (plus query string) being checked.
 * @returns `true` when the pattern matches from the start of `path`.
 */
function pathMatches(pattern: string, path: string): boolean {
  const anchored = pattern.endsWith("$");
  const body = anchored ? pattern.slice(0, -1) : pattern;
  // Literal pieces between wildcards; always has at least one element.
  const segments = body.split("*");

  // The first piece must sit at the very start of the path.
  const head = segments[0] ?? "";
  if (!path.startsWith(head)) return false;
  let cursor = head.length;

  if (segments.length === 1) {
    // No wildcard: prefix match, or exact match when anchored.
    return !anchored || path.length === cursor;
  }

  // Middle pieces: taking the leftmost occurrence each time leaves the most
  // room for the remaining pieces, so no backtracking is needed.
  for (let i = 1; i < segments.length - 1; i++) {
    const segment = segments[i] ?? "";
    const at = path.indexOf(segment, cursor);
    if (at === -1) return false;
    cursor = at + segment.length;
  }

  const tail = segments[segments.length - 1] ?? "";
  if (anchored) {
    // The last piece must end the path without overlapping what was consumed.
    return path.length - tail.length >= cursor && path.endsWith(tail);
  }
  return path.indexOf(tail, cursor) !== -1;
}
| 217 | + |
/**
 * Returns the portion of `s` that precedes the first `#`, or all of `s` when
 * it contains no `#`.
 */
function stripComment(s: string): string {
  // Limiting split to one piece yields exactly the text before the first '#'.
  const [code = ""] = s.split("#", 1);
  return code;
}
0 commit comments