|
| 1 | +#!/usr/bin/env node |
| 2 | +/** |
| 3 | + * Extract accessibility tree and page outline from a URL. |
| 4 | + * |
| 5 | + * Extracts: |
| 6 | + * - Page outline (headings h1-h6, sections, articles) |
| 7 | + * - Iframe tree |
| 8 | + * - Accessibility snapshot |
| 9 | + * - ARIA labels and roles |
| 10 | + * |
| 11 | + * Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid> |
| 12 | + * Output: Writes accessibility/accessibility.json |
| 13 | + * |
| 14 | + * Environment variables: |
| 15 | + * ACCESSIBILITY_ENABLED: Enable accessibility extraction (default: true) |
| 16 | + */ |
| 17 | + |
| 18 | +const fs = require('fs'); |
| 19 | +const path = require('path'); |
| 20 | +// Add NODE_MODULES_DIR to module resolution paths if set |
| 21 | +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); |
| 22 | +const puppeteer = require('puppeteer-core'); |
| 23 | + |
// ---------------------------------------------------------------------------
// Extractor metadata
// ---------------------------------------------------------------------------

/** Short identifier of this extractor plugin. */
const PLUGIN_NAME = 'accessibility';

/** Directory the report is written to ('.' = the current working directory). */
const OUTPUT_DIR = '.';

/** File name of the JSON report produced by this extractor. */
const OUTPUT_FILE = 'accessibility.json';

/**
 * Directory (relative to the working directory) holding the chrome plugin's
 * session files: cdp_url.txt, target_id.txt, chrome.pid and navigation.json.
 */
const CHROME_SESSION_DIR = '../chrome';

/** Error message used whenever a usable Chrome session cannot be found. */
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
| 30 | + |
| 31 | +// Parse command line arguments |
| 32 | +function parseArgs() { |
| 33 | + const args = {}; |
| 34 | + process.argv.slice(2).forEach(arg => { |
| 35 | + if (arg.startsWith('--')) { |
| 36 | + const [key, ...valueParts] = arg.slice(2).split('='); |
| 37 | + args[key.replace(/-/g, '_')] = valueParts.join('=') || true; |
| 38 | + } |
| 39 | + }); |
| 40 | + return args; |
| 41 | +} |
| 42 | + |
/**
 * Read an environment variable as a whitespace-trimmed string.
 *
 * An unset or empty variable falls back to `defaultValue`, which is trimmed
 * as well.
 *
 * @param {string} name - Name of the environment variable.
 * @param {string} [defaultValue=''] - Value used when the variable is unset or empty.
 * @returns {string} The trimmed value.
 */
function getEnv(name, defaultValue = '') {
  const raw = process.env[name];
  const chosen = raw ? raw : defaultValue;
  return chosen.trim();
}
| 47 | + |
/**
 * Read an environment variable as a boolean.
 *
 * Matching is case-insensitive. `true`, `1`, `yes` and `on` yield `true`;
 * `false`, `0`, `no` and `off` yield `false`. Any other value (including an
 * unset or empty variable) yields `defaultValue`.
 *
 * @param {string} name - Name of the environment variable.
 * @param {boolean} [defaultValue=false] - Result for unset or unrecognized values.
 * @returns {boolean} The parsed flag, or `defaultValue`.
 */
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
| 54 | + |
/**
 * Wait until `navigation.json` exists in the Chrome session directory.
 *
 * The file's presence is checked every 100ms until it appears or the
 * timeout elapses.
 *
 * @param {number} [timeoutMs=60000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<boolean>} `true` once the file exists, `false` on timeout.
 */
async function waitForChromeTabLoaded(timeoutMs = 60000) {
  const POLL_INTERVAL_MS = 100;
  const markerPath = path.join(CHROME_SESSION_DIR, 'navigation.json');
  const pause = (ms) => new Promise((done) => setTimeout(done, ms));

  const startedAt = Date.now();
  const withinBudget = () => Date.now() - startedAt < timeoutMs;

  while (withinBudget()) {
    if (fs.existsSync(markerPath)) {
      return true;
    }
    // Not there yet: sleep briefly before the next check.
    await pause(POLL_INTERVAL_MS);
  }

  return false;
}
| 70 | + |
/**
 * Read the CDP URL stored in `cdp_url.txt` inside the Chrome session directory.
 *
 * @returns {string|null} The trimmed file contents, or `null` when the file
 *   does not exist.
 */
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  const contents = fs.readFileSync(cdpFile, 'utf8');
  return contents.trim();
}
| 79 | + |
/**
 * Verify that a usable Chrome session exists and return its CDP URL.
 *
 * Checks, in order:
 *  1. `cdp_url.txt`, `target_id.txt` and `chrome.pid` all exist in the
 *     Chrome session directory.
 *  2. `chrome.pid` contains a positive integer pid.
 *  3. A process with that pid exists (probed with signal 0).
 *  4. `cdp_url.txt` is non-empty.
 *
 * @returns {string} The CDP URL of the session.
 * @throws {Error} With CHROME_SESSION_REQUIRED_ERROR as message if any check fails.
 */
function assertChromeSession() {
  const [cdpFile, targetIdFile, pidFile] = ['cdp_url.txt', 'target_id.txt', 'chrome.pid']
    .map((fileName) => path.join(CHROME_SESSION_DIR, fileName));

  if (![cdpFile, targetIdFile, pidFile].every((file) => fs.existsSync(file))) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }

  let pid;
  try {
    pid = Number.parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
  } catch (e) {
    // The pid file vanished or is unreadable between the check and the read.
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }

  // Zero and negative values address process groups rather than a single
  // process when passed to kill, so they are never a valid Chrome pid.
  if (!Number.isInteger(pid) || pid <= 0) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }

  try {
    // Signal 0 only tests for existence; nothing is delivered to the process.
    process.kill(pid, 0);
  } catch (e) {
    // EPERM means the process exists but belongs to another user, so the
    // session is still alive. Any other failure means there is no process.
    if (e.code !== 'EPERM') {
      throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
  }

  const cdpUrl = getCdpUrl();
  if (!cdpUrl) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  return cdpUrl;
}
| 100 | + |
/**
 * Connect to the existing Chrome session, collect accessibility data for the
 * open page, and write it as JSON to OUTPUT_FILE inside OUTPUT_DIR.
 *
 * The written object contains:
 *  - `url`:      the `url` argument, recorded as-is (it is not navigated to)
 *  - `headings`: outline strings; headings are rendered as `## Title`, other
 *                landmark elements as a `>`-joined ancestor path plus a label
 *  - `iframes`:  frame URLs, each prefixed with one `>` per nesting level
 *  - `tree`:     Puppeteer's accessibility snapshot (`interestingOnly: true`)
 *
 * Failures are reported through the returned object rather than thrown.
 *
 * @param {string} url - URL to record in the output file.
 * @returns {Promise<{success: boolean, output?: string, accessibilityData?: Object, error?: string}>}
 *   On success `output` is the written path and `accessibilityData` the
 *   written object; on failure `error` describes what went wrong.
 */
async function extractAccessibility(url) {
  // Output directory is current directory (hook already runs in output dir)
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  let browser = null;

  try {
    // Connect to existing Chrome session
    const cdpUrl = assertChromeSession();
    browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

    // Prefer the first http(s) page, otherwise fall back to the first page.
    const pages = await browser.pages();
    const page = pages.find((p) => p.url().startsWith('http')) || pages[0];

    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }

    // Get accessibility snapshot
    const accessibilityTree = await page.accessibility.snapshot({ interestingOnly: true });

    // Extract page outline (headings, sections, etc.).
    // NOTE: this callback runs inside the page, so it cannot use outer variables.
    const outline = await page.evaluate(() => {
      const SUMMARY_MAX_LENGTH = 128;
      const MAX_BREADCRUMB_DEPTH = 5;
      const GENERIC_TAGS = ['div', 'span', 'p', 'body', 'html'];

      // On <form> elements a child control named e.g. "id", "name" or
      // "action" shadows the DOM property of the same name, so the property
      // may be an element instead of a string. Fall back to the raw
      // attribute in that case.
      const readString = (elem, prop) => {
        const value = elem[prop];
        if (typeof value === 'string') return value;
        return elem.getAttribute(prop) || '';
      };

      const headings = [];
      const elements = document.querySelectorAll(
        'h1, h2, h3, h4, h5, h6, a[name], header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe'
      );

      elements.forEach((elem) => {
        const tagName = elem.tagName.toLowerCase();
        const elemName = readString(elem, 'name');

        // Skip unnamed anchors
        if (tagName === 'a' && !elemName) return;

        const elemId =
          readString(elem, 'id') ||
          elemName ||
          elem.getAttribute('aria-label') ||
          readString(elem, 'role') ||
          '';
        // Up to three class names, rendered as "a .b .c"
        const elemClasses = (elem.getAttribute('class') || '')
          .trim()
          .split(/\s+/)
          .slice(0, 3)
          .join(' .');
        // Last path segment of a form's action URL ('' for non-forms)
        const action = readString(elem, 'action').split('/').pop() || '';

        const text = typeof elem.innerText === 'string' ? elem.innerText : '';
        let summary = text.slice(0, SUMMARY_MAX_LENGTH);
        // Only mark the summary as truncated when text was actually cut off.
        if (text.length > SUMMARY_MAX_LENGTH) summary += '...';

        let prefix = '';
        let title = '';

        // Format headings with # prefix (one # per heading level)
        const headingMatch = /^h([1-6])$/.exec(tagName);
        if (headingMatch) {
          prefix = '#'.repeat(Number.parseInt(headingMatch[1], 10));
          title = text || elemId || elemClasses;
        } else {
          // For other elements, create breadcrumb path of up to 5 entries;
          // generic container tags are kept as empty placeholders.
          const parents = [tagName];
          let node = elem.parentNode;
          while (node && parents.length < MAX_BREADCRUMB_DEPTH) {
            if (node.tagName) {
              const tag = node.tagName.toLowerCase();
              parents.unshift(GENERIC_TAGS.includes(tag) ? '' : tag);
            }
            node = node.parentNode;
          }
          prefix = parents.join('>');

          title = elemId ? `#${elemId}` : '';
          if (!title && elemClasses) title = `.${elemClasses}`;
          if (action) title += ` /${action}`;
          if (summary && !title.includes(summary)) title += `: ${summary}`;
        }

        // Clean up title
        title = title.replace(/\s+/g, ' ').trim();

        if (prefix) {
          headings.push(`${prefix} ${title}`);
        }
      });

      return headings;
    });

    // Get iframe tree: depth-first, one '>' per nesting level
    const iframes = [];
    const dumpFrameTree = (frame, indent) => {
      iframes.push(indent + frame.url());
      for (const child of frame.childFrames()) {
        dumpFrameTree(child, `${indent}>`);
      }
    };
    dumpFrameTree(page.mainFrame(), '');

    const accessibilityData = {
      url,
      headings: outline,
      iframes,
      tree: accessibilityTree,
    };

    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(accessibilityData, null, 2));

    return { success: true, output: outputPath, accessibilityData };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    if (browser) {
      try {
        // Awaited because disconnect() may return a promise; only detaches
        // from the browser, it does not close it.
        await browser.disconnect();
      } catch (e) {
        // A failed disconnect must not mask the extraction result.
        console.error(`WARNING: failed to disconnect from Chrome: ${e.message}`);
      }
    }
  }
}
| 218 | + |
/**
 * Script entry point.
 *
 * Validates the `--url` and `--snapshot-id` arguments, honors the
 * ACCESSIBILITY_ENABLED switch, waits for the Chrome tab to finish loading,
 * runs the extraction, and prints a single ArchiveResult JSON line to stdout.
 *
 * Exit codes: 0 when extraction succeeded or was skipped, 1 on a usage error
 * or a failed extraction. This function always ends the process itself.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  // Valueless flags are parsed as boolean `true`, so a truthiness check alone
  // would accept `--url` without a value; require real non-empty strings.
  const isNonEmptyString = (value) => typeof value === 'string' && value.length > 0;
  if (!isNonEmptyString(url) || !isNonEmptyString(snapshotId)) {
    console.error('Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Check if enabled
    if (!getEnvBool('ACCESSIBILITY_ENABLED', true)) {
      console.log('Skipping accessibility (ACCESSIBILITY_ENABLED=False)');
      // Output clean JSONL (no RESULT_JSON= prefix)
      console.log(JSON.stringify({
        type: 'ArchiveResult',
        status: 'skipped',
        output_str: 'ACCESSIBILITY_ENABLED=False',
      }));
      process.exit(0);
    }

    // Check if Chrome session exists, then wait for page load
    assertChromeSession();
    const pageLoaded = await waitForChromeTabLoaded(60000);
    if (!pageLoaded) {
      throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
    }

    const result = await extractAccessibility(url);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const headingCount = result.accessibilityData.headings.length;
      const iframeCount = result.accessibilityData.iframes.length;
      console.log(`Accessibility extracted: ${headingCount} headings, ${iframeCount} iframes`);
    } else {
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
  }

  if (error) console.error(`ERROR: ${error}`);

  // Output clean JSONL (no RESULT_JSON= prefix)
  console.log(JSON.stringify({
    type: 'ArchiveResult',
    status,
    output_str: output || error || '',
  }));

  process.exit(status === 'succeeded' ? 0 : 1);
}
| 284 | + |
// Run the script; any rejection escaping main() is reported on stderr and
// turned into a non-zero exit code.
main().catch((fatal) => {
  const message = `Fatal error: ${fatal.message}`;
  console.error(message);
  process.exit(1);
});
0 commit comments