Skip to content

Commit bdb7d23

Browse files
committed
initial commit
0 parents  commit bdb7d23

245 files changed

Lines changed: 32440 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2024 Nick Sweeting
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# abx-plugins
2+
3+
ArchiveBox-compatible plugin suite (hooks, config schemas, binaries manifests).
4+
5+
This package contains only plugin assets and a tiny helper to locate them.
6+
It does **not** depend on Django or ArchiveBox.
7+
8+
## Usage
9+
10+
```python
11+
from abx_plugins import get_plugins_dir
12+
13+
plugins_dir = get_plugins_dir()
14+
# scan plugins_dir for plugins/*/config.json, binaries.jsonl, on_* hooks
15+
```
16+
17+
Tools like `abx-dl` and ArchiveBox can discover plugins from this package
18+
without symlinks or environment-variable tricks.

abx_plugins/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""Plugin suite package for ArchiveBox-compatible tools."""
2+
3+
from __future__ import annotations
4+
5+
from pathlib import Path
6+
from importlib import resources
7+
8+
9+
def get_plugins_dir() -> Path:
    """Locate the ``plugins`` directory bundled inside this package.

    Returns:
        The ``plugins`` entry beneath this package's resource root, wrapped
        in a ``pathlib.Path``.
    """
    package_root = resources.files(__name__)
    bundled_plugins = package_root / "plugins"
    return Path(bundled_plugins)
12+
13+
14+
__all__ = ["get_plugins_dir"]

abx_plugins/plugins/.coverage

364 KB
Binary file not shown.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"$schema": "http://json-schema.org/draft-07/schema#",
3+
"type": "object",
4+
"additionalProperties": false,
5+
"required_plugins": ["chrome"],
6+
"properties": {
7+
"ACCESSIBILITY_ENABLED": {
8+
"type": "boolean",
9+
"default": true,
10+
"x-aliases": ["SAVE_ACCESSIBILITY", "USE_ACCESSIBILITY"],
11+
"description": "Enable accessibility tree capture"
12+
},
13+
"ACCESSIBILITY_TIMEOUT": {
14+
"type": "integer",
15+
"default": 30,
16+
"minimum": 5,
17+
"x-fallback": "TIMEOUT",
18+
"description": "Timeout for accessibility capture in seconds"
19+
}
20+
}
21+
}
Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
#!/usr/bin/env node
2+
/**
3+
* Extract accessibility tree and page outline from a URL.
4+
*
5+
* Extracts:
6+
* - Page outline (headings h1-h6, sections, articles)
7+
* - Iframe tree
8+
* - Accessibility snapshot
9+
* - ARIA labels and roles
10+
*
11+
* Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>
12+
* Output: Writes accessibility/accessibility.json
13+
*
14+
* Environment variables:
15+
* ACCESSIBILITY_ENABLED: Enable accessibility extraction (default: true)
16+
*/
17+
18+
const fs = require('fs');
19+
const path = require('path');
20+
// Add NODE_MODULES_DIR to module resolution paths if set
21+
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
22+
const puppeteer = require('puppeteer-core');
23+
24+
// Extractor metadata: plugin identity plus the relative paths this hook reads and writes.
const PLUGIN_NAME = 'accessibility';

// Results are written relative to the current working directory.
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'accessibility.json';

// Directory holding the chrome plugin's session files (cdp_url.txt, target_id.txt, chrome.pid, navigation.json).
const CHROME_SESSION_DIR = '../chrome';

// Single error message used for every "Chrome session is missing or unusable" failure.
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
30+
31+
/**
 * Collect `--key=value` flags from process.argv into a plain object.
 *
 * Dashes in the key become underscores (`--snapshot-id` -> `snapshot_id`).
 * A flag without a value, or with an empty value, is stored as `true`.
 * Arguments that do not start with `--` are ignored.
 *
 * @returns {Object<string, string|boolean>} parsed flags
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;

    // Only the first '=' separates key from value; later ones belong to the value.
    const [rawKey, ...rest] = token.slice(2).split('=');
    const value = rest.join('=');
    parsed[rawKey.replace(/-/g, '_')] = value || true;
  }
  return parsed;
}
42+
43+
/**
 * Read an environment variable, falling back to a default when it is unset or empty.
 *
 * @param {string} name - environment variable name
 * @param {string} [defaultValue=''] - value used when the variable is unset or empty
 * @returns {string} the chosen value with surrounding whitespace removed
 */
function getEnv(name, defaultValue = '') {
  const raw = process.env[name];
  const chosen = raw ? raw : defaultValue;
  return chosen.trim();
}
47+
48+
/**
 * Interpret an environment variable as a boolean flag.
 *
 * Matching is case-insensitive. Recognised truthy spellings: true, 1, yes, on.
 * Recognised falsy spellings: false, 0, no, off. Any other value (including
 * unset/empty) yields `defaultValue`.
 *
 * @param {string} name - environment variable name
 * @param {boolean} [defaultValue=false] - result for unrecognised or missing values
 * @returns {boolean}
 */
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
54+
55+
/**
 * Poll for the chrome session's navigation.json marker file.
 *
 * Checks every 100ms until the file exists or the time budget is used up.
 *
 * @param {number} [timeoutMs=60000] - maximum time to keep polling, in milliseconds
 * @returns {Promise<boolean>} true once the file exists, false if the budget ran out first
 */
async function waitForChromeTabLoaded(timeoutMs = 60000) {
  const markerPath = path.join(CHROME_SESSION_DIR, 'navigation.json');
  const began = Date.now();
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  for (;;) {
    // The budget is checked before the file, so a zero budget never touches the disk.
    if (!(Date.now() - began < timeoutMs)) return false;
    if (fs.existsSync(markerPath)) return true;
    await pause(100);
  }
}
70+
71+
/**
 * Read the CDP URL from cdp_url.txt in the chrome session directory.
 *
 * @returns {string|null} trimmed file contents, or null when the file does not exist
 */
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(cdpFile) ? fs.readFileSync(cdpFile, 'utf8').trim() : null;
}
79+
80+
/**
 * Verify that a usable Chrome session exists and return its CDP URL.
 *
 * Checks, in order: the three session files exist (cdp_url.txt, target_id.txt,
 * chrome.pid); chrome.pid holds a pid that can be signalled; the CDP URL is
 * non-empty.
 *
 * @returns {string} the CDP URL read via getCdpUrl()
 * @throws {Error} with CHROME_SESSION_REQUIRED_ERROR when any check fails
 */
function assertChromeSession() {
  const sessionFile = (fileName) => path.join(CHROME_SESSION_DIR, fileName);
  const pidFile = sessionFile('chrome.pid');
  const requiredFiles = [sessionFile('cdp_url.txt'), sessionFile('target_id.txt'), pidFile];

  if (!requiredFiles.every((file) => fs.existsSync(file))) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }

  try {
    const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
    // NaN and 0 are both falsy, so one check rejects every unusable pid.
    if (!pid) throw new Error('Invalid pid');
    // Signal 0 performs only the existence/permission check; nothing is delivered.
    process.kill(pid, 0);
  } catch (e) {
    // Read errors, bad pids and failed signals all surface as the same error.
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }

  const cdpUrl = getCdpUrl();
  if (cdpUrl) return cdpUrl;
  throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
100+
101+
/**
 * Capture the accessibility snapshot, page outline and frame tree of the page
 * open in the existing Chrome session, and write them to OUTPUT_FILE as JSON.
 *
 * Never throws: every failure is reported through the returned object.
 *
 * @param {string} url - URL recorded in the output (the page itself is not navigated here)
 * @returns {Promise<{success: true, output: string, accessibilityData: Object} | {success: false, error: string}>}
 */
async function extractAccessibility(url) {
  // Output directory is current directory (hook already runs in output dir)
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  let browser = null;

  try {
    // Connect to existing Chrome session
    const cdpUrl = assertChromeSession();

    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });

    // Prefer the first http(s) page; otherwise fall back to the first page of any kind.
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];

    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }

    // Get accessibility snapshot
    const accessibilityTree = await page.accessibility.snapshot({ interestingOnly: true });

    // Extract page outline (headings, sections, etc.). This callback runs in
    // the page context, so it must not reference anything from this module.
    const outline = await page.evaluate(() => {
      const SUMMARY_LIMIT = 128;
      const headings = [];
      const elements = document.querySelectorAll(
        'h1, h2, h3, h4, h5, h6, a[name], header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe'
      );

      elements.forEach(elem => {
        const tagName = elem.tagName.toLowerCase();

        // Skip unnamed anchors
        if (tagName === 'a' && !elem.name) return;

        const elemId = elem.id || elem.name || elem.getAttribute('aria-label') || elem.role || '';
        const elemClasses = (elem.className || '').toString().trim().split(/\s+/).slice(0, 3).join(' .');
        // A form control named "action" shadows form.action with an element,
        // so only treat the property as a URL when it really is a string.
        const action = typeof elem.action === 'string' ? elem.action.split('/').pop() : '';

        // Append the ellipsis only when text was actually cut off.
        const fullText = elem.innerText || '';
        let summary = fullText.slice(0, SUMMARY_LIMIT);
        if (fullText.length > SUMMARY_LIMIT) summary += '...';

        let prefix = '';
        let title = '';

        // Format headings with # prefix (one '#' per heading level)
        const headingMatch = /^h([1-6])$/.exec(tagName);
        if (headingMatch) {
          prefix = '#'.repeat(Number(headingMatch[1]));
          title = elem.innerText || elemId || elemClasses;
        } else {
          // For other elements, create breadcrumb path of up to 5 entries;
          // generic wrappers contribute an empty segment.
          const parents = [tagName];
          let node = elem.parentNode;
          while (node && parents.length < 5) {
            if (node.tagName) {
              const tag = node.tagName.toLowerCase();
              if (!['div', 'span', 'p', 'body', 'html'].includes(tag)) {
                parents.unshift(tag);
              } else {
                parents.unshift('');
              }
            }
            node = node.parentNode;
          }
          prefix = parents.join('>');

          title = elemId ? `#${elemId}` : '';
          if (!title && elemClasses) title = `.${elemClasses}`;
          if (action) title += ` /${action}`;
          if (summary && !title.includes(summary)) title += `: ${summary}`;
        }

        // Clean up title
        title = title.replace(/\s+/g, ' ').trim();

        if (prefix) {
          headings.push(`${prefix} ${title}`);
        }
      });

      return headings;
    });

    // Get iframe tree: one entry per frame, prefixed with one '>' per nesting level.
    const iframes = [];
    function dumpFrameTree(frame, indent = '>') {
      iframes.push(indent + frame.url());
      for (const child of frame.childFrames()) {
        dumpFrameTree(child, indent + '>');
      }
    }
    dumpFrameTree(page.mainFrame(), '');

    const accessibilityData = {
      url,
      headings: outline,
      iframes,
      tree: accessibilityTree,
    };

    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(accessibilityData, null, 2));

    return { success: true, output: outputPath, accessibilityData };

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    if (browser) {
      // Await the disconnect so it finishes before the caller exits the
      // process; a cleanup failure must not replace the result returned above.
      try {
        await browser.disconnect();
      } catch (disconnectError) {
        console.error(`Failed to disconnect from Chrome: ${disconnectError.message}`);
      }
    }
  }
}
218+
219+
/**
 * Hook entry point: validate arguments, honour ACCESSIBILITY_ENABLED, wait for
 * the Chrome tab, run the extraction, then print one ArchiveResult JSON line
 * and exit (0 on success or skip, 1 on failure or bad usage).
 */
async function main() {
  const { url, snapshot_id: snapshotId } = parseArgs();

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  // Output clean JSONL (no RESULT_JSON= prefix)
  const emitResult = (resultStatus, outputStr) => {
    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status: resultStatus,
      output_str: outputStr,
    }));
  };

  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Disabled via environment: report "skipped" and stop successfully.
    if (!getEnvBool('ACCESSIBILITY_ENABLED', true)) {
      console.log('Skipping accessibility (ACCESSIBILITY_ENABLED=False)');
      emitResult('skipped', 'ACCESSIBILITY_ENABLED=False');
      process.exit(0);
    }

    // The session must exist before it is worth waiting for the page load.
    assertChromeSession();
    const pageLoaded = await waitForChromeTabLoaded(60000);
    if (!pageLoaded) {
      throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
    }

    const result = await extractAccessibility(url);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const { headings, iframes } = result.accessibilityData;
      console.log(`Accessibility extracted: ${headings.length} headings, ${iframes.length} iframes`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  if (error) console.error(`ERROR: ${error}`);

  emitResult(status, output || error || '');

  process.exit(status === 'succeeded' ? 0 : 1);
}
284+
285+
// Run the hook; a rejection that escapes main() is reported on stderr and exits with status 1.
main().catch((fatal) => {
  console.error(`Fatal error: ${fatal.message}`);
  process.exit(1);
});
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<span class="abx-output-icon abx-output-icon--accessibility" title="Accessibility"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="4.5" r="2" fill="currentColor" stroke="none"/><path d="M4 7.5h16"/><path d="M12 7.5v12"/><path d="M7 20l5-6 5 6"/></svg></span>

0 commit comments

Comments
 (0)