|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Build a single-file markdown digest from a Hugo content directory. |
| 3 | +
|
| 4 | +Usage: python3 build-digest.py <content_dir> <output_file> <title> |
| 5 | +
|
| 6 | +Excludes: release notes, helm chart values, images, videos, HTML comments, Hugo shortcodes. |
| 7 | +""" |
| 8 | + |
| 9 | +import sys |
| 10 | +import os |
| 11 | +import re |
| 12 | +from datetime import datetime, timezone |
| 13 | +from pathlib import Path |
| 14 | + |
| 15 | + |
def strip_front_matter(text: str) -> str:
    """Return *text* without its leading YAML front matter block.

    Front matter is recognised only when the very first line is ``---`` and
    a later line consisting solely of ``---`` closes it.  Requiring the
    closing delimiter to be on its own line keeps a ``---`` that appears
    inside a front matter value (e.g. ``title: A --- B``) from being
    mistaken for the end of the block.

    Args:
        text: Full contents of a markdown file.

    Returns:
        The text following the closing delimiter with leading newlines
        removed, or *text* unchanged when there is no complete front
        matter block.
    """
    lines = text.split("\n")
    # rstrip() tolerates trailing spaces and the "\r" left by CRLF files.
    if lines[0].rstrip() != "---":
        return text
    for idx, line in enumerate(lines[1:], start=1):
        if line.rstrip() == "---":
            return "\n".join(lines[idx + 1:]).lstrip("\r\n")
    # Opening delimiter without a closing one: treat the file as plain body.
    return text
| 23 | + |
| 24 | + |
| 25 | +def extract_title(text: str) -> str | None: |
| 26 | + """Extract title from YAML front matter.""" |
| 27 | + if not text.startswith("---"): |
| 28 | + return None |
| 29 | + end = text.find("---", 3) |
| 30 | + if end == -1: |
| 31 | + return None |
| 32 | + fm = text[3:end] |
| 33 | + for line in fm.split("\n"): |
| 34 | + line = line.strip() |
| 35 | + if line.lower().startswith("title:"): |
| 36 | + title = line[6:].strip() |
| 37 | + # Remove quotes |
| 38 | + if (title.startswith('"') and title.endswith('"')) or \ |
| 39 | + (title.startswith("'") and title.endswith("'")): |
| 40 | + title = title[1:-1] |
| 41 | + return title |
| 42 | + return None |
| 43 | + |
| 44 | + |
# Patterns used by clean_body, compiled once at import time.
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
# Allow attributes and any letter case on the opening tag so that
# e.g. <style type="text/css"> or <SCRIPT src=...> blocks are removed whole.
_STYLE_BLOCK_RE = re.compile(r'<style\b[^>]*>.*?</style\s*>',
                             re.DOTALL | re.IGNORECASE)
_SCRIPT_BLOCK_RE = re.compile(r'<script\b[^>]*>.*?</script\s*>',
                              re.DOTALL | re.IGNORECASE)
_MEDIA_LINE_RE = re.compile(
    r'^\s*!\[.*?\]\(.*?\.(png|jpg|jpeg|gif|svg|webp|mp4|webm|mov|avi).*?\)\s*$',
    re.IGNORECASE,
)
_SHORTCODE_START_RE = re.compile(r'^\s*\{\{[<%*]')
_SHORTCODE_RE = re.compile(r'\{\{[<%*]+.*?[%>*]+\}\}')
_TEMPLATE_RE = re.compile(r'\{\{.*?\}\}')
# A tag must start with a letter (optionally preceded by "/", "!" or "?"),
# which leaves prose such as "a < b and c > d" alone.  The lookahead keeps
# markdown autolinks like <https://example.com> intact.
_HTML_TAG_RE = re.compile(
    r'<(?![A-Za-z][A-Za-z0-9+.\-]*://)[/!?]?[A-Za-z][^<>]*>'
)


def clean_body(body: str) -> str:
    """Strip non-textual markup from a markdown body.

    Removes HTML comments, ``<style>``/``<script>`` blocks, lines that hold
    only a markdown image/video reference, Hugo shortcodes and ``{{ ... }}``
    template calls, and HTML tags (tag content is kept).  Runs of three or
    more newlines are collapsed to a single blank line.

    Args:
        body: Markdown text with front matter already removed.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    body = _HTML_COMMENT_RE.sub('', body)
    body = _STYLE_BLOCK_RE.sub('', body)
    body = _SCRIPT_BLOCK_RE.sub('', body)

    cleaned = []
    for line in body.split('\n'):
        # Drop lines that consist solely of an image/video reference.
        if _MEDIA_LINE_RE.match(line):
            continue

        starts_with_shortcode = _SHORTCODE_START_RE.match(line) is not None
        # An opener with no closing braces on the same line is the start of
        # a multi-line shortcode call; there is no content to keep from it.
        if starts_with_shortcode and '}}' not in line:
            continue

        line = _SHORTCODE_RE.sub('', line)
        line = _TEMPLATE_RE.sub('', line)
        line = _HTML_TAG_RE.sub('', line)

        # A line that was nothing but shortcode markup is dropped entirely;
        # a line that merely begins with a shortcode keeps its text.
        if starts_with_shortcode and not line.strip():
            continue

        cleaned.append(line)

    body = '\n'.join(cleaned)

    # Collapse 3+ consecutive newlines into a single blank line.
    body = re.sub(r'\n{3,}', '\n\n', body)

    return body.strip()
| 83 | + |
| 84 | + |
def title_from_path(filepath: Path) -> str:
    """Build a human-readable title from a file's location.

    Index pages (``_index`` / ``index``) are named after the directory that
    contains them; every other file is named after its own stem.  Hyphens
    and underscores become spaces and the result is title-cased.
    """
    stem = filepath.stem
    base = filepath.parent.name if stem in ('_index', 'index') else stem
    words = base.replace('-', ' ').replace('_', ' ')
    return words.title()
| 92 | + |
| 93 | + |
def heading_depth(relpath: Path, is_index: bool) -> int:
    """Return the markdown heading level for a document.

    The level is the number of components in *relpath*, reduced by one for
    index pages, and then limited to the range 2..4 inclusive.
    """
    level = len(relpath.parts) - (1 if is_index else 0)
    if level < 2:
        return 2
    if level > 4:
        return 4
    return level
| 102 | + |
| 103 | + |
def main():
    """Command-line entry point: build the digest file.

    Reads ``<content_dir> <output_file> <title>`` from ``sys.argv``, collects
    every ``*.md`` file under the content directory (skipping anything under
    a ``releases`` directory, files named ``helm-chart-values.md``, and the
    output file itself), cleans each one and writes a single markdown file.

    Exits with status 1 when the argument count is wrong or the content
    directory does not exist.
    """
    if len(sys.argv) != 4:
        print(f"Usage: {sys.argv[0]} <content_dir> <output_file> <title>",
              file=sys.stderr)
        sys.exit(1)

    content_dir = Path(sys.argv[1])
    output_file = Path(sys.argv[2])
    doc_title = sys.argv[3]

    # rglob() on a missing directory yields nothing, which would silently
    # produce a digest containing only the header.
    if not content_dir.is_dir():
        print(f"Error: content directory not found: {content_dir}",
              file=sys.stderr)
        sys.exit(1)

    # If the digest is written inside the content tree, a previous run's
    # output must not be folded back into the next digest.
    output_resolved = output_file.resolve()

    # Collect all markdown files, excluding releases and helm-chart-values.
    md_files = []
    for f in sorted(content_dir.rglob("*.md")):
        rel = f.relative_to(content_dir)
        if 'releases' in rel.parts:
            continue
        if f.name == 'helm-chart-values.md':
            continue
        if f.resolve() == output_resolved:
            continue
        md_files.append(f)

    now = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    out_lines = [
        f"# {doc_title}\n",
        # The two trailing spaces are a Markdown hard line break so that
        # "Source" and "Generated" render on separate lines.
        f"> Auto-generated documentation digest. Source: `{content_dir}`  ",
        f"> Generated: {now}\n",
        "---\n",
    ]

    file_count = 0
    for filepath in md_files:
        raw = filepath.read_text(encoding='utf-8', errors='replace')

        title = extract_title(raw) or title_from_path(filepath)
        body = clean_body(strip_front_matter(raw))

        # Skip files with no remaining content after cleaning.
        if not body.strip():
            continue

        rel = filepath.relative_to(content_dir)
        is_index = filepath.stem in ('_index', 'index')
        heading = '#' * heading_depth(rel, is_index)

        out_lines.append(f"\n{heading} {title}\n")
        out_lines.append(body)
        out_lines.append("\n\n---\n")
        file_count += 1

    digest = '\n'.join(out_lines)
    output_file.write_text(digest, encoding='utf-8')

    # Count lines from the in-memory text rather than re-reading the file,
    # which previously decoded it with the platform default encoding.
    total_lines = digest.count('\n') + 1
    print(f"Done: {total_lines} lines, {file_count} documents written to {output_file}")


if __name__ == '__main__':
    main()
0 commit comments