Skip to content

Commit 9ccdeac

Browse files
committed
Tool: create single-file, text-only digest of each content area
Signed-off-by: hortison <160366376+hortison@users.noreply.github.com>
1 parent d4e8218 commit 9ccdeac

File tree

1 file changed

+164
-0
lines changed

1 file changed

+164
-0
lines changed

build-digest.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
#!/usr/bin/env python3
2+
"""Build a single-file markdown digest from a Hugo content directory.
3+
4+
Usage: python3 build-digest.py <content_dir> <output_file> <title>
5+
6+
Excludes: release notes, helm chart values, images, videos, HTML comments, Hugo shortcodes.
7+
"""
8+
9+
import sys
10+
import os
11+
import re
12+
from datetime import datetime, timezone
13+
from pathlib import Path
14+
15+
16+
def strip_front_matter(text: str) -> str:
    """Return *text* with a leading YAML front matter block removed.

    Front matter is recognised only when the very first line is exactly
    ``---`` (surrounding whitespace ignored) and a later line is also exactly
    ``---``. Matching whole lines, rather than the first ``---`` substring,
    keeps values such as ``title: A --- B`` from being mistaken for the
    closing delimiter, and keeps a leading ``-----`` horizontal rule from
    being mistaken for an opening one.

    Args:
        text: Raw file contents.

    Returns:
        The text after the closing delimiter with leading newlines (LF or
        CRLF) stripped, or *text* unchanged when no complete front matter
        block is present.
    """
    if not text.startswith("---"):
        return text

    lines = text.splitlines(keepends=True)
    # The opening delimiter has to stand alone on the first line.
    if lines[0].strip() != "---":
        return text

    for index, line in enumerate(lines[1:], start=1):
        if line.strip() == "---":
            # Strip "\r" as well so CRLF files do not keep a stray blank line.
            return "".join(lines[index + 1:]).lstrip("\r\n")

    # No closing delimiter: treat the whole file as body.
    return text
23+
24+
25+
def extract_title(text: str) -> str | None:
26+
"""Extract title from YAML front matter."""
27+
if not text.startswith("---"):
28+
return None
29+
end = text.find("---", 3)
30+
if end == -1:
31+
return None
32+
fm = text[3:end]
33+
for line in fm.split("\n"):
34+
line = line.strip()
35+
if line.lower().startswith("title:"):
36+
title = line[6:].strip()
37+
# Remove quotes
38+
if (title.startswith('"') and title.endswith('"')) or \
39+
(title.startswith("'") and title.endswith("'")):
40+
title = title[1:-1]
41+
return title
42+
return None
43+
44+
45+
# Patterns used by clean_body, compiled once at import time.
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
# Allow attributes and any letter case so <style type="..."> / <SCRIPT src=...>
# blocks are removed together with their contents.
_STYLE_BLOCK_RE = re.compile(r'<style\b[^>]*>.*?</style\s*>', re.DOTALL | re.IGNORECASE)
_SCRIPT_BLOCK_RE = re.compile(r'<script\b[^>]*>.*?</script\s*>', re.DOTALL | re.IGNORECASE)
_MEDIA_LINE_RE = re.compile(
    r'^\s*!\[.*?\]\(.*?\.(png|jpg|jpeg|gif|svg|webp|mp4|webm|mov|avi).*?\)\s*$',
    re.IGNORECASE,
)
_SHORTCODE_START_RE = re.compile(r'^\s*\{\{[<%\*]')
_SHORTCODE_TAG_RE = re.compile(r'\{\{[<%\*]+.*?[%>\*]+\}\}')
_TEMPLATE_CALL_RE = re.compile(r'\{\{.*?\}\}')
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_FENCE_RE = re.compile(r'^\s*(`{3,}|~{3,})')


def clean_body(body: str) -> str:
    """Remove images, videos, shortcodes, HTML, and comments from markdown body.

    Processing steps:

    1. HTML comments and ``<style>``/``<script>`` blocks (with or without
       attributes, any letter case) are removed across line boundaries.
    2. Line by line:
       - a line that is a single markdown image whose target has an
         image/video extension is dropped;
       - ``{{< ... >}}``, ``{{% ... %}}`` and other ``{{ ... }}`` calls are
         removed; a line that started with a shortcode is dropped when
         nothing else remains on it, or when the shortcode is not closed on
         that line, while any surrounding text on the line is kept;
       - HTML tags are removed but their inner text is kept, except inside
         fenced code blocks, where ``<...>`` is literal text such as
         ``helm install <release>``.
    3. Runs of three or more newlines collapse to one blank line, and the
       result is stripped of leading/trailing whitespace.

    Args:
        body: Markdown text, normally with front matter already removed.

    Returns:
        The cleaned markdown text.
    """
    body = _HTML_COMMENT_RE.sub('', body)
    body = _STYLE_BLOCK_RE.sub('', body)
    body = _SCRIPT_BLOCK_RE.sub('', body)

    cleaned = []
    open_fence = None  # marker (e.g. "```") of the fenced block we are inside
    for line in body.split('\n'):
        # Track fenced code blocks so their contents are not treated as HTML.
        on_fence_line = False
        fence_match = _FENCE_RE.match(line)
        if fence_match:
            marker = fence_match.group(1)
            if open_fence is None:
                open_fence = marker
                on_fence_line = True
            elif (marker[0] == open_fence[0]
                    and len(marker) >= len(open_fence)
                    and not line.strip().strip(marker[0])):
                # A closing fence uses the same character, is at least as
                # long as the opener, and carries no other text.
                open_fence = None
                on_fence_line = True
        in_code = on_fence_line or open_fence is not None

        # Skip image references (markdown images with image/video extensions)
        if _MEDIA_LINE_RE.match(line):
            continue

        # Remove shortcode tags first, then any remaining {{ ... }} calls.
        without_calls = _TEMPLATE_CALL_RE.sub('', _SHORTCODE_TAG_RE.sub('', line))

        if _SHORTCODE_START_RE.match(line):
            # Drop the line only when it held nothing but shortcode calls, or
            # when it opens a shortcode that is not closed on this line.
            if not without_calls.strip() or _SHORTCODE_START_RE.match(without_calls):
                continue
        line = without_calls

        # Remove remaining HTML tags but keep content (never inside code fences)
        if not in_code:
            line = _HTML_TAG_RE.sub('', line)

        cleaned.append(line)

    body = '\n'.join(cleaned)

    # Collapse 3+ consecutive newlines to 2
    body = re.sub(r'\n{3,}', '\n\n', body)

    return body.strip()
83+
84+
85+
def title_from_path(filepath: Path) -> str:
    """Build a human-readable title from a file's location.

    The file stem is used as the base name, except for ``_index``/``index``
    files, which take the name of their parent directory instead. Hyphens
    and underscores become spaces and the result is title-cased.
    """
    stem = filepath.stem
    base = filepath.parent.name if stem in ('_index', 'index') else stem
    # Map both separators to a space in a single pass.
    spaced = base.translate(str.maketrans('-_', '  '))
    return spaced.title()
92+
93+
94+
def heading_depth(relpath: Path, is_index: bool) -> int:
    """Return the markdown heading level for a document.

    The level is the number of components in *relpath*, reduced by one for
    index files, and limited to the range 2..4 inclusive.
    """
    level = len(relpath.parts) - (1 if is_index else 0)
    if level < 2:
        return 2
    if level > 4:
        return 4
    return level
102+
103+
104+
def main():
    """Build the digest described by the command-line arguments.

    Expects exactly three arguments: ``<content_dir> <output_file> <title>``.
    Every ``*.md`` file under ``content_dir`` is collected in sorted order,
    skipping anything under a ``releases`` path component, any file named
    ``helm-chart-values.md``, and the output file itself (so a digest written
    inside the content tree is not folded into the next run). Each remaining
    file is reduced to a title and cleaned body, and files whose cleaned body
    is empty are skipped. The result is written to ``output_file`` as UTF-8
    and a one-line summary is printed.

    Exits with status 1, after printing a message to stderr, when the
    argument count is wrong or ``content_dir`` is not a directory.
    """
    if len(sys.argv) != 4:
        print(f"Usage: {sys.argv[0]} <content_dir> <output_file> <title>", file=sys.stderr)
        sys.exit(1)

    content_dir = Path(sys.argv[1])
    output_file = Path(sys.argv[2])
    doc_title = sys.argv[3]

    # Without this check rglob() on a missing directory yields nothing and an
    # empty digest would be written silently.
    if not content_dir.is_dir():
        print(f"Error: content directory not found: {content_dir}", file=sys.stderr)
        sys.exit(1)

    output_resolved = output_file.resolve()

    # Collect all markdown files, excluding releases and helm-chart-values
    md_files = []
    for f in sorted(content_dir.rglob("*.md")):
        rel = f.relative_to(content_dir)

        # Exclude release notes
        if 'releases' in rel.parts:
            continue
        # Exclude helm chart values
        if f.name == 'helm-chart-values.md':
            continue
        # Exclude a previously generated digest living inside content_dir
        if f.resolve() == output_resolved:
            continue

        md_files.append(f)

    now = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    out_lines = [
        f"# {doc_title}\n",
        # Two trailing spaces force a markdown line break inside the quote.
        f"> Auto-generated documentation digest. Source: `{content_dir}`  ",
        f"> Generated: {now}\n",
        "---\n",
    ]

    file_count = 0
    for filepath in md_files:
        raw = filepath.read_text(encoding='utf-8', errors='replace')

        title = extract_title(raw) or title_from_path(filepath)
        body = clean_body(strip_front_matter(raw))

        # Skip empty files
        if not body.strip():
            continue

        rel = filepath.relative_to(content_dir)
        is_index = filepath.stem in ('_index', 'index')
        heading = '#' * heading_depth(rel, is_index)

        out_lines.append(f"\n{heading} {title}\n")
        out_lines.append(body)
        out_lines.append("\n\n---\n")
        file_count += 1

    digest = '\n'.join(out_lines)
    output_file.write_text(digest, encoding='utf-8')
    # Count from the in-memory text: re-reading the file with the locale's
    # default encoding can fail on systems where that is not UTF-8.
    total_lines = digest.count('\n') + 1
    print(f"Done: {total_lines} lines, {file_count} documents written to {output_file}")
161+
162+
163+
# Run the digest build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)