Changes from 4 commits
2,801 changes: 2,801 additions & 0 deletions parser/chapter1_chunks.json

Large diffs are not rendered by default.

1,352 changes: 1,352 additions & 0 deletions parser/chapter2_chunks.json

Large diffs are not rendered by default.

2,378 changes: 2,378 additions & 0 deletions parser/chapter3_chunks.json

Large diffs are not rendered by default.

8,786 changes: 8,786 additions & 0 deletions parser/chapter4_chunks.json

Large diffs are not rendered by default.

2,468 changes: 2,468 additions & 0 deletions parser/chapter5_chunks.json

Large diffs are not rendered by default.

553 changes: 553 additions & 0 deletions parser/chunking.py

Large diffs are not rendered by default.

116 changes: 116 additions & 0 deletions parser/parse_sicp.py
@@ -0,0 +1,116 @@
import os
import re
import xml.etree.ElementTree as ET
import html
import json

# Path to chapter folders
SICP_XML_DIR = os.path.join(os.path.dirname(__file__), "..", "xml")

def parse_file(file_path, parent_title=None, depth=0):
    """
    Recursively parse any XML file (chapter, section, or subsection).
    """
    indent = " " * depth  # for nice indentation in logs

    if not os.path.exists(file_path):
        print(f"{indent}⚠️ Missing file: {file_path}")
        return []

    print(f"{indent}📄 Parsing ({depth=}): {file_path}")

    # Parse and unescape
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
    except Exception as e:
        print(f"{indent}❌ XML parse error in {file_path}: {e}")
        return []

    xml_text = html.unescape(ET.tostring(root, encoding="unicode"))
    chunks = []

    # Identify tag type
    tag_type = root.tag.upper()
    if root.find("NAME") is not None:
        title = " ".join(root.find("NAME").itertext())
        title = re.sub(r"\s+", " ", title).strip()
    else:
        title = "Untitled"

    # Extract text paragraphs
    text_blocks = root.findall(".//TEXT")
    print(f"{indent}🧩 Found {len(text_blocks)} <TEXT> blocks in {os.path.basename(file_path)}")

    for i, t in enumerate(text_blocks, start=1):
        for bad_tag in ["INDEX", "LABEL", "CITATION", "FOOTNOTE", "COMMENT", "WEB_ONLY"]:
            for el in t.findall(f".//{bad_tag}"):
                el.clear()

        text_content = " ".join(t.itertext()).strip()
        text_content = re.sub(r"\s+", " ", text_content)

        if text_content:
            chunks.append({
                "source_file": os.path.basename(file_path),
                "tag_type": tag_type,
                "title": title,
                "parent_title": parent_title,
                "depth": depth,
                "paragraph_index": i,
                "content": text_content
            })

    # Look for section and subsection references
    section_refs = re.findall(r"&section([\d\.]+);", xml_text)
    subsection_refs = re.findall(r"&subsection([\d\.]+);", xml_text)

    if section_refs:
        print(f"{indent}🔍 Found {len(section_refs)} section ref(s): {section_refs}")
    if subsection_refs:
        print(f"{indent} ↳ Found {len(subsection_refs)} subsection ref(s): {subsection_refs}")

    # Recurse into sections
    for ref in section_refs:
        section_folder = os.path.join(os.path.dirname(file_path), f"section{ref.split('.')[0]}")
        section_file = os.path.join(section_folder, f"section{ref.split('.')[0]}.xml")
        print(f"{indent}➡️ Going into section file: {section_file}")
        chunks.extend(parse_file(section_file, parent_title=title, depth=depth + 1))

    # Recurse into subsections
    for ref in subsection_refs:
        subsection_file = os.path.join(os.path.dirname(file_path), f"subsection{ref.split('.')[0]}.xml")
Comment on lines +75 to +82

Copilot AI Jan 12, 2026

The section reference extraction logic uses split('.')[0] which only takes the first part before a dot. For references like "&section1.2.3;", this would incorrectly extract only "1" instead of the full identifier "1.2.3". This will cause the wrong section files to be loaded.

Suggested change:

-        section_folder = os.path.join(os.path.dirname(file_path), f"section{ref.split('.')[0]}")
-        section_file = os.path.join(section_folder, f"section{ref.split('.')[0]}.xml")
+        section_folder = os.path.join(os.path.dirname(file_path), f"section{ref}")
+        section_file = os.path.join(section_folder, f"section{ref}.xml")
         print(f"{indent}➡️ Going into section file: {section_file}")
         chunks.extend(parse_file(section_file, parent_title=title, depth=depth + 1))

     # Recurse into subsections
     for ref in subsection_refs:
-        subsection_file = os.path.join(os.path.dirname(file_path), f"subsection{ref.split('.')[0]}.xml")
+        subsection_file = os.path.join(os.path.dirname(file_path), f"subsection{ref}.xml")
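The failure mode is easy to reproduce in isolation (a hypothetical snippet, not part of the PR or the suggestion):

    ref = "1.2.3"             # a full identifier as captured by the regex
    print(ref.split('.')[0])  # prints "1", so the parser would open section1/section1.xml instead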
Copilot AI Jan 12, 2026

The subsection reference extraction uses split('.')[0] which is incorrect. Subsection references typically include the full path (e.g., "1.2.3"), so this will extract only the first number and construct the wrong file path.

Suggested change:

-        subsection_file = os.path.join(os.path.dirname(file_path), f"subsection{ref.split('.')[0]}.xml")
+        subsection_file = os.path.join(os.path.dirname(file_path), f"subsection{ref}.xml")
print(f"{indent}➡️ Going into subsection file: {subsection_file}")
chunks.extend(parse_file(subsection_file, parent_title=title, depth=depth + 1))

print(f"{indent}✅ Done parsing {os.path.basename(file_path)}, total chunks so far: {len(chunks)}\n")
return chunks
Comment on lines +10 to +87

Copilot AI Jan 12, 2026

The code lacks any mechanism to prevent infinite recursion if there are circular references between XML files. While chunking.py has a visited set to track processed files, parse_sicp.py does not, so a reference cycle would keep recursing until it hits Python's recursion limit (RecursionError).
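A minimal sketch of such a guard, threading a shared visited set through the recursion (the parameter name and placement are illustrative, not taken from chunking.py's actual implementation):

    def parse_file(file_path, parent_title=None, depth=0, visited=None):
        if visited is None:
            visited = set()
        # Normalize so the same file reached via different relative paths matches
        key = os.path.realpath(file_path)
        if key in visited:
            print(f"↩️ Skipping already-parsed file (possible cycle): {file_path}")
            return []
        visited.add(key)
        # ... body unchanged, but every recursive call passes the set along:
        # chunks.extend(parse_file(section_file, parent_title=title,
        #                          depth=depth + 1, visited=visited))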

if __name__ == "__main__":
    print("🚀 Starting full SICP parse\n")

    # ✅ Automatically detect all chapter folders (chapter1, chapter2, ...)
    for chapter_dir in sorted(os.listdir(SICP_XML_DIR)):
        if not chapter_dir.startswith("chapter"):
            continue

        chapter_path = os.path.join(SICP_XML_DIR, chapter_dir, f"{chapter_dir}.xml")
        if not os.path.exists(chapter_path):
            print(f"⚠️ Skipping {chapter_dir}: main XML not found\n")
            continue

        print(f"\n==============================")
        print(f"📘 Parsing {chapter_dir}")
        print(f"==============================")

        all_chunks = parse_file(chapter_path)
        print(f"✅ Extracted {len(all_chunks)} chunks for {chapter_dir}\n")

        # Save separate JSON for each chapter
        out_path = os.path.join(os.path.dirname(__file__), f"{chapter_dir}_chunks.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(all_chunks, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved {chapter_dir}_chunks.json ({len(all_chunks)} chunks)\n")

    print("🏁 All chapters processed successfully!")
5,382 changes: 5,382 additions & 0 deletions parser/sicp_mesochunks_semantic_rag.json

Large diffs are not rendered by default.
