-
Notifications
You must be signed in to change notification settings - Fork 140
Added pre-processing for implementing RAG for Louis:Chatbot #1118
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 4 commits
4e827f6
4cec811
310941c
e6a0a73
1928f17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,116 @@ | ||||||
| import os | ||||||
| import re | ||||||
| import xml.etree.ElementTree as ET | ||||||
| import html | ||||||
| import json | ||||||
|
|
||||||
# Path to chapter folders
# Resolved relative to this script's location: the sibling "xml" directory
# one level above the package (i.e. <repo>/xml).
SICP_XML_DIR = os.path.join(os.path.dirname(__file__), "..", "xml")
|
|
||||||
def parse_file(file_path, parent_title=None, depth=0, visited=None):
    """
    Recursively parse any XML file (chapter, section, or subsection).

    Extracts every non-empty <TEXT> block into a flat chunk dict, then
    follows section/subsection entity references into child files.

    Args:
        file_path: Path to the XML file to parse.
        parent_title: Title of the enclosing chapter/section, or None at the root.
        depth: Recursion depth; used for log indentation and chunk metadata.
        visited: Set of absolute paths already parsed. Created on the first
            call; passed down so circular references between files cannot
            cause infinite recursion. (Backward-compatible addition.)

    Returns:
        list[dict]: One dict per non-empty <TEXT> block, with keys
        source_file, tag_type, title, parent_title, depth,
        paragraph_index, and content. Empty list on missing file or
        XML parse error.
    """
    indent = " " * depth  # for nice indentation in logs

    # Guard against circular references between XML files: without this,
    # two files referencing each other would recurse until stack overflow.
    if visited is None:
        visited = set()
    abs_path = os.path.abspath(file_path)
    if abs_path in visited:
        print(f"{indent}🔁 Skipping already-visited file: {file_path}")
        return []
    visited.add(abs_path)

    if not os.path.exists(file_path):
        print(f"{indent}⚠️ Missing file: {file_path}")
        return []

    print(f"{indent}📄 Parsing ({depth=}): {file_path}")

    # Parse and unescape
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
    except Exception as e:
        print(f"{indent}❌ XML parse error in {file_path}: {e}")
        return []

    # Serialized text is used only for spotting entity-style cross references.
    xml_text = html.unescape(ET.tostring(root, encoding="unicode"))
    chunks = []

    # Identify tag type (CHAPTER / SECTION / SUBSECTION)
    tag_type = root.tag.upper()
    if root.find("NAME") is not None:
        title = " ".join(root.find("NAME").itertext())
        title = re.sub(r"\s+", " ", title).strip()
    else:
        title = "Untitled"

    # Extract text paragraphs
    text_blocks = root.findall(".//TEXT")
    print(f"{indent}🧩 Found {len(text_blocks)} <TEXT> blocks in {os.path.basename(file_path)}")

    for i, t in enumerate(text_blocks, start=1):
        # Drop markup that must not leak into chunk content.
        # NOTE(review): Element.clear() also wipes the element's tail text,
        # so prose immediately following one of these tags is lost — confirm
        # this is acceptable for the corpus.
        for bad_tag in ["INDEX", "LABEL", "CITATION", "FOOTNOTE", "COMMENT", "WEB_ONLY"]:
            for el in t.findall(f".//{bad_tag}"):
                el.clear()

        text_content = " ".join(t.itertext()).strip()
        text_content = re.sub(r"\s+", " ", text_content)

        if text_content:
            chunks.append({
                "source_file": os.path.basename(file_path),
                "tag_type": tag_type,
                "title": title,
                "parent_title": parent_title,
                "depth": depth,
                "paragraph_index": i,
                "content": text_content
            })

    # Look for section and subsection references.
    # The odd "§ion" pattern is deliberate: html.unescape() converts the
    # leading "&sect" of "&section1.2;" into "§" (legacy no-semicolon
    # entity), while "&subsection..." has no such prefix and stays literal.
    section_refs = re.findall(r"§ion([\d\.]+);", xml_text)
    subsection_refs = re.findall(r"&subsection([\d\.]+);", xml_text)

    if section_refs:
        print(f"{indent}🔍 Found {len(section_refs)} section ref(s): {section_refs}")
    if subsection_refs:
        print(f"{indent} ↳ Found {len(subsection_refs)} subsection ref(s): {subsection_refs}")

    # Recurse into sections
    # TODO(review): for a ref like "1.2", split('.')[0] yields "1" and loads
    # section1/section1.xml — confirm against the xml/ directory layout
    # whether the first or last component names the section folder.
    for ref in section_refs:
        section_folder = os.path.join(os.path.dirname(file_path), f"section{ref.split('.')[0]}")
        section_file = os.path.join(section_folder, f"section{ref.split('.')[0]}.xml")
        print(f"{indent}➡️ Going into section file: {section_file}")
        chunks.extend(parse_file(section_file, parent_title=title, depth=depth + 1, visited=visited))

    # Recurse into subsections
    for ref in subsection_refs:
        # Use the FULL reference id: ref "1.2" -> subsection1.2.xml.
        # (The previous ref.split('.')[0] collapsed every subsection
        # reference to subsection1.xml, loading the wrong file.)
        subsection_file = os.path.join(os.path.dirname(file_path), f"subsection{ref}.xml")
        print(f"{indent}➡️ Going into subsection file: {subsection_file}")
        chunks.extend(parse_file(subsection_file, parent_title=title, depth=depth + 1, visited=visited))

    return chunks
|
||||||
| subsection_file = os.path.join(os.path.dirname(file_path), f"subsection{ref.split('.')[0]}.xml") | |
| subsection_file = os.path.join(os.path.dirname(file_path), f"subsection{ref}.xml") |
Copilot
AI
Jan 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The code lacks any mechanism to prevent infinite recursion if there are circular references between XML files. While chunking.py has a visited set to track processed files, parse_sicp.py does not have this protection, which could lead to stack overflow errors.
Large diffs are not rendered by default.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The section reference extraction logic uses `split('.')[0]`, which only takes the first part before a dot. For references like `&section1.2.3;`, this would incorrectly extract only "1" instead of the full identifier "1.2.3". This will cause the wrong section files to be loaded.