Skip to content

Commit 48a4623

Browse files
ENH: fix the dollar sign in DeepSeekOCR
1 parent e5ac1b2 commit 48a4623

1 file changed

Lines changed: 216 additions & 0 deletions

File tree

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
#!/usr/bin/env python3
2+
r"""Replace broken dollar markers in .mmd files using heuristic-based selection.
3+
4+
Heuristic A (pair-based):
5+
- Adjacent marker pair "\(" then "\)" with no curly braces between them.
6+
7+
Heuristic B (money-context):
8+
- Marker appears to precede an amount-like token or nearby money phrasing.
9+
- Excludes obvious math-like markup such as "\( _{2}" and "\( ^{TM}".
10+
11+
Selection strategies:
12+
- money: use only money-context markers (higher recall; default).
13+
- overlap: use intersection of pair-based and money-context markers (higher precision).
14+
15+
Always-on exact rule:
16+
- Replace exact table cell markers "<td>\(</td>" and "<td>\)</td>".
17+
"""
18+
19+
from __future__ import annotations
20+
21+
import argparse
22+
import re
23+
from pathlib import Path
24+
from typing import Iterable
25+
26+
# One marker token: the literal two-character sequence \( or \).
MARKER_RE = re.compile(r"\\\(|\\\)")

# Text following a marker that starts like an amount: optional whitespace,
# an optional "(" or "-", then a digit followed by digits, commas and an
# optional decimal point.
MONEY_NUM_RE = re.compile(r"^\s*[\(\-]?\d(?:[\d,]*\.?\d*)")

# A magnitude word that begins within the first 24 characters after a marker.
MONEY_WORD_RE = re.compile(
    r"^.{0,24}\b(?:million|billion|thousand)\b",
    re.IGNORECASE,
)

# A pricing phrase that begins within the first 30 characters after a marker.
MONEY_PHRASE_RE = re.compile(
    r"^.{0,30}\b(?:per\s+share|per\s+ton|per\s+gallon|per\s+bushel|market\s+value)\b",
    re.IGNORECASE,
)

# Text following a marker that opens a brace group, optionally introduced by
# "_" or "^" (e.g. " _{2}" or "^{TM}"); used to rule out math-like markup.
MATHISH_RE = re.compile(r"^\s*[_\^]?\s*\{")

# An HTML table cell whose entire content is one marker; group 1 is the marker.
EXACT_TD_RE = re.compile(r"<td>(\\\(|\\\))</td>")
35+
36+
37+
def iter_mmd_files(root: Path) -> Iterable[Path]:
    """Yield every regular ``*.mmd`` file found recursively under *root*.

    Paths are yielded in sorted order so that processing order and any
    per-file output are reproducible from run to run instead of depending on
    the order in which the filesystem happens to list directory entries.
    Entries that match the pattern but are not regular files (for example a
    directory named ``x.mmd``) are skipped.

    Note: sorting collects all matches before the first path is yielded.
    """
    for path in sorted(root.rglob("*.mmd")):
        if path.is_file():
            yield path
41+
42+
43+
def get_markers(text: str) -> list[tuple[int, str]]:
    """Locate every marker token in *text*.

    Returns a list of ``(offset, token)`` tuples in order of appearance,
    where ``offset`` is the start index of a ``MARKER_RE`` match and
    ``token`` is the matched text.
    """
    found: list[tuple[int, str]] = []
    for match in MARKER_RE.finditer(text):
        found.append((match.start(), match.group(0)))
    return found
45+
46+
47+
def select_user_markers(text: str, markers: list[tuple[int, str]]) -> set[int]:
    """Pair-based heuristic: pick consecutive ``\\(`` / ``\\)`` marker pairs.

    Walks *markers* two neighbours at a time. A pair is selected when the
    first token is ``\\(``, the second is ``\\)`` and the text between them
    contains neither ``{`` nor ``}``.

    Returns the set of offsets of both markers of every selected pair.
    """
    picked: set[int] = set()
    for (open_pos, open_tok), (close_pos, close_tok) in zip(markers, markers[1:]):
        is_open_close = open_tok == r"\(" and close_tok == r"\)"
        if not is_open_close:
            continue
        # Skip the two characters of the opening marker itself.
        enclosed = text[open_pos + 2 : close_pos]
        if any(brace in enclosed for brace in ("{", "}")):
            continue
        picked.update((open_pos, close_pos))
    return picked
60+
61+
62+
def select_money_context_markers(text: str, markers: list[tuple[int, str]]) -> set[int]:
    """Money-context heuristic: pick markers followed by money-like text.

    For each marker, the 64 characters after the two-character token are
    inspected. Markers whose following text matches ``MATHISH_RE`` (math-like
    constructions such as ``\\( _{...}``, ``\\( ^{...}`` or ``\\({ ...``) are
    rejected outright. Otherwise the marker is selected when the text matches
    any of ``MONEY_NUM_RE``, ``MONEY_WORD_RE`` or ``MONEY_PHRASE_RE``. The
    marker token itself (opening vs. closing) is not considered.

    Returns the set of offsets of the selected markers.
    """
    money_patterns = (MONEY_NUM_RE, MONEY_WORD_RE, MONEY_PHRASE_RE)
    hits: set[int] = set()
    for offset, _ in markers:
        lookahead = text[offset + 2 : offset + 66]
        if MATHISH_RE.match(lookahead) is not None:
            continue
        if any(pattern.match(lookahead) for pattern in money_patterns):
            hits.add(offset)
    return hits
79+
80+
81+
def select_exact_td_markers(text: str) -> set[int]:
    """Return offsets of markers that are the sole content of a ``<td>`` cell.

    Each offset is the start of group 1 of an ``EXACT_TD_RE`` match, i.e. the
    position of the marker token itself inside cells like ``<td>\\(</td>``,
    not the position of the surrounding tag.
    """
    offsets: set[int] = set()
    for cell in EXACT_TD_RE.finditer(text):
        offsets.add(cell.start(1))
    return offsets
84+
85+
86+
def apply_replacements(
    text: str, markers: list[tuple[int, str]], positions: set[int]
) -> tuple[str, int]:
    """Replace the selected markers in *text* with ``$``.

    Args:
        text: The original document text.
        markers: ``(offset, token)`` tuples, visited in the given order.
        positions: Offsets of the markers to replace. Offsets that do not
            belong to any entry in *markers* have no effect.

    Returns:
        A ``(new_text, count)`` tuple where ``count`` is the number of
        markers replaced. When *positions* is empty the original text is
        returned unchanged with a count of 0.
    """
    if not positions:
        return text, 0

    chosen = [offset for offset, _ in markers if offset in positions]

    pieces: list[str] = []
    resume_at = 0
    for offset in chosen:
        pieces.extend((text[resume_at:offset], "$"))
        # Every marker token is two characters long; continue right after it.
        resume_at = offset + 2
    pieces.append(text[resume_at:])

    return "".join(pieces), len(chosen)
105+
106+
107+
def process_file(path: Path, dry_run: bool, strategy: str) -> dict[str, int]:
    """Analyse one file and, unless *dry_run* is set, rewrite it in place.

    Args:
        path: File to process; read and written as UTF-8.
        dry_run: When true, compute the statistics but never write the file.
        strategy: ``"money"`` replaces every money-context marker;
            ``"overlap"`` replaces only markers selected by both the
            pair-based and the money-context heuristic. Exact ``<td>`` cell
            markers are replaced under either strategy.

    Returns:
        A dict of counts with the keys ``markers``, ``user``, ``money``,
        ``overlap``, ``td_exact``, ``replaced`` and ``changed`` (``changed``
        is 1 when at least one replacement was made, otherwise 0).

    Raises:
        ValueError: If *strategy* is not one of the supported names.
    """
    # newline="" turns off newline translation, so the file's existing line
    # endings (LF or CRLF) survive the round trip and only marker tokens are
    # altered when the file is written back.
    with path.open(encoding="utf-8", newline="") as handle:
        text = handle.read()
    markers = get_markers(text)

    user_positions = select_user_markers(text, markers)
    money_positions = select_money_context_markers(text, markers)
    overlap = user_positions & money_positions
    td_exact_positions = select_exact_td_markers(text)

    if strategy == "money":
        selected_positions = money_positions | td_exact_positions
    elif strategy == "overlap":
        selected_positions = overlap | td_exact_positions
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    updated_text, replaced = apply_replacements(text, markers, selected_positions)

    changed = int(replaced > 0)
    if replaced > 0 and not dry_run:
        with path.open("w", encoding="utf-8", newline="") as handle:
            handle.write(updated_text)

    return {
        "markers": len(markers),
        "user": len(user_positions),
        "money": len(money_positions),
        "overlap": len(overlap),
        "td_exact": len(td_exact_positions),
        "replaced": replaced,
        "changed": changed,
    }
138+
139+
140+
def main() -> int:
    """Command-line entry point.

    Parses the arguments, processes every ``.mmd`` file under the given
    directory with the chosen strategy, optionally prints per-file
    replacement counts, and finally prints a ``KEY=value`` summary.

    Returns:
        0 on completion.

    Raises:
        SystemExit: If the given path does not exist or is not a directory.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Replace broken dollar markers in .mmd files using heuristic-based "
            "selection."
        )
    )
    parser.add_argument(
        "directory", type=Path, help="Root directory to scan recursively"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Compute and report changes without writing files",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print per-file replacement counts",
    )
    parser.add_argument(
        "--strategy",
        choices=("money", "overlap"),
        default="money",
        help=(
            "Replacement selection strategy: 'money' (higher recall, default) "
            "or 'overlap' (higher precision)."
        ),
    )
    args = parser.parse_args()

    root = args.directory
    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"Directory not found or not a directory: {root}")

    # Per-file statistics that are summed across all processed files.
    stat_keys = (
        "markers",
        "user",
        "money",
        "overlap",
        "td_exact",
        "replaced",
        "changed",
    )
    totals = dict.fromkeys(("files", *stat_keys), 0)

    for path in iter_mmd_files(root):
        stats = process_file(path, dry_run=args.dry_run, strategy=args.strategy)
        totals["files"] += 1
        for key in stat_keys:
            totals[key] += stats[key]

        if args.verbose and stats["replaced"] > 0:
            print(f"{path}: replacements={stats['replaced']}")

    # Summary lines, printed in this order as LABEL=value.
    summary = (
        ("MODE", "DRY RUN" if args.dry_run else "APPLY"),
        ("STRATEGY", args.strategy),
        ("FILES_SCANNED", totals["files"]),
        ("TOTAL_MARKER_TOKENS", totals["markers"]),
        ("USER_HEURISTIC_TOTAL", totals["user"]),
        ("MONEY_HEURISTIC_TOTAL", totals["money"]),
        ("OVERLAP_TOTAL", totals["overlap"]),
        ("EXACT_TD_TOTAL", totals["td_exact"]),
        ("REPLACEMENTS", totals["replaced"]),
        ("FILES_CHANGED", totals["changed"]),
    )
    for label, value in summary:
        print(f"{label}={value}")

    return 0
213+
214+
215+
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)

0 commit comments

Comments
 (0)