Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/claude-code-review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ on:

jobs:
claude-review:
if: github.event.pull_request.head.repo.full_name == github.repository
# Optional: Filter by PR author
# if: |
# github.event.pull_request.user.login == 'external-contributor' ||
Expand Down
85 changes: 49 additions & 36 deletions OfficeIMO.Pdf/Reading/Core/PdfReadDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,15 @@ private List<PdfReadPage> CollectPages() {
if (catalogId is int cat && _objects.TryGetValue(cat, out var catObj) && catObj.Value is PdfDictionary catalog) {
var pagesNode = ResolveDict(catalog.Items.TryGetValue("Pages", out var v) ? v : null);
if (pagesNode is not null) {
var kids = pagesNode.Get<PdfArray>("Kids");
var kids = ResolveArray(pagesNode.Items.TryGetValue("Kids", out var kidsObj) ? kidsObj : null);
int kidCount = kids?.Items.Count ?? 0;
var visited = new HashSet<int>();
var contentKeys = new HashSet<string>(StringComparer.Ordinal);
int? target = null;
var cnt = pagesNode.Get<PdfNumber>("Count");
if (cnt is not null) {
int cc = (int)cnt.Value; if (cc > 0) target = cc;
}
TraversePagesNodeDeepLimited(pagesNode, visited, contentKeys, result, target);
TraversePagesNodeDeepLimited(pagesNode, visited, result, target);
if (result.Count == 0 && kidCount > 0) {
// Build a reachable candidate set from Kids only
var reachable = CollectReachableLeafCandidates(pagesNode);
Expand Down Expand Up @@ -92,6 +91,13 @@ private List<PdfReadPage> CollectPages() {
return null;
}

private PdfArray? ResolveArray(PdfObject? obj) {
if (obj is null) return null;
if (obj is PdfArray a) return a;
if (obj is PdfReference r && _objects.TryGetValue(r.ObjectNumber, out var ind) && ind.Value is PdfArray aa) return aa;
return null;
}

private void TraversePagesNode(PdfDictionary node, List<PdfReadPage> outList) {
var type = node.Get<PdfName>("Type")?.Name;
if (type == "Page" || (type is null && IsLikelyPage(node))) {
Expand All @@ -100,8 +106,9 @@ private void TraversePagesNode(PdfDictionary node, List<PdfReadPage> outList) {
outList.Add(new PdfReadPage(objNum, node, _objects));
return;
}
if (type == "Pages" || (type is null && node.Get<PdfArray>("Kids") is not null)) {
var kids = node.Get<PdfArray>("Kids");
var kidsObj = node.Items.TryGetValue("Kids", out var kidsValue) ? kidsValue : null;
if (type == "Pages" || (type is null && ResolveArray(kidsObj) is not null)) {
var kids = ResolveArray(kidsObj);
if (kids is null) return;
foreach (var kid in kids.Items) {
var d = ResolveDict(kid);
Expand All @@ -111,45 +118,40 @@ private void TraversePagesNode(PdfDictionary node, List<PdfReadPage> outList) {
}
}

private static bool IsLikelyPage(PdfDictionary d) {
// Heuristic when /Type is missing: leaf node has Contents, and either its own Resources or MediaBox.
private bool IsLikelyPage(PdfDictionary d) {
// Heuristic when /Type is missing: leaf node has Contents, and page data can come from itself or inherited /Pages nodes.
bool hasContents = d.Items.ContainsKey("Contents");
bool hasRes = d.Items.ContainsKey("Resources");
bool hasMedia = d.Items.ContainsKey("MediaBox") || d.Items.ContainsKey("CropBox");
bool hasKids = d.Items.ContainsKey("Kids");
bool hasRes = d.Items.ContainsKey("Resources") || HasInheritedValue(d, "Resources");
bool hasMedia = HasMedia(d) || HasInheritedValue(d, "MediaBox") || HasInheritedValue(d, "CropBox");
bool hasKids = ResolveArray(d.Items.TryGetValue("Kids", out var kidsObj) ? kidsObj : null) is not null;
return !hasKids && hasContents && (hasRes || hasMedia);
}

private void TraversePagesNodeDeepLimited(PdfDictionary node, HashSet<int> visited, HashSet<string> contentKeys, List<PdfReadPage> outList, int? limit) {
private void TraversePagesNodeDeepLimited(PdfDictionary node, HashSet<int> visited, List<PdfReadPage> outList, int? limit) {
var type = node.Get<PdfName>("Type")?.Name;
if (type == "Page" || (type is null && IsLikelyPage(node))) {
int objNum = FindObjectNumberFor(node);
if (objNum > 0 && visited.Add(objNum)) {
if (HasMedia(node)) {
var key = ContentsKey(node);
if (key is null || contentKeys.Add(key)) {
outList.Add(new PdfReadPage(objNum, node, _objects));
}
if (type == "Page" || HasMedia(node) || HasInheritedValue(node, "MediaBox") || HasInheritedValue(node, "CropBox")) {
outList.Add(new PdfReadPage(objNum, node, _objects));
}
}
return;
}
var kids = node.Get<PdfArray>("Kids");
var kids = ResolveArray(node.Items.TryGetValue("Kids", out var kidsObj) ? kidsObj : null);
if (kids is null) return;
foreach (var kid in kids.Items) {
if (limit.HasValue && outList.Count >= limit.Value) return;
var d = ResolveDict(kid);
if (d is null) { continue; }
var t = d.Get<PdfName>("Type")?.Name;
if (t == "Pages" || (t is null && d.Get<PdfArray>("Kids") is not null)) TraversePagesNodeDeepLimited(d, visited, contentKeys, outList, limit);
else if ((t == "Page" || IsLikelyPage(d) || IsLeafPageByParent(d)) && HasMedia(d)) {
if (t == "Pages" || (t is null && ResolveArray(d.Items.TryGetValue("Kids", out var dKidsObj) ? dKidsObj : null) is not null)) TraversePagesNodeDeepLimited(d, visited, outList, limit);
else if ((t == "Page" || IsLikelyPage(d) || IsLeafPageByParent(d)) &&
(t == "Page" || HasMedia(d) || HasInheritedValue(d, "MediaBox") || HasInheritedValue(d, "CropBox"))) {
int on = FindObjectNumberFor(d);
if (on > 0 && visited.Add(on)) {
var key = ContentsKey(d);
if (key is null || contentKeys.Add(key)) {
outList.Add(new PdfReadPage(on, d, _objects));
if (limit.HasValue && outList.Count >= limit.Value) return;
}
outList.Add(new PdfReadPage(on, d, _objects));
if (limit.HasValue && outList.Count >= limit.Value) return;
}
}
}
Expand All @@ -162,13 +164,13 @@ private HashSet<int> CollectReachableLeafCandidates(PdfDictionary pagesRoot) {
int guard = 0;
while (stack.Count > 0 && guard++ < 10000) {
var cur = stack.Pop();
var kids = cur.Get<PdfArray>("Kids");
var kids = ResolveArray(cur.Items.TryGetValue("Kids", out var kidsObj) ? kidsObj : null);
if (kids is null) continue;
foreach (var k in kids.Items) {
var d = ResolveDict(k);
if (d is null) continue;
var t = d.Get<PdfName>("Type")?.Name;
if (t == "Pages" || (t is null && d.Get<PdfArray>("Kids") is not null)) stack.Push(d);
if (t == "Pages" || (t is null && ResolveArray(d.Items.TryGetValue("Kids", out var dKidsObj) ? dKidsObj : null) is not null)) stack.Push(d);
else if (IsLikelyPage(d) || IsLeafPageByParent(d)) {
int on = FindObjectNumberFor(d);
if (on > 0) set.Add(on);
Expand All @@ -193,19 +195,30 @@ private bool IsLeafPageByParent(PdfDictionary d) {
return false;
}

private static bool HasMedia(PdfDictionary d) => d.Items.ContainsKey("MediaBox") || d.Items.ContainsKey("CropBox");
private static string? ContentsKey(PdfDictionary d) {
if (!d.Items.TryGetValue("Contents", out var c)) return null;
if (c is PdfReference r) return $"ref:{r.ObjectNumber}";
if (c is PdfArray arr) {
var sb = new System.Text.StringBuilder();
sb.Append("arr:");
foreach (var it in arr.Items) if (it is PdfReference rr) sb.Append(rr.ObjectNumber).Append(',');
return sb.ToString();
private bool HasInheritedValue(PdfDictionary start, string key) {
PdfDictionary? current = start;
int guard = 0;
while (current is not null && guard++ < 100) {
if (current.Items.ContainsKey(key)) {
return true;
}

if (!current.Items.TryGetValue("Parent", out var parentObj)) {
break;
}

var parent = ResolveDict(parentObj);
if (parent is null) {
break;
}

current = parent;
}
return null;

return false;
}

private static bool HasMedia(PdfDictionary d) => d.Items.ContainsKey("MediaBox") || d.Items.ContainsKey("CropBox");
private int FindObjectNumberFor(PdfDictionary dict) {
foreach (var kv in _objects) if (ReferenceEquals(kv.Value.Value, dict)) return kv.Key;
// As a fallback when dictionary was re-parsed separately, match by identity via a simple scan of Page objects
Expand Down
Loading
Loading