Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 64 additions & 20 deletions OfficeIMO.Pdf/Reading/Core/PdfReadDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ private List<PdfReadPage> CollectPages() {
if (catalogId is int cat && _objects.TryGetValue(cat, out var catObj) && catObj.Value is PdfDictionary catalog) {
var pagesNode = ResolveDict(catalog.Items.TryGetValue("Pages", out var v) ? v : null);
if (pagesNode is not null) {
var kids = pagesNode.Get<PdfArray>("Kids");
var kids = ResolveArray(pagesNode.Items.TryGetValue("Kids", out var kidsObj) ? kidsObj : null);
int kidCount = kids?.Items.Count ?? 0;
var visited = new HashSet<int>();
var contentKeys = new HashSet<string>(StringComparer.Ordinal);
Expand Down Expand Up @@ -92,6 +92,13 @@ private List<PdfReadPage> CollectPages() {
return null;
}

private PdfArray? ResolveArray(PdfObject? obj) {
if (obj is null) return null;
if (obj is PdfArray a) return a;
if (obj is PdfReference r && _objects.TryGetValue(r.ObjectNumber, out var ind) && ind.Value is PdfArray aa) return aa;
return null;
}

private void TraversePagesNode(PdfDictionary node, List<PdfReadPage> outList) {
var type = node.Get<PdfName>("Type")?.Name;
if (type == "Page" || (type is null && IsLikelyPage(node))) {
Expand All @@ -100,8 +107,9 @@ private void TraversePagesNode(PdfDictionary node, List<PdfReadPage> outList) {
outList.Add(new PdfReadPage(objNum, node, _objects));
return;
}
if (type == "Pages" || (type is null && node.Get<PdfArray>("Kids") is not null)) {
var kids = node.Get<PdfArray>("Kids");
var kidsObj = node.Items.TryGetValue("Kids", out var kidsValue) ? kidsValue : null;
if (type == "Pages" || (type is null && ResolveArray(kidsObj) is not null)) {
var kids = ResolveArray(kidsObj);
if (kids is null) return;
foreach (var kid in kids.Items) {
var d = ResolveDict(kid);
Expand All @@ -111,12 +119,12 @@ private void TraversePagesNode(PdfDictionary node, List<PdfReadPage> outList) {
}
}

private static bool IsLikelyPage(PdfDictionary d) {
// Heuristic when /Type is missing: leaf node has Contents, and either its own Resources or MediaBox.
private bool IsLikelyPage(PdfDictionary d) {
// Heuristic when /Type is missing: leaf node has Contents, and page data can come from itself or inherited /Pages nodes.
bool hasContents = d.Items.ContainsKey("Contents");
bool hasRes = d.Items.ContainsKey("Resources");
bool hasMedia = d.Items.ContainsKey("MediaBox") || d.Items.ContainsKey("CropBox");
bool hasKids = d.Items.ContainsKey("Kids");
bool hasRes = d.Items.ContainsKey("Resources") || HasInheritedValue(d, "Resources");
bool hasMedia = HasMedia(d) || HasInheritedValue(d, "MediaBox") || HasInheritedValue(d, "CropBox");
bool hasKids = ResolveArray(d.Items.TryGetValue("Kids", out var kidsObj) ? kidsObj : null) is not null;
return !hasKids && hasContents && (hasRes || hasMedia);
}

Expand All @@ -125,7 +133,7 @@ private void TraversePagesNodeDeepLimited(PdfDictionary node, HashSet<int> visit
if (type == "Page" || (type is null && IsLikelyPage(node))) {
int objNum = FindObjectNumberFor(node);
if (objNum > 0 && visited.Add(objNum)) {
if (HasMedia(node)) {
if (type == "Page" || HasMedia(node) || HasInheritedValue(node, "MediaBox") || HasInheritedValue(node, "CropBox")) {
var key = ContentsKey(node);
if (key is null || contentKeys.Add(key)) {
outList.Add(new PdfReadPage(objNum, node, _objects));
Expand All @@ -134,15 +142,16 @@ private void TraversePagesNodeDeepLimited(PdfDictionary node, HashSet<int> visit
}
return;
}
var kids = node.Get<PdfArray>("Kids");
var kids = ResolveArray(node.Items.TryGetValue("Kids", out var kidsObj) ? kidsObj : null);
if (kids is null) return;
foreach (var kid in kids.Items) {
if (limit.HasValue && outList.Count >= limit.Value) return;
var d = ResolveDict(kid);
if (d is null) { continue; }
var t = d.Get<PdfName>("Type")?.Name;
if (t == "Pages" || (t is null && d.Get<PdfArray>("Kids") is not null)) TraversePagesNodeDeepLimited(d, visited, contentKeys, outList, limit);
else if ((t == "Page" || IsLikelyPage(d) || IsLeafPageByParent(d)) && HasMedia(d)) {
if (t == "Pages" || (t is null && ResolveArray(d.Items.TryGetValue("Kids", out var dKidsObj) ? dKidsObj : null) is not null)) TraversePagesNodeDeepLimited(d, visited, contentKeys, outList, limit);
else if ((t == "Page" || IsLikelyPage(d) || IsLeafPageByParent(d)) &&
(t == "Page" || HasMedia(d) || HasInheritedValue(d, "MediaBox") || HasInheritedValue(d, "CropBox"))) {
int on = FindObjectNumberFor(d);
if (on > 0 && visited.Add(on)) {
var key = ContentsKey(d);
Expand All @@ -162,13 +171,13 @@ private HashSet<int> CollectReachableLeafCandidates(PdfDictionary pagesRoot) {
int guard = 0;
while (stack.Count > 0 && guard++ < 10000) {
var cur = stack.Pop();
var kids = cur.Get<PdfArray>("Kids");
var kids = ResolveArray(cur.Items.TryGetValue("Kids", out var kidsObj) ? kidsObj : null);
if (kids is null) continue;
foreach (var k in kids.Items) {
var d = ResolveDict(k);
if (d is null) continue;
var t = d.Get<PdfName>("Type")?.Name;
if (t == "Pages" || (t is null && d.Get<PdfArray>("Kids") is not null)) stack.Push(d);
if (t == "Pages" || (t is null && ResolveArray(d.Items.TryGetValue("Kids", out var dKidsObj) ? dKidsObj : null) is not null)) stack.Push(d);
else if (IsLikelyPage(d) || IsLeafPageByParent(d)) {
int on = FindObjectNumberFor(d);
if (on > 0) set.Add(on);
Expand All @@ -193,19 +202,54 @@ private bool IsLeafPageByParent(PdfDictionary d) {
return false;
}

private bool HasInheritedValue(PdfDictionary start, string key) {
PdfDictionary? current = start;
int guard = 0;
while (current is not null && guard++ < 100) {
if (current.Items.ContainsKey(key)) {
return true;
}

if (!current.Items.TryGetValue("Parent", out var parentObj)) {
break;
}

var parent = ResolveDict(parentObj);
if (parent is null) {
break;
}

current = parent;
}

return false;
}

private static bool HasMedia(PdfDictionary d) => d.Items.ContainsKey("MediaBox") || d.Items.ContainsKey("CropBox");
private static string? ContentsKey(PdfDictionary d) {
private string? ContentsKey(PdfDictionary d) {
if (!d.Items.TryGetValue("Contents", out var c)) return null;
if (c is PdfReference r) return $"ref:{r.ObjectNumber}";
if (c is PdfReference r) {
if (_objects.TryGetValue(r.ObjectNumber, out var ind) && ind.Value is PdfArray referencedArray) {
return ContentsArrayKey(referencedArray);
}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve referenced content-array identity in page dedupe

When /Contents is an indirect reference to a PdfArray, this code now converts the key from ref:<objectNumber> to ContentsArrayKey(...), and TraversePagesNodeDeepLimited uses that key to drop “duplicates.” In valid PDFs, different page objects can point to different content-array objects that contain the same stream references (or reordered wrappers), and this change causes later pages to be silently skipped, reducing Pages and extracted text. Keep the indirect object identity in the dedupe key (or stop content-based dedupe) so distinct pages are not collapsed.

Useful? React with 👍 / 👎.

return $"ref:{r.ObjectNumber}";
}
if (c is PdfStream) return "stream:direct";
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve unique keys for direct page content streams

ContentsKey now returns the constant "stream:direct" for every page whose /Contents is a direct stream, but TraversePagesNodeDeepLimited uses this key for de-duplication via contentKeys.Add(key). That means valid PDFs with multiple pages using direct (non-referenced) content streams will keep only the first page and silently drop the rest from Pages. Use a per-page unique key (or disable content-key dedupe for direct streams) so distinct pages are not collapsed.

Useful? React with 👍 / 👎.

if (c is PdfArray arr) {
var sb = new System.Text.StringBuilder();
sb.Append("arr:");
foreach (var it in arr.Items) if (it is PdfReference rr) sb.Append(rr.ObjectNumber).Append(',');
return sb.ToString();
return ContentsArrayKey(arr);
}
return null;
}

private static string ContentsArrayKey(PdfArray arr) {
var sb = new System.Text.StringBuilder();
sb.Append("arr:");
foreach (var it in arr.Items) {
if (it is PdfReference rr) sb.Append(rr.ObjectNumber).Append(',');
}
return sb.ToString();
}

private int FindObjectNumberFor(PdfDictionary dict) {
foreach (var kv in _objects) if (ReferenceEquals(kv.Value.Value, dict)) return kv.Key;
// As a fallback when dictionary was re-parsed separately, match by identity via a simple scan of Page objects
Expand Down
Loading
Loading