Replies: 1 comment
-
|
Hi, I'm working on semantically understanding documents, and I've decided to use the Docling document format as a base. As far as I can see, there is currently no support for interactive forms in PDF files, so I'm working on this functionality. For this I've implemented an annotation wrapper with pdfium that can extract various properties from annotations. My next step is to define an extension for the Docling document format so that this information can be represented. Are you interested in collaborating so the extension can be upstreamed? The following is a working example script that can be executed with a script runner that understands inline script metadata (for example `uv run`):
# /// script
# dependencies = ["docling_core", "pypdfium2", "tabulate", "requests"]
# ///
from __future__ import annotations
import ctypes
from enum import IntEnum, auto
from docling_core.types.doc.base import BoundingBox
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
import pypdfium2.internal as pdfium_i
from tabulate import tabulate
class FormType(IntEnum):
    """Integer codes mirroring pdfium's ``FORMTYPE`` constants."""

    # Values are spelled out so the mapping to the C constants stays visible
    # and does not depend on member order.
    FORMTYPE_NONE = 0
    FORMTYPE_ACRO_FORM = 1
    FORMTYPE_XFA_FULL = 2
    FORMTYPE_XFA_FOREGROUND = 3
    FORMTYPE_COUNT = 4
class FormFieldType(IntEnum):
    """Integer codes mirroring pdfium's ``FormFieldType`` constants."""

    K_UNKNOWN = 0
    K_PUSH_BUTTON = auto()
    K_CHECK_BOX = auto()
    K_RADIO_BUTTON = auto()
    K_COMBO_BOX = auto()
    K_LIST_BOX = auto()
    K_TEXT_FIELD = auto()
    K_SIGNATURE = auto()
    K_XFA = auto()
    K_XFA_CHECK_BOX = auto()
    K_XFA_COMBO_BOX = auto()
    K_XFA_IMAGE_FIELD = auto()
    K_XFA_LIST_BOX = auto()
    K_XFA_PUSH_BUTTON = auto()
    K_XFA_SIGNATURE = auto()
    K_XFA_TEXT_FIELD = auto()

    def is_checkbox(self) -> bool:
        """Return True for field types that carry a checked/unchecked state.

        Only check boxes and radio buttons (including the XFA check box)
        qualify. Combo boxes and list boxes hold a selected value rather than
        a checked state, so they are deliberately excluded.
        """
        return self in (
            FormFieldType.K_CHECK_BOX,
            FormFieldType.K_RADIO_BUTTON,
            FormFieldType.K_XFA_CHECK_BOX,
        )
class AnnotationSubtype(IntEnum):
    """Integer codes for annotation subtypes, named after the ``FPDF_ANNOT_*`` constants."""

    # Values are spelled out so each member's numeric code is visible and
    # independent of declaration order.
    FPDF_ANNOT_UNKNOWN = 0
    FPDF_ANNOT_TEXT = 1
    FPDF_ANNOT_LINK = 2
    FPDF_ANNOT_FREETEXT = 3
    FPDF_ANNOT_LINE = 4
    FPDF_ANNOT_SQUARE = 5
    FPDF_ANNOT_CIRCLE = 6
    FPDF_ANNOT_POLYGON = 7
    FPDF_ANNOT_POLYLINE = 8
    FPDF_ANNOT_HIGHLIGHT = 9
    FPDF_ANNOT_UNDERLINE = 10
    FPDF_ANNOT_SQUIGGLY = 11
    FPDF_ANNOT_STRIKEOUT = 12
    FPDF_ANNOT_STAMP = 13
    FPDF_ANNOT_CARET = 14
    FPDF_ANNOT_INK = 15
    FPDF_ANNOT_POPUP = 16
    FPDF_ANNOT_FILEATTACHMENT = 17
    FPDF_ANNOT_SOUND = 18
    FPDF_ANNOT_MOVIE = 19
    FPDF_ANNOT_WIDGET = 20
    FPDF_ANNOT_SCREEN = 21
    FPDF_ANNOT_PRINTERMARK = 22
    FPDF_ANNOT_TRAPNET = 23
    FPDF_ANNOT_WATERMARK = 24
    FPDF_ANNOT_THREED = 25
    FPDF_ANNOT_RICHMEDIA = 26
    FPDF_ANNOT_XFAWIDGET = 27
    FPDF_ANNOT_REDACT = 28
class PdfAnnotation(pdfium_i.AutoCloseable):
    """Wrapper around a raw pdfium annotation handle.

    Exposes annotation and form-field properties through the raw
    ``FPDFAnnot_*`` functions of pypdfium2. The handle is registered with the
    ``AutoCloseable`` base class using ``FPDFPage_CloseAnnot`` as close
    function.
    """

    # Field values (compared lower-cased) that is_checked() treats as
    # "not checked" when pdfium itself does not report the widget as checked.
    _UNCHECKED_VALUES = ("", "off", "0", "false", "unchecked", "no", "nein")

    def __init__(self, raw: pdfium_c.FPDF_ANNOTATION, page: pdfium.PdfPage):  # pyright: ignore[reportInvalidTypeForm]
        """Wrap ``raw`` and remember the page it was obtained from.

        Args:
            raw: Annotation handle as returned by ``FPDFPage_GetAnnot``.
            page: Page the annotation belongs to; its document's form
                environment is used for all form-field queries.
        """
        # raw/page are assigned before the base-class initialiser runs because
        # the `parent` and `formenv` properties (and __repr__) depend on them.
        self.raw = raw
        self.page = page
        page.pdf.init_forms()
        super().__init__(pdfium_c.FPDFPage_CloseAnnot, raw)

    def __repr__(self):
        """Return a multi-line, tabulated dump of the annotation's properties."""
        field_type = self.get_form_field_type()
        type_name = None if field_type is None else field_type.name
        bbox = self.to_bbox()
        bbox_repr = (
            f"l: {bbox.l:.2f}, r: {bbox.r:.2f}, t: {bbox.t:.2f}, b: {bbox.b:.2f}"
        )
        rows = [
            ["BBox", bbox_repr],
            ["Type", type_name],
            ["Subtype", self.get_subtype().name],
            ["Name", self.get_form_field_name()],
            ["Alternate Name", self.get_form_field_alternate_name()],
            ["Value", self.get_form_field_value()],
            ["Export Value", self.get_form_field_export_value()],
            ["Is checked", self.is_checked()],
            ["Flags", self.get_flags()],
            ["Form Field Flags", self.get_form_field_flags()],
            ["Attachment Points Count", self.count_attachment_points()],
            ["Option Count", self.get_option_count()],
            ["Border", str(self.get_border())],
            ["Object Count", self.get_object_count()],
            ["Form Control Index", self.get_form_control_index()],
            ["Form Control Count", self.get_form_control_count()],
        ]
        # Newlines are shown as a literal " \n " so every entry stays on one
        # table row.
        rows.extend(
            [f"javascript: {key}", action.replace("\n", r" \n ")]
            for key, action in self.get_form_additional_action_javascript().items()
        )
        rows.extend(
            [f"appearance: {key}", appearance.replace("\n", r" \n ")]
            for key, appearance in self.get_appearance().items()
        )
        return "Annotation:\n" + tabulate(rows, headers=["key", "value"])

    @property
    def parent(self):
        """Page this annotation was obtained from."""
        return self.page

    @property
    def formenv(self):
        """Form environment of the owning document (``page.pdf.formenv``)."""
        return self.page.pdf.formenv

    @staticmethod
    def _read_utf16(call) -> str:
        """Fetch a UTF-16LE string through pdfium's two-call buffer protocol.

        Args:
            call: Callable ``call(buffer, buflen)`` wrapping the pdfium getter.
                It is first invoked with ``(None, 0)`` to obtain the required
                size in bytes, then with a buffer of at least that size.

        Returns:
            The decoded text without its terminator, or ``""`` when the
            reported size is at most the two terminator bytes.
        """
        length = call(None, 0)
        if length <= 2:
            return ""
        # length is in bytes; round up so an odd size can never overflow.
        n_units = (length + 1) // 2
        buffer = (ctypes.c_ushort * n_units)()
        call(buffer, length)
        # Drop the final 16-bit unit (the terminator) before decoding.
        payload = bytes(buffer)[: 2 * (n_units - 1)]
        return payload.decode("utf-16-le").rstrip("\x00")

    def _get_text(self, fn) -> str:
        """Read a string via ``fn(formenv, annot, buffer, buflen)``."""
        return self._read_utf16(
            lambda buf, size: fn(self.formenv, self.raw, buf, size)
        )

    def get_flags(self) -> int:
        """Return the result of ``FPDFAnnot_GetFlags`` for this annotation."""
        return pdfium_c.FPDFAnnot_GetFlags(self.raw)

    def get_object_count(self) -> int:
        """Return the result of ``FPDFAnnot_GetObjectCount`` for this annotation."""
        return pdfium_c.FPDFAnnot_GetObjectCount(self.raw)

    def get_border(self) -> dict[str, float]:
        """Return the border values written by ``FPDFAnnot_GetBorder``.

        The function's own return value is not checked; if pdfium leaves the
        output parameters untouched they keep their initial value of ``0.0``.
        """
        horizontal_radius = ctypes.c_float()
        vertical_radius = ctypes.c_float()
        border_width = ctypes.c_float()
        pdfium_c.FPDFAnnot_GetBorder(
            self.raw,
            ctypes.byref(horizontal_radius),
            ctypes.byref(vertical_radius),
            ctypes.byref(border_width),
        )
        return {
            "horizontal_radius": horizontal_radius.value,
            "vertical_radius": vertical_radius.value,
            "border_width": border_width.value,
        }

    def count_attachment_points(self) -> int:
        """Return the result of ``FPDFAnnot_CountAttachmentPoints``."""
        return pdfium_c.FPDFAnnot_CountAttachmentPoints(self.raw)

    def get_appearance(self) -> dict[str, str]:
        """Return the appearance string for each queried appearance mode.

        Returns:
            Mapping with the keys ``normal``, ``count``, ``down`` and
            ``rollover``. Each value is the text returned by
            ``FPDFAnnot_GetAP`` with newlines replaced by a literal ``" \\n "``,
            or ``""`` when pdfium reports no data.
        """
        modes = {
            "normal": pdfium_c.FPDF_ANNOT_APPEARANCEMODE_NORMAL,
            # NOTE(review): ..._COUNT looks like the enum's size sentinel, not
            # a real appearance mode, so this entry is presumably always
            # empty. Kept so the returned keys stay unchanged - confirm and
            # drop if no caller relies on it.
            "count": pdfium_c.FPDF_ANNOT_APPEARANCEMODE_COUNT,
            "down": pdfium_c.FPDF_ANNOT_APPEARANCEMODE_DOWN,
            "rollover": pdfium_c.FPDF_ANNOT_APPEARANCEMODE_ROLLOVER,
        }
        results = {}
        for key, mode in modes.items():
            text = self._read_utf16(
                lambda buf, size, mode=mode: pdfium_c.FPDFAnnot_GetAP(
                    self.raw, mode, buf, size
                )
            )
            results[key] = text.replace("\n", r" \n ")
        return results

    def get_form_additional_action_javascript(self) -> dict[str, str]:
        """Return the JavaScript of the field's additional actions.

        Returns:
            Mapping with the keys ``format``, ``calculate``, ``key_stroke``
            and ``validate``; a value is ``""`` when pdfium reports no script
            for that action.
        """
        fn = pdfium_c.FPDFAnnot_GetFormAdditionalActionJavaScript
        actions = {
            "format": pdfium_c.FPDF_ANNOT_AACTION_FORMAT,
            "calculate": pdfium_c.FPDF_ANNOT_AACTION_CALCULATE,
            "key_stroke": pdfium_c.FPDF_ANNOT_AACTION_KEY_STROKE,
            "validate": pdfium_c.FPDF_ANNOT_AACTION_VALIDATE,
        }
        results = {}
        for key, action in actions.items():
            results[key] = self._read_utf16(
                lambda buf, size, action=action: fn(
                    self.formenv, self.raw, action, buf, size
                )
            )
        return results

    def get_form_control_count(self) -> int:
        """Return the result of ``FPDFAnnot_GetFormControlCount``."""
        return pdfium_c.FPDFAnnot_GetFormControlCount(self.formenv, self.raw)

    def get_form_control_index(self) -> int:
        """Return the result of ``FPDFAnnot_GetFormControlIndex``."""
        return pdfium_c.FPDFAnnot_GetFormControlIndex(self.formenv, self.raw)

    def get_form_field_export_value(self) -> str:
        """Return the text from ``FPDFAnnot_GetFormFieldExportValue``."""
        return self._get_text(pdfium_c.FPDFAnnot_GetFormFieldExportValue)

    def get_form_field_alternate_name(self) -> str:
        """Return the text from ``FPDFAnnot_GetFormFieldAlternateName``."""
        return self._get_text(pdfium_c.FPDFAnnot_GetFormFieldAlternateName)

    def get_form_field_name(self) -> str:
        """Return the text from ``FPDFAnnot_GetFormFieldName``."""
        return self._get_text(pdfium_c.FPDFAnnot_GetFormFieldName)

    def get_form_field_value(self) -> str:
        """Return the text from ``FPDFAnnot_GetFormFieldValue``."""
        return self._get_text(pdfium_c.FPDFAnnot_GetFormFieldValue)

    def get_option_count(self) -> int:
        """Return the result of ``FPDFAnnot_GetOptionCount``."""
        return pdfium_c.FPDFAnnot_GetOptionCount(self.formenv, self.raw)

    def option_selected_list(self) -> list[bool]:
        """Return, per option index, whether pdfium reports it as selected."""
        option_count = self.get_option_count()
        # bool() because the raw call yields an integer, while the annotated
        # return type promises booleans.
        return [
            bool(pdfium_c.FPDFAnnot_IsOptionSelected(self.formenv, self.raw, idx))
            for idx in range(option_count)
        ]

    def is_checked(self) -> bool | None:
        """Return the checked state of a checkbox-like field.

        Returns:
            ``None`` when the annotation is not a form field for which
            ``FormFieldType.is_checkbox()`` is true. Otherwise ``True`` if
            ``FPDFAnnot_IsChecked`` says so; failing that, the field value and
            (for fields with several controls) the presence of a normal
            appearance are used as fallback heuristics.
        """
        typ = self.get_form_field_type()
        if typ is None or not typ.is_checkbox():
            return None
        if pdfium_c.FPDFAnnot_IsChecked(self.formenv, self.raw):
            return True
        # Fallback 1: an empty or "off"-like value means unchecked.
        if self.get_form_field_value().lower() in self._UNCHECKED_VALUES:
            return False
        # Fallback 2: with several controls sharing the field, this control
        # counts as checked only if it has a normal appearance.
        if self.get_form_control_count() > 1:
            return self.get_appearance()["normal"] != ""
        return True

    def get_form_field_flags(self) -> int:
        """Return the result of ``FPDFAnnot_GetFormFieldFlags``."""
        return pdfium_c.FPDFAnnot_GetFormFieldFlags(self.formenv, self.raw)

    def get_subtype(self) -> AnnotationSubtype:
        """Return the annotation subtype reported by ``FPDFAnnot_GetSubtype``.

        Raises:
            ValueError: If pdfium returns a code that ``AnnotationSubtype``
                does not define.
        """
        return AnnotationSubtype(pdfium_c.FPDFAnnot_GetSubtype(self.raw))

    def get_value(self) -> str | int | bool | None:
        """Return the field's value in a type that fits the field kind.

        Returns:
            The checked state for check boxes, the field text for text
            fields, and ``None`` for every other field type as well as for
            annotations that are not form fields.
        """
        field_type = self.get_form_field_type()
        if field_type == FormFieldType.K_CHECK_BOX:
            return self.is_checked()
        if field_type == FormFieldType.K_TEXT_FIELD:
            return self.get_form_field_value()
        # All remaining field types are not mapped to a value yet.
        return None

    def to_bbox(self) -> BoundingBox:
        """Return the annotation rectangle as a top-left-origin bounding box.

        The rectangle read with ``FPDFAnnot_GetRect`` is tagged as
        bottom-left-origin and then converted with the page height; the
        converted box is what gets returned.
        """
        # Local import: CoordOrigin is only needed here and is not among the
        # module-level imports.
        from docling_core.types.doc.base import CoordOrigin

        rectangle = pdfium_c.FS_RECTF()
        pdfium_c.FPDFAnnot_GetRect(self.raw, rectangle)
        bbox = BoundingBox(
            l=rectangle.left,
            t=rectangle.top,
            r=rectangle.right,
            b=rectangle.bottom,
            # PDF page space has its origin at the bottom-left corner; without
            # this tag the box would be taken for top-left and never flipped.
            coord_origin=CoordOrigin.BOTTOMLEFT,
        )
        # to_top_left_origin returns the converted box, so its result must be
        # the return value rather than being discarded.
        return bbox.to_top_left_origin(self.page.get_height())

    def is_widget(self) -> bool:
        """Return True if the subtype equals ``FPDF_ANNOT_WIDGET``."""
        return self.get_subtype() == pdfium_c.FPDF_ANNOT_WIDGET

    def get_form_field_type(self) -> FormFieldType | None:
        """Return the form field type, or ``None`` if it cannot be determined.

        Returns:
            ``None`` for non-widget annotations and when pdfium reports a
            negative (error) code; otherwise the matching ``FormFieldType``.
        """
        if not self.is_widget():
            return None
        raw_type = pdfium_c.FPDFAnnot_GetFormFieldType(self.formenv, self.raw)
        if raw_type < 0:
            # pdfium signals failure with -1, which is not an enum member.
            return None
        return FormFieldType(raw_type)

    @classmethod
    def get_annotations(cls, page: pdfium.PdfPage) -> list[PdfAnnotation]:
        """Return a wrapper for every annotation that can be opened on ``page``."""
        annotations = []
        for idx in range(cls.get_count(page)):
            raw_annotation = pdfium_c.FPDFPage_GetAnnot(page.raw, idx)
            if not raw_annotation:
                # A NULL handle cannot be queried or closed; skip it.
                continue
            annotations.append(cls(raw=raw_annotation, page=page))
        return annotations

    @staticmethod
    def get_form_type(document: pdfium.PdfDocument) -> FormType:
        """Initialise forms on ``document`` and return its form type."""
        document.init_forms()
        return FormType(document.get_formtype())

    @staticmethod
    def get_count(page: pdfium.PdfPage) -> int:
        """Return the result of ``FPDFPage_GetAnnotCount`` for ``page``."""
        return pdfium_c.FPDFPage_GetAnnotCount(page.raw)
if __name__ == "__main__":
    # Demo: download an IRS form, dump every annotation on its first page.
    import os
    import tempfile

    import requests

    irs_form_url = "https://www.irs.gov/pub/irs-pdf/f1040s1.pdf"
    # A timeout keeps the demo from hanging forever on a stalled connection.
    response = requests.get(irs_form_url, timeout=30)
    response.raise_for_status()

    # delete=False: the file is reopened by path below and removed manually.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(response.content)
        tmp_path = tmp_file.name

    try:
        document = pdfium.PdfDocument(tmp_path)
        try:
            # Set up the form environment before any page handle is obtained.
            document.init_forms()
            page = document[0]
            annotations = PdfAnnotation.get_annotations(page)
            try:
                for annotation in annotations:
                    print(annotation)
                    print("")
            finally:
                # Release children before their parents: annotations, then
                # the page, then (below) the document.
                for annotation in annotations:
                    annotation.close()
                page.close()
        finally:
            # Close the document before its backing file is deleted.
            document.close()
    finally:
        os.unlink(tmp_path)
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.
Uh oh!
There was an error while loading. Please reload this page.
-
Docling creates well-structured output for many common item types with plain text-representation (paragraphs, headings, captions, lists, formulas), and tables, including their cell structure. We want to extend Docling to support creating well-structured output also for pictures, forms, and key-value regions. To address these types of content, there are a few gaps which need to be addressed.
Current situation
The layout model in
`docling-ibm-models` is trained to recognize forms and key-value regions, but both classes are currently ignored in downstream processing. As a consequence, the items contained in a form or a key-value region are treated as individual items without that context. In key-value regions, the key and value text is therefore represented as plain text items, with no connection between key and value, and not ordered correctly. In forms, extra elements such as checkboxes, groups of choices, and other elements are likewise represented as plain text items without grouping or useful order.
For content detected as a Picture item, the text content inside is ignored by default (even if OCR detected text inside). We have examples which outline how to build picture-enrichment models, but they are not used by default and so far don't exploit the known text content inside picture items (see here).
Planned extensions
This topic will require work on several steps to prepare docling for the additional content types.
docling-ibm-models), understand where it causes confusion (especially with table detections)
`DoclingDocument` with new data models for respective item types
Additionally, to ensure high-quality results for difficult samples, we will need to invest in the development (or third-party integration) of specialized models for form and key-value understanding.
Questions to be answered
`GroupItem` sufficient?
`children` with text items? Other?
Everyone is invited to contribute to this discussion and provide feedback or examples from other solutions.
Beta Was this translation helpful? Give feedback.
All reactions