# pdf_ua/lym_PDFUA_final.py — from the yhan818/pdf_ua repository (main branch)

import datetime
import io
import os
import uuid
from contextlib import closing
from xml.sax.saxutils import escape

import fitz  # PyMuPDF
import pytesseract
from PIL import Image

# === CONFIGURATION ===
# Folder scanned for source PDFs.
INPUT_FOLDER = "input_pdfs"
# Folder that receives the converted PDFs.
OUTPUT_FOLDER = "output_pdfs"
# Resolution (dots per inch) used when rendering each page to an image.
DPI = 150
# Quality value handed to the JPEG encoder for the page image layer.
JPEG_QUALITY = 80

# Create both working folders up front; an existing folder is left untouched.
for _folder in (INPUT_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)

def process_all_pdfs():
    """
    Convert every PDF in INPUT_FOLDER and write the results to OUTPUT_FOLDER.

    Files are selected by a case-insensitive ".pdf" suffix. Each result is
    saved as "pdfua_ocr_<original file name>". An error raised while handling
    one file is printed and the remaining files are still processed.
    """
    print(f"--- Script Started: Checking for files in '{INPUT_FOLDER}' ---")

    pdf_names = [
        entry
        for entry in os.listdir(INPUT_FOLDER)
        if entry.lower().endswith(".pdf")
    ]

    # Guard clause: nothing to do when the input folder holds no PDFs.
    if not pdf_names:
        print(f"INFO: No .pdf files found in '{INPUT_FOLDER}'. Nothing to process.")
        return

    print(f"✅ Found {len(pdf_names)} PDF file(s). Starting processing...")

    for filename in pdf_names:
        source_file = os.path.join(INPUT_FOLDER, filename)
        target_file = os.path.join(OUTPUT_FOLDER, f"pdfua_ocr_{filename}")

        print(f"\n🔍 Starting processing for: {filename}")
        try:
            # One call performs rendering, OCR, metadata and saving.
            create_ocr_pdf_with_metadata(source_file, target_file)
            print(f"✅ Successfully processed and saved to: {target_file}")
        except Exception as err:
            # Batch boundary: report the failure and move on to the next file.
            print(f"❌ Error processing {filename}: {err}")

    print("\n--- Script Finished ---")


def _build_xmp_packet(title, author, subject, keywords, create_date, instance_id):
    """
    Return the XMP metadata packet (XML text) for the output document.

    `title`, `author`, `subject` and every keyword are XML-escaped before they
    are placed in the template, so characters such as '&' or '<' (for example
    from a file name) cannot break the XML. `keywords` is a comma-separated
    string; each non-empty, whitespace-trimmed entry becomes its own
    <rdf:li> inside <dc:subject>.
    """
    keyword_items = "".join(
        "<rdf:li>{}</rdf:li>".format(escape(item.strip()))
        for item in keywords.split(",")
        if item.strip()
    )

    # NOTE(review): the packet declares pdfuaid:part = 1, but nothing in this
    # script adds a structure tree / tagging to the output. Confirm the result
    # with a PDF/UA validator before relying on that declaration.
    packet = '''
    <?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
    <x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="PyMuPDF">
      <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <rdf:Description rdf:about=""
            xmlns:dc="http://purl.org/dc/elements/1.1/"
            xmlns:pdf="http://ns.adobe.com/pdf/1.3/"
            xmlns:pdfuaid="http://www.aiim.org/pdfua/ns/id/"
            xmlns:xmp="http://ns.adobe.com/xap/1.0/"
            xmlns:xmpMM="http://ns.adobe.com/xap/1.0/mm/">

          <dc:title><rdf:Alt><rdf:li xml:lang="x-default">{v_doc_title}</rdf:li></rdf:Alt></dc:title>
          <dc:creator><rdf:Seq><rdf:li>{v_author}</rdf:li></rdf:Seq></dc:creator>
          <dc:description><rdf:Alt><rdf:li xml:lang="x-default">{v_subject}</rdf:li></rdf:Alt></dc:description>
          <dc:subject><rdf:Bag>{v_keyword_items}</rdf:Bag></dc:subject>
          <pdf:Producer>PyMuPDF (fitz)</pdf:Producer>
          <pdf:Keywords>{v_keywords}</pdf:Keywords>
          <xmp:CreateDate>{v_create_date}</xmp:CreateDate>
          <xmp:CreatorTool>UA Libraries OCR and Accessibility Script (fitz)</xmp:CreatorTool>
          <xmpMM:DocumentID>uuid:{v_instance_id}</xmpMM:DocumentID>
          <xmpMM:InstanceID>uuid:{v_instance_id}</xmpMM:InstanceID>
          <pdfuaid:part>1</pdfuaid:part>
        </rdf:Description>
      </rdf:RDF>
    </x:xmpmeta>
    <?xpacket end="w"?>
    '''.format(
        v_doc_title=escape(title),
        v_author=escape(author),
        v_subject=escape(subject),
        v_keyword_items=keyword_items,
        v_keywords=escape(keywords),
        v_create_date=create_date,
        v_instance_id=instance_id,
    )
    # Drop the surrounding blank lines/indentation of the literal so the
    # metadata begins directly with the <?xpacket ...?> header.
    return packet.strip()


def _add_invisible_text_layer(page, ocr_data, scale_x, scale_y,
                              min_conf=50, min_height=2.0):
    """
    Overlay the recognised OCR words on `page` as invisible text.

    `ocr_data` is the dictionary produced by pytesseract's `image_to_data`
    (parallel lists under 'text', 'conf', 'left', 'top', 'width', 'height').
    `scale_x` / `scale_y` convert image pixels to page units. Words are kept
    only when they are non-blank, have a confidence above `min_conf`, a
    positive box, and a scaled height of at least `min_height`.

    Returns a tuple `(added, skipped)`: words written to the page, and words
    whose insertion raised an error.
    """
    added = 0
    skipped = 0
    rows = zip(
        ocr_data['text'], ocr_data['conf'], ocr_data['left'],
        ocr_data['top'], ocr_data['width'], ocr_data['height'],
    )
    for word, raw_conf, left, top, w, h in rows:
        # Confidence may arrive as an int or as a decimal string such as
        # "96.5"; float() accepts both where int() would raise on the latter.
        try:
            conf = float(raw_conf)
        except (TypeError, ValueError):
            continue

        if not word.strip() or conf <= min_conf or w <= 0 or h <= 0:
            continue

        scaled_height = h * scale_y
        if scaled_height < min_height:
            continue

        # insert_text is anchored at the bottom-left of the text, so use the
        # bottom edge of the OCR box for y.
        x = left * scale_x
        y = (top + h) * scale_y

        try:
            page.insert_text(
                point=(x, y),
                text=word,
                fontname="helv",
                fontsize=scaled_height * 0.99,
                render_mode=3,  # 3 = invisible text (neither fill nor stroke)
                color=(0, 0, 0),  # irrelevant while the text is invisible
            )
        except Exception:
            # Best effort: one bad word must not abort the page, but the
            # failure is counted so the caller can report it.
            skipped += 1
        else:
            added += 1

    return added, skipped


def create_ocr_pdf_with_metadata(input_path, output_path):
    """
    Build a searchable copy of the PDF at `input_path` and save it to
    `output_path`.

    For every source page the function renders the page to an image at DPI,
    runs Tesseract OCR (English) on it, places the image (re-encoded as JPEG
    at JPEG_QUALITY) on a new page of the same size, and overlays the
    recognised words as invisible text (`insert_text` with `render_mode=3`).
    The output also receives Info-dictionary metadata, the document language
    "en-US" and an XMP metadata packet.

    Both documents are closed even when a step raises; the exception itself
    propagates to the caller. Returns None.
    """
    # --- METADATA VALUES ---
    doc_title = os.path.splitext(os.path.basename(input_path))[0].replace('_', ' ').title()
    author = 'University of Arizona Libraries'
    subject = 'Lymphology Journal Article'
    keywords = 'OCR, accessibility, PDF/UA, Lymphology'

    # closing() guarantees close() on both documents on every exit path.
    with closing(fitz.open(input_path)) as source_doc, closing(fitz.open()) as output_doc:
        # --- Set basic metadata (Info dictionary) ---
        output_doc.set_metadata({
            'title': doc_title,
            'author': author,
            'subject': subject,
            'keywords': keywords,
        })
        output_doc.set_language("en-US")

        # Best effort: skipped with a notice when the document object does
        # not provide this method.
        try:
            output_doc.set_viewer_preferences({'DisplayDocTitle': True})
        except AttributeError:
            print("      -> INFO: 'set_viewer_preferences' not available. Skipping.")

        # --- PAGE PROCESSING ---
        page_total = len(source_doc)
        for page_num, source_page in enumerate(source_doc, start=1):
            print(f"      -> Processing page {page_num}/{page_total}...")
            pix = source_page.get_pixmap(dpi=DPI)
            pil_image = Image.open(io.BytesIO(pix.tobytes("png")))
            ocr_data = pytesseract.image_to_data(
                pil_image, output_type=pytesseract.Output.DICT, lang='eng'
            )

            page_rect = source_page.rect
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            # --- Image Layer (Compressed) ---
            buffer = io.BytesIO()
            if pil_image.mode in ("RGBA", "P"):
                # Converted to RGB before the JPEG save below.
                pil_image = pil_image.convert("RGB")
            pil_image.save(buffer, format="JPEG", quality=JPEG_QUALITY)
            output_page.insert_image(page_rect, stream=buffer.getvalue())

            # --- Text Layer (invisible but searchable) ---
            added, skipped = _add_invisible_text_layer(
                output_page,
                ocr_data,
                scale_x=page_rect.width / pix.width,
                scale_y=page_rect.height / pix.height,
            )
            print(f"         -> Added {added} invisible words to text layer")
            if skipped:
                print(f"         -> WARNING: {skipped} word(s) could not be inserted and were skipped")

        # --- ADD XMP METADATA ---
        xmp_metadata = _build_xmp_packet(
            doc_title,
            author,
            subject,
            keywords,
            create_date=datetime.datetime.now(datetime.timezone.utc).isoformat(),
            instance_id=str(uuid.uuid4()),
        )
        output_doc.set_xml_metadata(xmp_metadata)
        print("      -> INFO: XMP metadata attached.")

        # --- SAVE TO DISK ---
        output_doc.save(output_path, garbage=4, deflate=True, encryption=fitz.PDF_ENCRYPT_NONE)

# --- Main execution block ---
# Start the batch conversion only when this file is run as a script;
# importing it as a module does not call process_all_pdfs().
if __name__ == "__main__":
    process_all_pdfs()