Skip to content
This repository was archived by the owner on Apr 15, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
PDFMiner
========

*This is a custom version of PDFMiner that extracts color information for text alongside the bbox, font, and size info.*

[![Build Status](https://travis-ci.org/euske/pdfminer.svg?branch=master)](https://travis-ci.org/euske/pdfminer)

PDFMiner is a tool for extracting information from PDF documents.
Expand Down
55 changes: 19 additions & 36 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,12 @@
#!/usr/bin/env python
import logging
import re
from .pdfdevice import PDFTextDevice
from .pdffont import PDFUnicodeNotDefined
from .layout import LTContainer
from .layout import LTPage
from .layout import LTText
from .layout import LTLine
from .layout import LTRect
from .layout import LTCurve
from .layout import LTFigure
from .layout import LTImage
from .layout import LTChar
from .layout import LTTextLine
from .layout import LTTextBox
from .layout import LTTextBoxVertical
from .layout import LTTextGroup
from .utils import apply_matrix_pt
from .utils import mult_matrix
from .utils import enc
from .utils import bbox2str
import sys
from pdfdevice import PDFTextDevice
from pdffont import PDFUnicodeNotDefined
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
from layout import LTFigure, LTImage, LTChar, LTTextLine
from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
from utils import apply_matrix_pt, mult_matrix
from utils import enc, bbox2str


## PDFLayoutAnalyzer
Expand Down Expand Up @@ -103,20 +90,21 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
self.cur_item.add(LTCurve(gstate.linewidth, pts))
return

def render_char(self, matrix, font, fontsize, scaling, rise, cid):
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, nc):
try:
text = font.to_unichr(cid)
assert isinstance(text, unicode), text
except PDFUnicodeNotDefined:
text = self.handle_undefined_char(font, cid)
textwidth = font.char_width(cid)
textdisp = font.char_disp(cid)
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp)
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp, ncs, nc)
self.cur_item.add(item)
return item.adv

def handle_undefined_char(self, font, cid):
logging.info('undefined: %r, %r' % (font, cid))
if self.debug:
print >>sys.stderr, 'undefined: %r, %r' % (font, cid)
return '(cid:%d)' % cid

def receive_layout(self, ltpage):
Expand Down Expand Up @@ -185,7 +173,7 @@ def render(item):
return

# Some dummy functions to save memory/CPU when all that is wanted
# is text. This stops all the image and drawing output from being
# is text. This stops all the image and drawing output from being
# recorded and taking up RAM.
def render_image(self, name, stream):
if self.imagewriter is None:
Expand Down Expand Up @@ -218,7 +206,7 @@ class HTMLConverter(PDFConverter):

def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
pagemargin=50, imagewriter=None, debug=0,
pagemargin=50, imagewriter=None,
rect_colors={'curve': 'black', 'page': 'gray'},
text_colors={'char': 'black'}):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
Expand All @@ -230,7 +218,7 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
self.imagewriter = imagewriter
self.rect_colors = rect_colors
self.text_colors = text_colors
if debug:
if self.debug:
self.rect_colors.update(self.RECT_COLORS)
self.text_colors.update(self.TEXT_COLORS)
self._yoffset = self.pagemargin
Expand Down Expand Up @@ -315,7 +303,7 @@ def put_text(self, text, fontname, fontsize):
if self._font is not None:
self.write('</span>')
self.write('<span style="font-family: %s; font-size:%dpx">' %
(enc(fontname), fontsize * self.scale * self.fontscale))
(fontname, fontsize * self.scale * self.fontscale))
self._font = font
self.write_text(text)
return
Expand Down Expand Up @@ -398,13 +386,10 @@ def close(self):
##
class XMLConverter(PDFConverter):

CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]')

def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
laparams=None, imagewriter=None, stripcontrol=False):
laparams=None, imagewriter=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.imagewriter = imagewriter
self.stripcontrol = stripcontrol
self.write_header()
return

Expand All @@ -418,8 +403,6 @@ def write_footer(self):
return

def write_text(self, text):
if self.stripcontrol:
text = self.CONTROL.sub(u'', text)
self.outfp.write(enc(text, self.codec))
return

Expand Down Expand Up @@ -477,8 +460,8 @@ def render(item):
render(child)
self.outfp.write('</textbox>\n')
elif isinstance(item, LTChar):
self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
(enc(item.fontname), bbox2str(item.bbox), item.size))
self.outfp.write('<text font="%s" bbox="%s" color="%s%s" size="%.3f">' %
(enc(item.fontname), bbox2str(item.bbox), item.ncs.name, item.nc, item.size))
self.write_text(item.get_text())
self.outfp.write('</text>\n')
elif isinstance(item, LTText):
Expand Down
58 changes: 14 additions & 44 deletions pdfminer/layout.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
#!/usr/bin/env python
from .utils import INF
from .utils import Plane
from .utils import get_bound
from .utils import uniq
from .utils import csort
from .utils import fsplit
from .utils import bbox2str
from .utils import matrix2str
from .utils import apply_matrix_pt
from utils import INF, Plane, get_bound, uniq, csort, fsplit
from utils import bbox2str, matrix2str, apply_matrix_pt


## IndexAssigner
Expand Down Expand Up @@ -88,25 +81,14 @@ def __repr__(self):
return ('<%s %s>' %
(self.__class__.__name__, bbox2str(self.bbox)))

# Disable comparison.
def __lt__(self, _):
raise ValueError
def __le__(self, _):
raise ValueError
def __gt__(self, _):
raise ValueError
def __ge__(self, _):
raise ValueError

def set_bbox(self, bbox):
(x0, y0, x1, y1) = bbox
def set_bbox(self, (x0, y0, x1, y1)):
self.x0 = x0
self.y0 = y0
self.x1 = x1
self.y1 = y1
self.width = x1-x0
self.height = y1-y0
self.bbox = bbox
self.bbox = (x0, y0, x1, y1)
return

def is_empty(self):
Expand Down Expand Up @@ -176,8 +158,7 @@ def __init__(self, linewidth, p0, p1):
##
class LTRect(LTCurve):

def __init__(self, linewidth, bbox):
(x0, y0, x1, y1) = bbox
def __init__(self, linewidth, (x0, y0, x1, y1)):
LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
return

Expand Down Expand Up @@ -222,19 +203,21 @@ def get_text(self):
class LTChar(LTComponent, LTText):

def __init__(self, matrix, font, fontsize, scaling, rise,
text, textwidth, textdisp):
text, textwidth, textdisp, ncs, nc):
LTText.__init__(self)
self._text = text
self.matrix = matrix
self.fontname = font.fontname
self.ncs = ncs
self.nc = nc
self.adv = textwidth * fontsize * scaling
# compute the boundary rectangle.
if font.is_vertical():
# vertical
width = font.get_width() * fontsize
(vx, vy) = textdisp
if vx is None:
vx = width * 0.5
vx = width//2
else:
vx = vx * fontsize * .001
vy = (1000 - vy) * fontsize * .001
Expand Down Expand Up @@ -626,20 +609,14 @@ def isany(obj1, obj2):
y1 = max(obj1.y1, obj2.y1)
objs = set(plane.find((x0, y0, x1, y1)))
return objs.difference((obj1, obj2))

def key_obj(t):
(c,d,_,_) = t
return (c,d)

# XXX this still takes O(n^2) :(
dists = []
for i in xrange(len(boxes)):
obj1 = boxes[i]
for j in xrange(i+1, len(boxes)):
obj2 = boxes[j]
dists.append((0, dist(obj1, obj2), obj1, obj2))
# We could use dists.sort(), but it would randomize the test result.
dists = csort(dists, key=key_obj)
dists.sort()
plane = Plane(self.bbox)
plane.extend(boxes)
while dists:
Expand All @@ -654,11 +631,11 @@ def key_obj(t):
group = LTTextGroupLRTB([obj1, obj2])
plane.remove(obj1)
plane.remove(obj2)
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
if (obj1 in plane and obj2 in plane) ]
# this line is optimized -- don't change without profiling
dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
for other in plane:
dists.append((0, dist(group, other), group, other))
dists = csort(dists, key=key_obj)
dists.sort()
plane.add(group)
assert len(plane) == 1
return list(plane)
Expand All @@ -676,20 +653,13 @@ def analyze(self, laparams):
for obj in empties:
obj.analyze(laparams)
textboxes = list(self.group_textlines(laparams, textlines))
if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 and textboxes:
if textboxes:
self.groups = self.group_textboxes(laparams, textboxes)
assigner = IndexAssigner()
for group in self.groups:
group.analyze(laparams)
assigner.run(group)
textboxes.sort(key=lambda box: box.index)
else:
def getkey(box):
if isinstance(box, LTTextBoxVertical):
return (0, -box.x1, box.y0)
else:
return (1, box.y0, box.x0)
textboxes.sort(key=getkey)
self._objs = textboxes + otherobjs + empties
return

Expand Down
38 changes: 18 additions & 20 deletions pdfminer/pdfdevice.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
#!/usr/bin/env python
from .utils import mult_matrix
from .utils import translate_matrix
from .utils import enc
from .utils import bbox2str
from .utils import isnumber
from .pdffont import PDFUnicodeNotDefined
from utils import mult_matrix, translate_matrix
from utils import enc, bbox2str, isnumber
from pdffont import PDFUnicodeNotDefined


## PDFDevice
##
class PDFDevice(object):

debug = 0

def __init__(self, rsrcmgr):
self.rsrcmgr = rsrcmgr
self.ctm = None
Expand Down Expand Up @@ -53,15 +52,15 @@ def paint_path(self, graphicstate, stroke, fill, evenodd, path):
def render_image(self, name, stream):
return

def render_string(self, textstate, seq):
def render_string(self, textstate, seq, scs):
return


## PDFTextDevice
##
class PDFTextDevice(PDFDevice):

def render_string(self, textstate, seq):
def render_string(self, textstate, seq, ncs, nc):
matrix = mult_matrix(textstate.matrix, self.ctm)
font = textstate.font
fontsize = textstate.fontsize
Expand All @@ -75,16 +74,15 @@ def render_string(self, textstate, seq):
if font.is_vertical():
textstate.linematrix = self.render_string_vertical(
seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale)
scaling, charspace, wordspace, rise, dxscale, ncs, nc)
else:
textstate.linematrix = self.render_string_horizontal(
seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale)
scaling, charspace, wordspace, rise, dxscale, ncs, nc)
return

def render_string_horizontal(self, seq, matrix, pos,
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
(x, y) = pos
def render_string_horizontal(self, seq, matrix, (x, y),
font, fontsize, scaling, charspace, wordspace, rise, dxscale, ncs, nc):
needcharspace = False
for obj in seq:
if isnumber(obj):
Expand All @@ -95,15 +93,14 @@ def render_string_horizontal(self, seq, matrix, pos,
if needcharspace:
x += charspace
x += self.render_char(translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid)
font, fontsize, scaling, rise, cid, ncs, nc)
if cid == 32 and wordspace:
x += wordspace
needcharspace = True
return (x, y)

def render_string_vertical(self, seq, matrix, pos,
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
(x, y) = pos
def render_string_vertical(self, seq, matrix, (x, y),
font, fontsize, scaling, charspace, wordspace, rise, dxscale, ncs, nc):
needcharspace = False
for obj in seq:
if isnumber(obj):
Expand All @@ -114,24 +111,25 @@ def render_string_vertical(self, seq, matrix, pos,
if needcharspace:
y += charspace
y += self.render_char(translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid)
font, fontsize, scaling, rise, cid, ncs, nc)
if cid == 32 and wordspace:
y += wordspace
needcharspace = True
return (x, y)

def render_char(self, matrix, font, fontsize, scaling, rise, cid):
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, nc):
return 0


## TagExtractor
##
class TagExtractor(PDFDevice):

def __init__(self, rsrcmgr, outfp, codec='utf-8'):
def __init__(self, rsrcmgr, outfp, codec='utf-8', debug=0):
PDFDevice.__init__(self, rsrcmgr)
self.outfp = outfp
self.codec = codec
self.debug = debug
self.pageno = 0
self._stack = []
return
Expand Down
Loading