# from __future__ import annotations
import itertools
from typing import Union, List, Tuple, Callable
import unicodedata
from pdfscraper.layout.utils import Bbox, create_bbox_backend, Backend, PageOrientation, Rectangular, group_objs
from pdfscraper.layout.utils import (
get_leftmost,
get_rightmost,
get_topmost,
get_bottommost,
)
class TextLine(Rectangular):
    """
    A horizontal line of text.

    Wraps an ordered sequence of word objects. Indexing returns the
    underlying words, ``str()`` returns the words joined by single spaces,
    ``in`` performs a substring test on that joined text, and the line is
    falsy when it holds no words.
    """

    def __init__(self, words):
        """
        :param words: sequence of word objects, each exposing a ``bbox`` with
            ``x0``/``y0``/``x1``/``y1`` attributes. May be empty.
        """
        self.words = words
        if words:
            # The box spans from the first word's (x0, y0) to the last word's
            # (x1, y1); the words in between are not inspected.
            first, last = words[0].bbox, words[-1].bbox
            self.bbox = Bbox(first.x0, first.y0, last.x1, last.y1)
        else:
            # Placeholder box used for a line without any words.
            self.bbox = Bbox(-1, -1, -1, -1)

    @property
    def text(self):
        """The string form of every word, joined by single spaces."""
        return ' '.join(str(word) for word in self.words)

    def __getitem__(self, key):
        """Return ``self.words[key]`` (index or slice)."""
        return self.words[key]

    def __bool__(self):
        """A line is truthy only when it contains at least one word."""
        return bool(self.words)

    def __str__(self):
        return self.text

    def __repr__(self):
        # Build the word list first and interpolate everything in a single
        # f-string: applying ``%`` to an already formatted f-string would
        # fail if the bbox text ever contained a ``%`` character. The closing
        # parenthesis balances the opening ``TextLine(``.
        words = ',\n '.join(repr(word) for word in self.words)
        return f'TextLine(bbox={self.bbox}, words=\n[{words}])'

    def __contains__(self, text):
        """Return True when ``text`` is a substring of the line's text."""
        return text in self.text
class SortedTextlines:
    """
    Container pairing a flat collection of words with the text lines that
    group them.
    """

    def __init__(self, textlines: List["TextLine"], words, origin=None):
        """
        :param textlines: the words arranged into TextLine objects.
        :param words: the flat collection of words.
        :param origin: optional reference stored as-is and handed on to
            containers derived from this one (presumably the object the
            content came from; confirm with callers).
        """
        self.textlines = textlines
        self.words = words
        self.origin = origin

    def select(self, condition: Callable, retain_empty_lines=False) -> 'SortedTextlines':
        """
        Find content matching condition.

        :param condition: callable invoked with a single word; words for
            which it returns a truthy value are kept.
        :param retain_empty_lines: when false (the default), lines left
            without any matching word are dropped from the result.
        :return: a new SortedTextlines holding only the matching words, with
            the same ``origin``.
        """
        words = [word for word in self.words if condition(word)]
        textlines = [
            TextLine([word for word in line if condition(word)])
            for line in self.textlines
        ]
        if not retain_empty_lines:
            # A TextLine is falsy when it has no words.
            textlines = [line for line in textlines if line]
        return SortedTextlines(words=words, textlines=textlines, origin=self.origin)

    def resort(self):
        """
        Regroup the current words into lines with ``group_objs``.

        :return: a new SortedTextlines built from the regrouped words. The
            ``origin`` is carried over, matching what ``select`` does.
        """
        regrouped = [TextLine(line) for line in group_objs(self.words)]
        return SortedTextlines(textlines=regrouped, words=self.words, origin=self.origin)

    def __repr__(self) -> str:
        # One repr per line, each terminated by a newline.
        return "Textlines: " + "".join(f"{line!r}\n" for line in self.textlines)
class Word(Rectangular):
    """
    A text string representing one word. It's generated from a line of text by splitting on a space.

    Two words compare equal when both their text and their bbox are equal;
    the hash is derived from the repr, which is built from the same two
    values.
    """

    def __init__(
        self,
        bbox: Bbox,
        text: str = "",
        font: str = "",
        size: str = "",
        color=None,
        normalize_text=False,
    ):
        """
        :param bbox: bounding box of the word.
        :param text: the word's characters.
        :param font: font name.
        :param size: font size. NOTE(review): annotated as ``str``, but the
            Span constructors in this file pass the backend's size value
            through unchanged; confirm the intended type.
        :param color: colour value, stored as given.
        :param normalize_text: when true, soft hyphens (U+00AD) are replaced
            by ``-`` and the text is NFKD-normalized.
        """
        if normalize_text:
            # Replace soft hyphens before normalizing so they end up as a
            # plain ``-`` in the stored text.
            text = unicodedata.normalize("NFKD", text.replace("\xad", "-"))
        self.text = text
        self.bbox = bbox
        self.font = font
        self.size = size
        self.color = color

    def __hash__(self) -> int:
        return hash(repr(self))

    def __repr__(self) -> str:
        return f'Word(text="{self.text}",bbox={self.bbox})'

    def __eq__(self, other) -> bool:
        # Stay duck-typed (anything exposing ``text`` and ``bbox`` can be
        # compared), but do not blow up on unrelated objects such as None or
        # plain strings: returning NotImplemented lets Python fall back to
        # its default handling, so ``word == "abc"`` is simply False.
        try:
            other_key = (other.text, other.bbox)
        except AttributeError:
            return NotImplemented
        return (self.text, self.bbox) == other_key

    def __str__(self) -> str:
        return self.text
class Span(Rectangular):
    """
    A collection of words.

    The alternate constructors split a backend span into runs of
    non-space and space characters. Whitespace runs are kept as words of
    their own, which is why ``text`` concatenates the words without a
    separator.
    """

    __slots__ = ("words", "bbox")

    def __init__(self, bbox: Bbox, words: List[Word] = None):
        """
        :param bbox: bounding box of the whole span.
        :param words: the words making up the span; ``None`` is treated as
            an empty list so that ``text`` and ``repr()`` keep working.
        """
        self.words = [] if words is None else words
        self.bbox = bbox

    @property
    def text(self):
        """The text of all words concatenated without a separator."""
        return "".join(word.text for word in self.words)

    def __repr__(self):
        return "Span <%s> %s" % ([round(i) for i in self.bbox], self.words)

    @classmethod
    def from_pymupdf(cls, span: dict, page_orientation: PageOrientation) -> "Span":
        """
        Convert a PyMuPDF span dictionary into a Span.

        :param span: dictionary with a ``"chars"`` sequence (each item a
            dict with ``"c"`` and ``"bbox"`` entries) plus ``"font"``,
            ``"size"`` and ``"color"`` entries.
        :param page_orientation: passed on to ``create_bbox_backend``.
        :return: a Span whose bbox encloses all generated words.
        """
        new_words = []
        # Consecutive characters are grouped by "is not a space / no-break
        # space", so word runs and whitespace runs alternate.
        runs = itertools.groupby(
            span["chars"], key=lambda char: char["c"] not in (" ", "\xa0")
        )
        for _, run in runs:
            chars = list(run)
            first_box, last_box = chars[0]["bbox"], chars[-1]["bbox"]
            # mupdf has top as zero and left as zero by default
            bbox = create_bbox_backend(
                backend=Backend.PYMUPDF,
                coords=(
                    get_leftmost(first_box),
                    get_topmost(first_box),
                    get_rightmost(last_box),
                    get_bottommost(last_box),
                ),
                page_orientation=page_orientation,
            )
            new_words.append(
                Word(
                    text="".join(char["c"] for char in chars),
                    bbox=bbox,
                    font=span["font"],
                    size=span["size"],
                    color=span["color"],
                    normalize_text=True,
                )
            )
        return cls(words=new_words, bbox=get_span_bbox(new_words))

    @classmethod
    def from_pdfminer(cls, span: List["pdfminer.layout.LTChar"], page_orientation: PageOrientation) -> "Span":
        """
        Convert a list of pdfminer characters into a Span.
        Split a list by space into Words.

        :param span: list of characters
        :param page_orientation: passed on to ``create_bbox_backend``.
        :return: a Span whose bbox encloses all generated words.
        """
        # Import the class directly: a bare ``import pdfminer`` does not
        # guarantee that the ``pdfminer.layout`` submodule is loaded.
        from pdfminer.layout import LTAnno

        new_words = []
        runs = itertools.groupby(
            span, key=lambda char: char.get_text() not in (" ", "\xa0")
        )
        for _, run in runs:
            # LTAnno items are virtual characters inserted by the layout
            # analyzer; they have no coordinates or font, so they cannot
            # contribute to a word and are dropped here.
            chars = [char for char in run if not isinstance(char, LTAnno)]
            if not chars:
                continue
            first, last = chars[0], chars[-1]
            # The raw pdfminer coordinates (zero at the bottom of the page)
            # are handed to create_bbox_backend, which presumably flips the
            # y-axis for the PDFMINER backend (confirm in layout.utils).
            bbox = create_bbox_backend(
                backend=Backend.PDFMINER,
                coords=(first.x0, first.y0, last.x1, last.y1),
                page_orientation=page_orientation,
            )
            new_words.append(
                Word(
                    text="".join(char.get_text() for char in chars),
                    bbox=bbox,
                    font=first.fontname,
                    size=first.size,
                    color=None,
                    normalize_text=True,
                )
            )
        return cls(words=new_words, bbox=get_span_bbox(new_words))
class Line:
    """
    A line made of spans, together with its bounding box.
    """

    __slots__ = ("bbox", "spans")

    def __init__(self, bbox: "Bbox", spans):
        """
        :param bbox: bounding box of the line.
        :param spans: iterable of span objects, each exposing ``text``.
        """
        self.bbox = bbox
        self.spans = spans

    def __repr__(self):
        # Interpolate with an f-string: ``"Line: %s" % self.spans`` treats a
        # tuple of spans as the argument list, which raises TypeError for
        # any length other than one and drops the tuple syntax for length one.
        return f"Line: {self.spans}"

    @property
    def text(self):
        """The text of all spans joined by single spaces."""
        return " ".join(span.text for span in self.spans)
class Block:
    """
    A collection of lines, together with its bounding box.
    """

    __slots__ = ("bbox", "lines")

    def __init__(self, bbox: "Bbox", lines):
        """
        :param bbox: bounding box of the block.
        :param lines: the lines contained in the block.
        """
        self.bbox = bbox
        self.lines = lines

    def __repr__(self):
        # Interpolate with an f-string: ``"Block: %s" % self.lines`` treats a
        # tuple of lines as the argument list, which raises TypeError for
        # any length other than one and drops the tuple syntax for length one.
        return f"Block: {self.lines}"
def get_span_bbox(span: List) -> Bbox:
    """
    Compute the box that encloses every item of a span.

    Both horizontal edges (``x0`` and ``x1``) of every item are candidates
    for the left and right limits, and both vertical edges (``y0`` and
    ``y1``) are candidates for the top and bottom limits, so the result
    does not depend on which edge of an item holds the smaller value.

    :param span: items exposing a ``bbox`` with ``x0``/``y0``/``x1``/``y1``.
        An empty span makes ``min()`` raise ``ValueError``.
    :return: a Bbox built from the smallest x, smallest y, largest x and
        largest y found.
    """
    boxes = [item.bbox for item in span]
    x0s = [box.x0 for box in boxes]
    y0s = [box.y0 for box in boxes]
    x1s = [box.x1 for box in boxes]
    y1s = [box.y1 for box in boxes]

    return Bbox(
        x0=min(min(x0s), min(x1s)),
        y0=min(min(y0s), min(y1s)),
        x1=max(max(x0s), max(x1s)),
        y1=max(max(y0s), max(y1s)),
    )
def line2str(line: List[Word]) -> str:
    """Return the string form of each word in ``line``, joined by single spaces."""
    return " ".join(str(word) for word in line)