import itertools
from collections import defaultdict
from dataclasses import dataclass
from enum import Enum
from typing import List, Tuple, Iterable, NamedTuple, Dict
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal
@dataclass(frozen=True)
[docs]class VerticalOrientation:
"""
Direction of a Y-axis. Bottom→Top or Top→Bottom.
"""
@dataclass(frozen=True)
[docs]class HorizontalOrientation:
"""
Direction of a X-axis. Left→Right or Right→Left.
"""
@dataclass(frozen=True)
[docs]class Orientation:
"""
Directions of X and Y axes.
"""
[docs] vertical_orientation: VerticalOrientation
[docs] horizontal_orientation: HorizontalOrientation
@classmethod
[docs] def create(cls, left_is_zero=True, bottom_is_zero=False):
return cls(
horizontal_orientation=HorizontalOrientation(left_is_zero=left_is_zero),
vertical_orientation=VerticalOrientation(bottom_is_zero=bottom_is_zero),
)
@dataclass(frozen=True)
[docs]class PageOrientation:
"""
Directions of X/Y axes together with page dimensions.
"""
[docs] orientation: Orientation
[docs] page_height: float
@property
[docs] def left_is_zero(self):
return self.orientation.horizontal_orientation.left_is_zero
@property
[docs] def bottom_is_zero(self):
return self.orientation.vertical_orientation.bottom_is_zero
@classmethod
[docs] def create(cls, left_is_zero=True, bottom_is_zero=False, page_width=None, page_height=None):
orientation = Orientation(
horizontal_orientation=HorizontalOrientation(left_is_zero=left_is_zero),
vertical_orientation=VerticalOrientation(bottom_is_zero=bottom_is_zero),
)
return cls(orientation=orientation, page_height=page_height, page_width=page_width)
[docs]class Bbox(NamedTuple):
"""
A rectangular bounding box.
"""
[docs] def __str__(self) -> str:
return f"Bbox(x0={self.x0:.2f},y0={self.y0:.2f},x1={self.x1:.2f},y1={self.y1:.2f})"
[docs] def __eq__(self, other, decimals=1, n=4) -> bool:
return [round(i, ndigits=decimals) for i in self[:n]] == [round(i, ndigits=decimals) for i in other[:n]]
[docs] def move(self, delta=(0, 0, 0, 0)):
return self + Bbox(*delta)
[docs] def __add__(self, other: 'Bbox'):
return Bbox(*[x + y for x, y in zip(self, other)])
[docs] def isclose(self, other: 'Bbox', tolerance: float):
"""Check if two bboxes are close to each other."""
return (
abs(self.x0 - other.x0) < tolerance
and abs(self.y0 - other.y0) < tolerance
and abs(self.x1 - other.x1) < tolerance
and abs(self.y1 - other.y1) < tolerance
)
[docs] def isinside(self, other: 'Bbox') -> bool:
"""Check if this bbox is inside another bbox."""
return self.x0 >= other.x0 and self.y0 >= other.y0 and self.x1 <= other.x1 and self.y1 <= other.y1
@property
[docs] def height(self) -> float:
return abs(self.y0 - self.y1)
@property
[docs] def width(self) -> float:
return abs(self.x0 - self.x1)
@classmethod
[docs] def from_coords(cls, coords, invert_y=False, invert_x=False, page_height=None, page_width=None) -> "Bbox":
x0, y0, x1, y1 = coords
if invert_y:
y0, y1 = page_height - y1, page_height - y0
if invert_x:
x0, x1 = page_width - x1, page_width - x0
return cls(x0, y0, x1, y1)
[docs]class Rectangular:
"""
An object with a rectangular bounding box.
"""
[docs] def move(self, delta):
attributes = {k: v for k, v in vars(self).items()}
bbox = attributes.pop('bbox')
attributes['bbox'] = bbox + Bbox(*delta)
return type(self)(**attributes)
@property
[docs] def width(self):
return self.bbox.width
@property
[docs] def height(self):
return self.bbox.height
@property
[docs] def x0(self):
return self.bbox.x0
@property
[docs] def x1(self):
return self.bbox.x1
@property
[docs] def y0(self):
return self.bbox.y0
@property
[docs] def y1(self):
return self.bbox.y1
[docs]DEFAULT_BACKEND_PAGE_ORIENTATIONS: Dict[Literal[Backend.PDFMINER, Backend.PYMUPDF], Orientation] = {
Backend.PDFMINER: Orientation.create(bottom_is_zero=True, left_is_zero=True),
Backend.PYMUPDF: Orientation.create(bottom_is_zero=False, left_is_zero=True),
}
[docs]def create_bbox_backend(backend: Backend, coords, page_orientation: PageOrientation) -> Bbox:
"""
Creates a bbox taking into account axis direction from a given page.
:param backend: backend type
:param coords: 4-item sequence of x0,y0,x1,y1 coordinates
:param page_orientation: page size together with X/Y axes directions.
:return: a bounding box
"""
bottom_is_zero = DEFAULT_BACKEND_PAGE_ORIENTATIONS[backend].vertical_orientation.bottom_is_zero
left_is_zero = DEFAULT_BACKEND_PAGE_ORIENTATIONS[backend].horizontal_orientation.left_is_zero
return Bbox.from_coords(
coords=coords,
invert_y=page_orientation.bottom_is_zero ^ bottom_is_zero,
invert_x=page_orientation.left_is_zero ^ left_is_zero,
page_height=page_orientation.page_height,
page_width=page_orientation.page_width,
)
@dataclass(frozen=True)
[docs]class Color:
[docs] def __eq__(self, other, decimals=1):
if (
round(self.r, decimals) == round(other.r, decimals)
and round(self.b, decimals) == round(other.b, decimals)
and round(self.g, decimals) == round(other.g, decimals)
):
return True
else:
return False
[docs]def get_bbox(block) -> Tuple[float, float, float, float]:
if hasattr(block, "bbox"):
block = block.bbox
if type(block) == dict and "rect" in block:
block = block["rect"]
x0, y0, x1, y1, *_ = block
return x0, y0, x1, y1
[docs]def get_rightmost(block) -> float:
x0, _, x1, _, *_ = get_bbox(block)
return max(x0, x1)
[docs]def get_leftmost(block) -> float:
x0, _, x1, _, *_ = get_bbox(block)
return min(x0, x1)
[docs]def get_topmost(block) -> float:
# top is zero
_, y0, _, y1, *_ = get_bbox(block)
return min(y0, y1)
[docs]def get_bottommost(block) -> float:
# bottom is infinity
_, y0, _, y1, *_ = get_bbox(block)
return max(y0, y1)
[docs]def group_objs(words: List, gap: float = 5, decimals: int = 1, axis: str = 'y') -> List[List]:
"""
Group words into vertically adjacent lines.
First, create a dictionary with rounded y-coordinates as keys, and lists of words as values.
Then merge together lists whose coordinate delta is <= gap.
:param words: list of Words
:param gap: vertical delta between lines to be merged.
:param decimals: rounding precision.
:param axis: horizontal (x) or vertical (y) grouping
:return: vertically grouped lines, each line is sorted horizontally inside.
"""
d = defaultdict(list)
if axis == 'y':
func = get_topmost
else:
func = get_leftmost
for i in sorted(words, key=lambda x: round(func(x), decimals)):
d[round(func(i), decimals)].append(i)
lines = list(d.items())
total = []
curr_group = [lines[0][1]]
for n, (y, i) in enumerate(lines):
if n == 0:
continue
left = lines[n - 1][0]
right = y
dist = abs(left - right)
if dist <= gap:
curr_group.append(i)
else:
total.append(sum(curr_group, []))
curr_group = [i]
total.append(sum(curr_group, []))
# sort every line horizontally
total = [sorted(i, key=lambda x: get_leftmost(x) if axis == 'y' else get_topmost(x)) for i in total]
return total
[docs]def get_center_group(group: List) -> float:
"""
Get a middle point of a group of words.
"""
left = get_leftmost(group[0])
right = get_rightmost(group[-1])
return (left + right) / 2
[docs]def get_center(obj) -> float:
"""
Get a middle point of a word.
"""
return (obj.bbox.x0 + obj.bbox.x1) / 2
[docs]def flatten(items):
"""Yield items from any nested iterable."""
for x in items:
if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
for sub_x in flatten(x):
yield sub_x
else:
yield x
[docs]def groupby_consec(df, col):
string_groups = sum([["%s_%s" % (i, n) for i in g] for n, (k, g) in enumerate(itertools.groupby(df[col]))], [],)
return df.groupby(string_groups, sort=False)