# Source code for pdfscraper.page

from __future__ import annotations

from dataclasses import dataclass
from typing import List, Callable, Union, Tuple, Optional

from pdfscraper.layout.drawing import (
    Shape,
    process_pymupdf_drawing,
    process_pdfminer_drawing,
)
from pdfscraper.layout.image import Image, get_images_from_pymupdf_page, get_image
from pdfscraper.layout.text import Word, TextLine, Block, Line, Span
from pdfscraper.layout.utils import (
    Bbox,
    group_objs,
    get_topmost,
    Orientation,
    create_bbox_backend,
    Backend,
)
from pdfscraper.layout.utils import PageOrientation




class Page:
    """Words, drawings, images and text blocks extracted from one PDF page.

    Instances are normally built with :meth:`from_pymupdf` or
    :meth:`from_pdfminer`; the constructor only stores what it is given.
    """

    def __init__(
        self,
        words: List[Word],
        drawings: List[Shape],
        images: List[Image],
        raw_object: Union["fitz.fitz.Page", "pdfminer.layout.LTPage"],
        blocks: List[Block],
    ) -> None:
        """
        Store the extracted page content.

        :param words: words found on the page
        :param drawings: vector drawings found on the page
        :param images: images found on the page
        :param raw_object: the backend page object the content came from
        :param blocks: text blocks found on the page
        """
        self.words = words
        self.drawings = drawings
        self.images = images
        self.raw_object = raw_object
        self.blocks = blocks

    def __repr__(self) -> str:
        """Return ``"Page: "`` followed by the repr of every word, one per line."""
        return "Page: " + "".join(f"{word!r}\n" for word in self.words)

    def select(self, condition: Callable) -> Page:
        """
        Find content matching condition.

        ``condition`` is called on every word, drawing and image; objects for
        which it returns a truthy value are kept. ``blocks`` and
        ``raw_object`` are carried over unfiltered.

        :param condition: predicate taking a single page object
        :return: a new Page holding only the matching objects
        """
        return Page(
            words=[word for word in self.words if condition(word)],
            drawings=[drawing for drawing in self.drawings if condition(drawing)],
            images=[image for image in self.images if condition(image)],
            blocks=self.blocks,
            raw_object=self.raw_object,
        )

    def __add__(self, other, other_position_delta: Optional[Bbox] = None) -> "Page":
        """
        Create a new page by summing objects of this and another page.

        To concatenate them vertically or horizontally move all objects of the
        other page by specified delta. The delta can only be supplied by
        calling ``page.__add__(other, delta)`` directly; the ``+`` operator
        always uses the default (no move).

        Only words, drawings and images are moved; ``blocks`` of both pages
        are concatenated as they are. The resulting ``raw_object`` is a
        two-element list ``[self.raw_object, other.raw_object]``.

        :param other: another Page
        :param other_position_delta: offset applied to every word, drawing and
            image of ``other`` via their ``move(delta=...)`` method; falsy
            means "do not move"
        :return: a new Page
        """

        def move_objects(objects, delta):
            # A falsy delta leaves the objects (and the list itself) untouched.
            if delta:
                return [obj.move(delta=delta) for obj in objects]
            return objects

        return Page(
            words=self.words + move_objects(other.words, delta=other_position_delta),
            drawings=self.drawings + move_objects(other.drawings, delta=other_position_delta),
            images=self.images + move_objects(other.images, delta=other_position_delta),
            blocks=self.blocks + other.blocks,
            raw_object=[self.raw_object, other.raw_object],
        )

    @staticmethod
    def _split_sequence_by_condition(seq, condition):
        """
        Partition ``seq`` by ``condition``, preserving order.

        :param seq: iterable of items
        :param condition: predicate taking a single item
        :return: ``(matching, non_matching)`` as two lists
        """
        success = []
        failure = []
        for item in seq:
            (success if condition(item) else failure).append(item)
        return success, failure

    def split(self, condition: Callable):
        """
        Split the page content into two sections by ``condition``.

        :param condition: predicate taking a single page object
        :return: ``(matching, non_matching)`` as two ``PageSection`` objects.
            Both sections reference this page as ``parent`` and both store the
            same ``condition`` (it is not negated for the second one).
        """
        words_true, words_false = self._split_sequence_by_condition(self.words, condition)
        drawings_true, drawings_false = self._split_sequence_by_condition(self.drawings, condition)
        images_true, images_false = self._split_sequence_by_condition(self.images, condition)
        matching = PageSection(
            words=words_true,
            drawings=drawings_true,
            images=images_true,
            parent=self,
            condition=condition,
        )
        remaining = PageSection(
            words=words_false,
            drawings=drawings_false,
            images=images_false,
            parent=self,
            condition=condition,
        )
        return matching, remaining

    def take_screenshot(self, area: Tuple[float, float, float, float], output_path):
        """
        Render ``area`` of the page at 300 dpi and save it to ``output_path``.

        :param area: clip rectangle passed to PyMuPDF's ``get_pixmap``
        :param output_path: destination passed to the pixmap's ``save``
        :raises NotImplementedError: if ``raw_object`` is not a PyMuPDF page
        """
        # Imported lazily so that PyMuPDF is only needed when screenshots are taken.
        import fitz

        if not isinstance(self.raw_object, fitz.fitz.Page):
            raise NotImplementedError("Only PyMuPDF pages support taking screenshots.")
        self.raw_object.get_pixmap(dpi=300, clip=area).save(output_path)

    @property
    def sorted(self) -> List[TextLine]:
        """
        Words grouped into text lines by ``group_objs``.

        NOTE(review): for a page without words this returns ``[[]]`` (a list
        holding one empty list), not an empty list of ``TextLine`` objects;
        kept as is because callers may depend on it.
        """
        if not self.words:
            return [[]]
        return [TextLine(line) for line in group_objs(self.words)]

    @property
    def sorted_lines(self) -> Optional[SortedTextlines]:
        """
        Words grouped into text lines, wrapped in ``SortedTextlines``.

        Returns ``None`` for a page without words.

        NOTE(review): ``SortedTextlines`` is not imported in the visible part
        of this module; confirm it is defined or imported elsewhere.
        """
        if not self.words:
            return None
        textlines = [TextLine(line) for line in group_objs(self.words)]
        return SortedTextlines(textlines=textlines, words=self.words)

    @staticmethod
    def _make_page_orientation(orientation, page_width, page_height) -> PageOrientation:
        """
        Build the PageOrientation shared by both backend constructors.

        Without an explicit ``orientation`` the default is created with
        ``bottom_is_zero=False`` and ``left_is_zero=True``.
        """
        if not orientation:
            return PageOrientation.create(
                bottom_is_zero=False,
                left_is_zero=True,
                page_width=page_width,
                page_height=page_height,
            )
        return PageOrientation(
            orientation=orientation,
            page_width=page_width,
            page_height=page_height,
        )

    @classmethod
    def from_pymupdf(cls, page: "fitz.fitz.Page", orientation: Optional[Orientation] = None) -> Page:
        """
        Build a Page from a PyMuPDF page.

        :param page: the PyMuPDF page to read
        :param orientation: coordinate orientation; when omitted a default
            with ``bottom_is_zero=False, left_is_zero=True`` is used
        :return: a Page whose ``raw_object`` is ``page``
        """
        page_orientation = cls._make_page_orientation(
            orientation, page.rect.width, page.rect.height
        )

        def to_bbox(coords):
            return create_bbox_backend(
                backend=Backend.PYMUPDF,
                coords=coords,
                page_orientation=page_orientation,
            )

        # Pre-sort by the raw rect's second coordinate so that drawings which
        # tie on get_topmost keep that order (sorted() is stable).
        raw_drawings = sorted(page.get_drawings(), key=lambda drawing: drawing["rect"][1])
        drawings = sorted(
            (process_pymupdf_drawing(drawing, page_orientation) for drawing in raw_drawings),
            key=get_topmost,
        )

        # Convert PyMuPDF's nested rawdict (blocks -> lines -> spans) into
        # Block / Line / Span objects in a single pass.
        blocks = []
        for raw_block in page.get_text("rawdict", flags=3)["blocks"]:
            lines = [
                Line(
                    bbox=to_bbox(raw_line["bbox"]),
                    spans=[
                        Span.from_pymupdf(raw_span, page_orientation)
                        for raw_span in raw_line["spans"]
                    ],
                )
                for raw_line in raw_block["lines"]
            ]
            blocks.append(Block(bbox=to_bbox(raw_block["bbox"]), lines=lines))
        words = [
            word
            for block in blocks
            for line in block.lines
            for span in line.spans
            for word in span.words
        ]

        images = [
            Image.from_pymupdf(image=image, doc=page.parent, page_orientation=page_orientation)
            for image in get_images_from_pymupdf_page(page)
        ]

        return cls(
            words=words,
            drawings=drawings,
            images=images,
            raw_object=page,
            blocks=blocks,
        )

    @classmethod
    def from_pdfminer(cls, page: "pdfminer.layout.LTPage", orientation: Optional[Orientation] = None) -> Page:
        """
        Build a Page from a pdfminer layout page.

        Every pdfminer text line is converted to a single ``Span`` and stored
        directly in ``Block.lines``; block bounding boxes are taken from the
        text box's ``bbox`` without applying the page orientation.

        :param page: the pdfminer ``LTPage`` to read
        :param orientation: coordinate orientation; when omitted a default
            with ``bottom_is_zero=False, left_is_zero=True`` is used
        :return: a Page whose ``raw_object`` is ``page``
        """
        # Import the submodule explicitly: a bare ``import pdfminer`` does not
        # guarantee that ``pdfminer.layout`` has been loaded.
        import pdfminer.layout

        page_orientation = cls._make_page_orientation(orientation, page.width, page.height)

        drawings = sorted(
            (
                process_pdfminer_drawing(item, page_orientation)
                for item in page
                if isinstance(item, pdfminer.layout.LTCurve)
            ),
            key=get_topmost,
        )

        # get_image returns a falsy value for items that carry no image.
        images = [
            Image.from_pdfminer(image, page_orientation)
            for image in filter(bool, map(get_image, page))
        ]

        blocks = []
        for text_box in page:
            if not hasattr(text_box, "get_text"):
                continue
            spans = [
                Span.from_pdfminer(
                    # LTAnno items are dropped before the span is built.
                    [char for char in text_line if not isinstance(char, pdfminer.layout.LTAnno)],
                    page_orientation,
                )
                for text_line in text_box
                if hasattr(text_line, "get_text")
            ]
            blocks.append(Block(bbox=Bbox(*text_box.bbox), lines=spans))
        words = [word for block in blocks for line in block.lines for word in line.words]

        return cls(
            words=words,
            images=images,
            drawings=drawings,
            raw_object=page,
            blocks=blocks,
        )
@dataclass
class PageSection(Page):
    """
    Part of a page, as produced by ``Page.split``.

    Because this is a dataclass, ``__init__``, ``__repr__`` and ``__eq__`` are
    generated from the fields below and replace the ones inherited from Page.
    """

    # Objects of the parent page that belong to this section.
    words: List[Word]
    drawings: List
    images: List
    # Predicate the parent page was split with (Page.split passes a callable).
    condition: Callable
    # Page this section was cut from.
    parent: Page
    # Optional label for the section.
    name: str = ""

    def __post_init__(self) -> None:
        """
        Copy ``raw_object`` and ``blocks`` from the parent page.

        The generated ``__init__`` never calls ``Page.__init__``, so without
        this the inherited ``select`` and ``__add__`` would fail with
        AttributeError when they read ``self.raw_object`` / ``self.blocks``.
        """
        self.raw_object = getattr(self.parent, "raw_object", None)
        self.blocks = getattr(self.parent, "blocks", [])