"""Source code for pdfscraper.layout.image."""

from __future__ import annotations

import os
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Optional, Any, Tuple, Iterable, Iterator

try:
    from typing import Literal, TypedDict
except ImportError:
    from typing_extensions import Literal, TypedDict  # type: ignore

from pdfscraper.layout.utils import (
    Bbox,
    PageOrientation,
    create_bbox_backend,
    Backend,
    Rectangular,
)

# Name of the backend an image record came from. ``Image.save`` compares
# ``Image.source`` against these two values to choose the export routine.
ImageSource = Literal["pdfminer", "mupdf"]
def get_image(layout_object) -> Optional["pdfminer.layout.LTImage"]:
    """
    Find the first pdfminer image inside a layout object.

    The object itself is returned when it is an ``LTImage``. An
    ``LTContainer`` is searched depth-first, child by child, and the first
    image found is returned.

    :param layout_object: a pdfminer layout object (or anything else, in
        which case ``None`` is returned).
    :return: the first ``LTImage`` found, or ``None`` when there is none.
    """
    # Import the names from the submodule directly: a bare ``import pdfminer``
    # does not by itself guarantee that ``pdfminer.layout`` is bound.
    from pdfminer.layout import LTContainer, LTImage

    if isinstance(layout_object, LTImage):
        return layout_object
    if isinstance(layout_object, LTContainer):
        for child in layout_object:
            found = get_image(child)
            # Keep scanning the remaining children when this one holds no
            # image; returning unconditionally here would only ever inspect
            # the first child of every container.
            if found is not None:
                return found
    return None
@contextmanager
def attr_as(obj, field: str, value) -> Iterator[None]:
    """
    Temporarily set ``obj.<field>`` to ``value`` inside a ``with`` block.

    The previous value is read before the swap and written back when the
    block exits, whether it exits normally or by raising.

    :param obj: object whose attribute is swapped.
    :param field: name of an attribute that already exists on ``obj``.
    :param value: value the attribute holds while the block runs.
    :raises AttributeError: if ``obj`` has no attribute named ``field``.
    """
    old_value = getattr(obj, field)
    setattr(obj, field, value)
    try:
        yield
    finally:
        # Without ``finally`` an exception raised inside the ``with`` body
        # would leave the temporary value on the object for good.
        setattr(obj, field, old_value)
@dataclass(frozen=True)
class Image(Rectangular):
    """
    An image created from pdfminer or pymupdf object.

    Instances are normally built with :meth:`from_pdfminer` or
    :meth:`from_pymupdf` and written to disk with :meth:`save`.
    """

    # Bounding box produced by ``create_bbox_backend`` for the source backend.
    bbox: Bbox
    # Dimensions reported by the backend for the embedded image.
    source_width: Optional[int]
    source_height: Optional[int]
    colorspace_name: Optional[str]
    # Bits per component as reported by the backend.
    bpc: Optional[int]
    # Object id of the image stream; passed to ``extract_image`` for mupdf.
    xref: Optional[int]
    name: Optional[str]
    # Which backend produced this record; drives ``save``.
    source: ImageSource
    # Backend object the record was built from (``LTImage`` or a
    # ``MuPDFImage`` dict).
    raw_object: Any = None
    # Owning document; only set by ``from_pymupdf``.
    parent_object: Any = None
    # Not populated by either constructor in this module.
    colorspace_n: Optional[int] = None

    class Config:
        # NOTE(review): looks like a pydantic-style config; the standard
        # ``dataclasses.dataclass`` decorator does not read it - confirm
        # whether ``Rectangular`` or another consumer still needs it.
        arbitrary_types_allowed = True

    def __hash__(self):
        """Hash an image by its bounding box only."""
        return hash(self.bbox)

    def __contains__(self, other):
        """
        Tell whether ``other``'s raw bounding box lies inside this one.

        NOTE(review): this subscripts ``raw_object``, which works for the
        dict stored by ``from_pymupdf`` but presumably not for the
        ``LTImage`` stored by ``from_pdfminer`` - confirm the intended scope.
        """
        return self.raw_object['bbox'].contains(other.raw_object['bbox'])

    def _save_pdfminer(self, path: str):
        """
        Export a pdfminer-backed image through pdfminer's ``ImageWriter``.

        The extension of ``path`` is discarded: only the directory and the
        file stem are handed to ``ImageWriter``.

        :param path: destination path; made absolute before it is split.
        :return: whatever ``ImageWriter.export_image`` returns.
        """
        from pdfminer.image import ImageWriter

        stem, _ = os.path.splitext(path)
        folder, name = os.path.split(os.path.abspath(stem))
        im = self.raw_object
        # The image's ``name`` is swapped to the requested stem only for the
        # duration of the export.
        with attr_as(im, "name", name):
            return ImageWriter(folder).export_image(im)

    def _save_pymupdf(self, path: str):
        """
        Write the image bytes extracted from the parent document to ``path``.

        :param path: destination file, opened in binary write mode.
        """
        # Extract before opening the file so that a failed extraction does
        # not leave an empty file behind.
        data = self.parent_object.extract_image(self.xref)["image"]
        with open(path, "wb") as f:
            f.write(data)

    def save(self, path: str):
        """
        Save the image to ``path`` using the routine matching ``source``.

        :param path: destination path.
        :raises ValueError: if ``source`` is not a known backend name.
        """
        if self.source == "pdfminer":
            self._save_pdfminer(path)
        elif self.source == "mupdf":
            self._save_pymupdf(path)
        else:
            # Fail loudly instead of returning as if a file had been written.
            raise ValueError(f"unsupported image source: {self.source!r}")

    @staticmethod
    def _pdfminer_colorspace_name(image) -> Optional[str]:
        """Return the name of the image's first colorspace entry, or ``None``."""
        from pdfminer.psparser import PSLiteral

        # Guard against an empty colorspace list before indexing into it.
        if not image.colorspace:
            return None
        first = image.colorspace[0]
        if hasattr(first, "name"):
            return first.name
        if not first:
            return None
        # Entries without a ``name`` are resolved before being inspected.
        resolved = first.resolve()
        if isinstance(resolved, PSLiteral):
            return resolved.name
        if not resolved:
            return None
        named = [entry for entry in resolved if hasattr(entry, "name")]
        # A resolved sequence may contain no named entry at all.
        return named[0].name if named else None

    @classmethod
    def from_pdfminer(
        cls, image: "pdfminer.layout.LTImage", page_orientation: PageOrientation
    ) -> "Image":
        """
        Create an image out of pdfminer object.

        :param image: pdfminer LTImage object.
        :param page_orientation: page orientation data.
        :return: the new ``Image`` with ``source`` set to ``"pdfminer"``.
        """
        bbox = create_bbox_backend(
            backend=Backend.PDFMINER,
            coords=image.bbox,
            page_orientation=page_orientation,
        )
        source_width, source_height = image.srcsize
        return cls(
            bbox=bbox,
            source_width=source_width,
            source_height=source_height,
            colorspace_name=cls._pdfminer_colorspace_name(image),
            bpc=image.bits,
            xref=image.stream.objid,
            name=image.name,
            raw_object=image,
            source="pdfminer",
        )

    @classmethod
    def from_pymupdf(
        cls,
        image: MuPDFImage,
        doc: "fitz.fitz.Document",
        page_orientation: PageOrientation,
    ) -> "Image":
        """
        Create an image out of a ``MuPDFImage`` dictionary.

        :param image: image description; missing keys are read as ``None``.
        :param doc: document the image belongs to, kept as ``parent_object``
            so that ``save`` can extract the image bytes later.
        :param page_orientation: page orientation data.
        :return: the new ``Image`` with ``source`` set to ``"mupdf"``.
        """
        bbox = create_bbox_backend(
            backend=Backend.PYMUPDF,
            coords=image.get("bbox"),
            page_orientation=page_orientation,
        )
        return cls(
            bbox=bbox,
            source_width=image.get("source_width"),
            source_height=image.get("source_height"),
            colorspace_name=image.get("colorspace_name"),
            bpc=image.get("bpc"),
            xref=image.get("xref"),
            name=image.get("name"),
            raw_object=image,
            source="mupdf",
            parent_object=doc,
        )
class MuPDFImage(TypedDict):
    """
    Dictionary describing one image found on a PyMuPDF page.

    This is the shape of the items yielded by
    ``get_images_from_pymupdf_page``.
    """

    # Taken from the matching positions of a ``page.get_images(full=True)``
    # entry.
    xref: int
    mask_xref: int  # the ``smask`` entry of the image tuple
    source_width: int
    source_height: int
    bpc: int
    colorspace_name: str  # the ``colorspace`` entry of the image tuple
    name: str
    decode_filter: str
    # Value returned by ``page.get_image_bbox``.
    # NOTE(review): PyMuPDF presumably returns a Rect here rather than a
    # plain tuple - confirm the annotation.
    bbox: Tuple
def get_images_from_pymupdf_page(page) -> Iterable["MuPDFImage"]:
    """
    Yield one ``MuPDFImage`` dictionary per image listed on a PyMuPDF page.

    :param page: object providing ``get_images(full=True)`` and
        ``get_image_bbox(item)``.
    :return: a generator of image dictionaries, in the order the page lists
        them.
    :raises ValueError: if an entry does not have exactly ten fields.
    """
    for item in page.get_images(full=True):
        (
            xref,
            smask,
            source_width,
            source_height,
            bpc,
            colorspace,
            _alt_colorspace,
            name,
            decode_filter,
            _referencer_xref,
        ) = item
        # The entry is handed to ``get_image_bbox`` as-is; there is no need
        # to rebuild an identical tuple from the unpacked fields.
        bbox = page.get_image_bbox(item)
        yield {
            "xref": xref,
            "mask_xref": smask,
            "source_width": source_width,
            "source_height": source_height,
            "bpc": bpc,
            "colorspace_name": colorspace,
            "name": name,
            "decode_filter": decode_filter,
            "bbox": bbox,
        }