Source code for pdfscraper.layout.image

from __future__ import annotations

import os
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Optional, Any, Tuple, Iterable, Iterator

try:
    from typing import Literal, TypedDict
except ImportError:
    from typing_extensions import Literal, TypedDict  # type: ignore

from pdfscraper.layout.utils import (
    Bbox,
    PageOrientation,
    create_bbox_backend,
    Backend,
    Rectangular,
)

# Name of the backend an Image was created from; Image.save() dispatches on it.
ImageSource = Literal["pdfminer", "mupdf"]


def get_image(layout_object) -> Optional["pdfminer.layout.LTImage"]:
    """
    Return the first ``LTImage`` found in a pdfminer layout object.

    The object itself is returned when it is an ``LTImage``. When it is an
    ``LTContainer`` its children are searched depth-first, in iteration
    order, and the first image found is returned.

    :param layout_object: any object; only pdfminer ``LTImage`` and
        ``LTContainer`` instances are inspected.
    :return: the first image found, or ``None`` if there is none.
    """
    # Import from the submodule explicitly: a bare ``import pdfminer`` does not
    # guarantee that ``pdfminer.layout`` has been loaded.
    from pdfminer.layout import LTContainer, LTImage

    if isinstance(layout_object, LTImage):
        return layout_object
    if isinstance(layout_object, LTContainer):
        for child in layout_object:
            # Keep scanning siblings: returning the first child's result
            # unconditionally would miss images that are not in the first branch.
            found = get_image(child)
            if found is not None:
                return found
    return None


@contextmanager
def attr_as(obj, field: str, value) -> Iterator[None]:
    """
    Temporarily set ``obj.<field>`` to ``value`` for the duration of a ``with`` block.

    The previous value is restored when the block exits, including when the
    block raises.

    :param obj: object whose attribute is overridden.
    :param field: name of the attribute; it must already exist on ``obj``
        (``getattr`` is called without a default).
    :param value: temporary value for the attribute.
    """
    old_value = getattr(obj, field)
    setattr(obj, field, value)
    try:
        yield
    finally:
        # Restore even if the body raised, so the object is never left with
        # the temporary value.
        setattr(obj, field, old_value)


@dataclass(frozen=True)
class Image(Rectangular):
    """
    An image created from pdfminer or pymupdf object.

    Instances are normally built with :meth:`from_pdfminer` or
    :meth:`from_pymupdf` and written to disk with :meth:`save`.
    """

    bbox: Bbox
    source_width: Optional[int]
    source_height: Optional[int]
    colorspace_name: Optional[str]
    bpc: Optional[int]
    xref: Optional[int]
    name: Optional[str]
    # Backend that produced the image; selects the implementation used by save().
    source: ImageSource
    # Backend object the image was built from: the LTImage in from_pdfminer,
    # the MuPDFImage dict in from_pymupdf.
    raw_object: Any = None
    # Set to the document by from_pymupdf; _save_pymupdf calls extract_image on it.
    parent_object: Any = None
    colorspace_n: Optional[int] = None

    class Config:
        # NOTE(review): looks like a pydantic-style config holder; nothing in
        # this module reads it -- confirm whether it is still needed.
        arbitrary_types_allowed = True

    def __hash__(self):
        """Hash on the bounding box only."""
        return hash(self.bbox)

    def __contains__(self, other):
        """Return whether this image's raw bbox contains ``other``'s raw bbox."""
        # NOTE(review): indexes raw_object with 'bbox', which matches the dict
        # stored by from_pymupdf; from_pdfminer stores the LTImage itself, so
        # this presumably only works for mupdf-sourced images -- confirm.
        return self.raw_object['bbox'].contains(other.raw_object['bbox'])

    def _save_pdfminer(self, path: str):
        """
        Export the underlying pdfminer image through ``pdfminer.image.ImageWriter``.

        The extension of ``path`` is dropped; the remaining absolute path is
        split into an output folder and a base name. The raw object's ``name``
        is temporarily replaced by that base name while ``export_image`` runs.

        :param path: destination path.
        :return: whatever ``ImageWriter.export_image`` returns.
        """
        from pdfminer.image import ImageWriter

        stem, _ = os.path.splitext(path)
        folder, name = os.path.split(os.path.abspath(stem))
        im = self.raw_object
        with attr_as(im, "name", name):
            return ImageWriter(folder).export_image(im)

    def _save_pymupdf(self, path: str) -> None:
        """
        Write the image bytes extracted from the parent document to ``path``.

        :param path: destination file, opened in binary write mode.
        """
        with open(path, "wb") as f:
            f.write(self.parent_object.extract_image(self.xref)["image"])

    def save(self, path: str) -> None:
        """
        Save the image to ``path`` using the backend named by ``source``.

        A ``source`` other than ``"pdfminer"`` or ``"mupdf"`` is ignored.

        :param path: destination path.
        """
        if self.source == "pdfminer":
            self._save_pdfminer(path)
        elif self.source == "mupdf":
            self._save_pymupdf(path)

    @classmethod
    def from_pdfminer(
        cls,
        image: "pdfminer.layout.LTImage",
        page_orientation: PageOrientation,
    ) -> "Image":
        """
        Create an image out of pdfminer object.

        :param image: pdfminer LTImage object.
        :param page_orientation: page orientation data.
        :return: the new image, with ``source`` set to ``"pdfminer"`` and
            ``raw_object`` set to ``image``.
        """
        from pdfminer.psparser import PSLiteral

        bbox = create_bbox_backend(
            backend=Backend.PDFMINER,
            coords=image.bbox,
            page_orientation=page_orientation,
        )

        colorspace = image.colorspace[0]
        if hasattr(colorspace, "name"):
            colorspace_name = colorspace.name
        else:
            # A truthy entry without ``name`` is expected to expose ``resolve()``
            # (presumably a pdfminer indirect reference -- confirm).
            resolved = colorspace.resolve() if colorspace else None
            if isinstance(resolved, PSLiteral):
                colorspace_name = resolved.name
            elif resolved:
                # First entry carrying a ``name``; None when there is none,
                # instead of failing with IndexError.
                colorspace_name = next(
                    (item.name for item in resolved if hasattr(item, "name")),
                    None,
                )
            else:
                colorspace_name = None

        source_width, source_height = image.srcsize
        return cls(
            bbox=bbox,
            source_width=source_width,
            source_height=source_height,
            colorspace_name=colorspace_name,
            bpc=image.bits,
            xref=image.stream.objid,
            name=image.name,
            raw_object=image,
            source="pdfminer",
        )

    @classmethod
    def from_pymupdf(
        cls,
        image: MuPDFImage,
        doc: "fitz.fitz.Document",
        page_orientation: PageOrientation,
    ) -> "Image":
        """
        Create an image out of a :class:`MuPDFImage` dict.

        Keys missing from ``image`` result in ``None`` fields, because every
        value is read with ``dict.get``.

        :param image: image description dict.
        :param doc: document the image belongs to; stored as ``parent_object``.
        :param page_orientation: page orientation data.
        :return: the new image, with ``source`` set to ``"mupdf"`` and
            ``raw_object`` set to ``image``.
        """
        bbox = create_bbox_backend(
            backend=Backend.PYMUPDF,
            coords=image.get("bbox"),
            page_orientation=page_orientation,
        )

        return cls(
            bbox=bbox,
            source_width=image.get("source_width"),
            source_height=image.get("source_height"),
            colorspace_name=image.get("colorspace_name"),
            bpc=image.get("bpc"),
            xref=image.get("xref"),
            name=image.get("name"),
            raw_object=image,
            source="mupdf",
            parent_object=doc,
        )


class MuPDFImage(TypedDict):
    """
    Description of one image on a PyMuPDF page.

    Dicts of this shape are produced by ``get_images_from_pymupdf_page`` from
    the entries of ``page.get_images(full=True)``.
    """

    xref: int
    # Filled from the ``smask`` field of the page image entry.
    mask_xref: int
    source_width: int
    source_height: int
    bpc: int
    colorspace_name: str
    name: str
    decode_filter: str
    # Value returned by ``page.get_image_bbox`` for the entry.
    bbox: Tuple


def get_images_from_pymupdf_page(page) -> Iterable[MuPDFImage]:
    """
    Yield one :class:`MuPDFImage` dict per image listed on a PyMuPDF page.

    Every entry of ``page.get_images(full=True)`` must have exactly ten
    fields; the alternate colorspace and the referencer xref are not copied
    into the result. The bounding box comes from ``page.get_image_bbox``,
    called with the entry itself.

    :param page: object providing ``get_images(full=True)`` and
        ``get_image_bbox(item)``.
    :return: lazily generated image description dicts.
    :raises ValueError: if an entry does not unpack into ten fields.
    """
    for item in page.get_images(full=True):
        (
            xref,
            smask,
            source_width,
            source_height,
            bpc,
            colorspace,
            _alt_colorspace,
            name,
            decode_filter,
            _referencer_xref,
        ) = item
        yield {
            "xref": xref,
            "mask_xref": smask,
            "source_width": source_width,
            "source_height": source_height,
            "bpc": bpc,
            "colorspace_name": colorspace,
            "name": name,
            "decode_filter": decode_filter,
            # Pass the entry through as-is; rebuilding an identical tuple
            # from the unpacked fields is unnecessary.
            "bbox": page.get_image_bbox(item),
        }