from __future__ import annotations
import os
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Optional, Any, Tuple, Iterable, Iterator
try:
from typing import Literal, TypedDict
except ImportError:
from typing_extensions import Literal, TypedDict # type: ignore
from pdfscraper.layout.utils import (
Bbox,
PageOrientation,
create_bbox_backend,
Backend,
Rectangular,
)
# Backend identifiers an image may originate from; ``Image.save`` dispatches
# on exactly these two values.
ImageSource = Literal["pdfminer", "mupdf"]
def get_image(layout_object) -> Optional["pdfminer.layout.LTImage"]:
    """
    Find the first pdfminer image inside a layout object.

    The search is depth-first: if ``layout_object`` is itself an ``LTImage``
    it is returned; if it is an ``LTContainer`` every child is searched in
    order until one of them yields an image.

    :param layout_object: any pdfminer layout object.
    :return: the first ``LTImage`` found, or ``None`` when there is none.
    """
    # Import the submodule explicitly: a bare ``import pdfminer`` does not
    # guarantee that ``pdfminer.layout`` has been loaded.
    from pdfminer.layout import LTContainer, LTImage

    if isinstance(layout_object, LTImage):
        return layout_object
    if isinstance(layout_object, LTContainer):
        for child in layout_object:
            # Keep scanning the remaining siblings when this child holds no
            # image instead of giving up after the first child.
            found = get_image(child)
            if found is not None:
                return found
    return None
@contextmanager
def attr_as(obj, field: str, value) -> Iterator[None]:
    """
    Temporarily set ``obj.<field>`` to ``value`` inside a ``with`` block.

    The previous value is put back when the block exits, including when the
    block raises.

    :param obj: object whose attribute is overridden.
    :param field: name of the attribute; it must already exist on ``obj``.
    :param value: value the attribute holds while the block runs.
    :raises AttributeError: if ``obj`` has no attribute named ``field``.
    """
    old_value = getattr(obj, field)
    setattr(obj, field, value)
    try:
        yield
    finally:
        # Restore even if the body raised, so the override never leaks.
        setattr(obj, field, old_value)
@dataclass(frozen=True)
class Image(Rectangular):
    """
    An image created from pdfminer or pymupdf object.

    Instances are built with :meth:`from_pdfminer` or :meth:`from_pymupdf`
    and can be written to disk with :meth:`save`.
    """

    # NOTE(review): ``bbox``, ``bpc``, ``xref``, ``name``, ``raw_object`` and
    # ``source`` are passed to the constructor and read by the methods below
    # but are not declared in this part of the file -- presumably they are
    # declared on ``Rectangular`` or elsewhere; confirm.
    source_width: Optional[int]
    source_height: Optional[int]
    colorspace_name: Optional[str]
    # Document the image was read from; ``_save_pymupdf`` calls
    # ``extract_image`` on it. Only ``from_pymupdf`` sets it.
    parent_object: Any = None
    # Not populated by either constructor in this class.
    colorspace_n: Optional[int] = None

    def __hash__(self):
        """Hash the image by its bounding box only."""
        return hash(self.bbox)

    def __contains__(self, other):
        """
        Tell whether the raw bounding box of ``other`` lies inside this one.

        NOTE(review): both operands are indexed as ``raw_object['bbox']``,
        which matches the dict stored by :meth:`from_pymupdf`; images built by
        :meth:`from_pdfminer` store an ``LTImage`` there -- confirm that it
        supports this lookup.
        """
        return self.raw_object['bbox'].contains(other.raw_object['bbox'])

    def _save_pdfminer(self, path: str):
        """
        Export the underlying pdfminer image through ``ImageWriter``.

        :param path: destination path; its extension is dropped before the
            export (presumably because ``ImageWriter`` picks the extension
            itself -- confirm).
        :return: whatever ``ImageWriter.export_image`` returns.
        """
        from pdfminer.image import ImageWriter

        stem, _ = os.path.splitext(path)
        folder, name = os.path.split(os.path.abspath(stem))
        im = self.raw_object
        # The writer is given the image under the requested file name; the
        # original name is restored once the export is done.
        with attr_as(im, "name", name):
            return ImageWriter(folder).export_image(im)

    def _save_pymupdf(self, path: str):
        """
        Write the image bytes extracted from the parent document to ``path``.

        :param path: destination file, opened in binary write mode.
        """
        # Extract first so a failing extraction does not leave an empty or
        # truncated file behind.
        data = self.parent_object.extract_image(self.xref)["image"]
        with open(path, "wb") as f:
            f.write(data)

    def save(self, path: str):
        """
        Save the image to ``path`` using the backend it was created from.

        :param path: destination path.
        :raises ValueError: if ``source`` is neither ``"pdfminer"`` nor
            ``"mupdf"``.
        """
        if self.source == "pdfminer":
            self._save_pdfminer(path)
        elif self.source == "mupdf":
            self._save_pymupdf(path)
        else:
            # Fail loudly instead of silently writing nothing.
            raise ValueError(f"Unsupported image source: {self.source!r}")

    @staticmethod
    def _pdfminer_colorspace_name(image) -> Optional[str]:
        """Return the name of the first colorspace entry of ``image``, or ``None``."""
        from pdfminer.psparser import PSLiteral

        if not image.colorspace:
            return None
        entry = image.colorspace[0]
        if hasattr(entry, "name"):
            return entry.name
        if not entry:
            return None
        # The entry carries no name itself; resolve it and look again.
        resolved = entry.resolve()
        if isinstance(resolved, PSLiteral):
            return resolved.name
        if not resolved:
            return None
        named = [item for item in resolved if hasattr(item, "name")]
        # A resolved sequence without any named item yields no name rather
        # than an IndexError.
        return named[0].name if named else None

    @classmethod
    def from_pdfminer(
        cls,
        image: "pdfminer.layout.LTImage",
        page_orientation: PageOrientation,
    ) -> "Image":
        """
        Create an image out of pdfminer object.

        :param image: pdfminer LTImage object.
        :param page_orientation: page orientation data.
        :return: image with ``source`` set to ``"pdfminer"`` and ``image``
            kept as ``raw_object``.
        """
        bbox = create_bbox_backend(
            backend=Backend.PDFMINER,
            coords=image.bbox,
            page_orientation=page_orientation,
        )
        source_width, source_height = image.srcsize
        return cls(
            bbox=bbox,
            source_width=source_width,
            source_height=source_height,
            colorspace_name=cls._pdfminer_colorspace_name(image),
            bpc=image.bits,
            xref=image.stream.objid,
            name=image.name,
            raw_object=image,
            source="pdfminer",
        )

    @classmethod
    def from_pymupdf(
        cls,
        image: MuPDFImage,
        doc: "fitz.fitz.Document",
        page_orientation: PageOrientation,
    ) -> "Image":
        """
        Create an image out of a PyMuPDF image record.

        :param image: mapping as produced by ``get_images_from_pymupdf_page``.
        :param doc: document the record belongs to; stored as
            ``parent_object`` so the image bytes can be extracted on save.
        :param page_orientation: page orientation data.
        :return: image with ``source`` set to ``"mupdf"`` and ``image`` kept
            as ``raw_object``.
        """
        bbox = create_bbox_backend(
            backend=Backend.PYMUPDF,
            coords=image.get("bbox"),
            page_orientation=page_orientation,
        )
        return cls(
            bbox=bbox,
            source_width=image.get("source_width"),
            source_height=image.get("source_height"),
            colorspace_name=image.get("colorspace_name"),
            # Read from the record like every other field.
            bpc=image.get("bpc"),
            xref=image.get("xref"),
            name=image.get("name"),
            raw_object=image,
            source="mupdf",
            parent_object=doc,
        )
class MuPDFImage(TypedDict):
    """
    Image record yielded by ``get_images_from_pymupdf_page``.

    The keys mirror the dictionary built in that function, one record per
    entry of ``page.get_images(full=True)``.
    """

    # NOTE(review): value types follow the PyMuPDF ``Page.get_images``
    # documentation; confirm against the PyMuPDF version in use.
    xref: int
    mask_xref: int
    source_width: int
    source_height: int
    bpc: int
    colorspace_name: str
    name: str
    decode_filter: str
    # Whatever ``page.get_image_bbox`` returned for this entry.
    bbox: Any
def get_images_from_pymupdf_page(page) -> Iterable[MuPDFImage]:
    """
    Lazily produce one record per image listed on a PyMuPDF page.

    Every entry of ``page.get_images(full=True)`` must hold exactly ten
    fields. The entry is handed to ``page.get_image_bbox`` as a tuple and the
    result is stored under ``"bbox"`` next to the fields taken from the
    entry. The alternate colorspace and the referencer xref are read but not
    included in the record.

    :param page: object providing ``get_images`` and ``get_image_bbox``.
    :return: generator of image records.
    """
    for entry in page.get_images(full=True):
        # Normalise to a tuple once; the same tuple is unpacked and passed on.
        fields = tuple(entry)
        (
            xref,
            smask,
            width,
            height,
            bpc,
            colorspace,
            _alt_colorspace,
            name,
            decode_filter,
            _referencer_xref,
        ) = fields
        record = {
            "xref": xref,
            "mask_xref": smask,
            "source_width": width,
            "source_height": height,
            "bpc": bpc,
            "colorspace_name": colorspace,
            "name": name,
            "decode_filter": decode_filter,
            "bbox": page.get_image_bbox(fields),
        }
        yield record