# Source code for pdfscraper.layout.annotations

from dataclasses import dataclass
from typing import Dict, List


from pdfscraper.layout.utils import Bbox, create_bbox_backend, Backend, PageOrientation, Rectangular


@dataclass
class PyMuPDFAnnotation:
    """Plain-data snapshot of the attributes of a PyMuPDF annotation.

    Every field holds the value of the same-named attribute of the source
    annotation, with two exceptions: ``next_annotation`` holds ``annot.next``
    and ``anno_type`` holds ``annot.type``.
    """

    border: Dict
    colors: Dict
    flags: int
    has_popup: bool
    info: Dict
    is_open: bool
    line_ends: tuple
    # Holds ``annot.next`` of the source annotation, i.e. a PyMuPDF object,
    # not this module's unified ``Annotation`` dataclass.
    next_annotation: "fitz.fitz.Annot"
    opacity: float
    popup_rect: tuple
    popup_xref: int
    rect: tuple
    anno_type: tuple
    vertices: list
    xref: int

    @classmethod
    def from_annot(cls, annot: "fitz.fitz.Annot"):
        """Build a snapshot from a PyMuPDF annotation object.

        Args:
            annot: Object exposing the attributes ``border``, ``colors``,
                ``flags``, ``has_popup``, ``info``, ``is_open``,
                ``line_ends``, ``next``, ``opacity``, ``popup_rect``,
                ``popup_xref``, ``rect``, ``type``, ``vertices`` and ``xref``.

        Returns:
            A new ``PyMuPDFAnnotation`` whose fields reference the attribute
            values of ``annot`` (no copies are made).

        Raises:
            AttributeError: If ``annot`` lacks one of the attributes above.
        """
        return cls(
            border=annot.border,
            colors=annot.colors,
            flags=annot.flags,
            has_popup=annot.has_popup,
            info=annot.info,
            is_open=annot.is_open,
            line_ends=annot.line_ends,
            next_annotation=annot.next,
            opacity=annot.opacity,
            popup_rect=annot.popup_rect,
            popup_xref=annot.popup_xref,
            rect=annot.rect,
            anno_type=annot.type,
            vertices=annot.vertices,
            xref=annot.xref,
        )


@dataclass
class PDFMinerAnnotation:
    """Plain-data view of a PDF annotation dictionary obtained via pdfminer.

    The fields are filled by ``from_annot`` from the dictionary keys
    ``Subj``, ``F``, ``C``, ``CreationDate``, ``M``/``ModDate``, ``NM``,
    ``T``, ``Rect`` and ``Contents`` respectively.
    """

    subject: str
    flags: int
    color: List
    creation_date: str
    mod_date: str
    name: str
    author: str
    rect: List
    content: str

    @classmethod
    def normalize_value(cls, s):
        """Decode a raw PDF string value with pdfminer's ``decode_text``.

        Args:
            s: Raw value taken from the annotation dictionary.

        Returns:
            ``s`` unchanged when it is falsy (``None``, ``""``, ``b""``),
            otherwise the result of ``pdfminer.utils.decode_text(s)``.
        """
        if not s:
            return s
        # Import the submodule explicitly: a bare ``import pdfminer`` does not
        # by itself guarantee that ``pdfminer.utils`` has been loaded.
        from pdfminer.utils import decode_text

        return decode_text(s)

    @classmethod
    def from_annot(cls, annot: Dict):
        """Build an instance from a raw annotation dictionary.

        Args:
            annot: Mapping holding the annotation entries (``Subj``, ``F``,
                ``C``, ``CreationDate``, ``M``/``ModDate``, ``Rect``, ``T``,
                ``Contents``, ``NM``). Missing entries are tolerated.

        Returns:
            A new ``PDFMinerAnnotation``. Text entries are passed through
            ``normalize_value``; ``Rect`` is resolved with pdfminer's
            ``resolve1``; ``C`` is stored as found.
        """
        # Explicit submodule import, for the same reason as in normalize_value.
        from pdfminer.pdftypes import resolve1

        # ``F`` is optional; the PDF specification gives it a default of 0,
        # so fall back to that instead of failing on ``int(None)``.
        raw_flags = annot.get("F")
        flags = int(raw_flags) if raw_flags is not None else 0

        content, name, author, mod_date, creation_date, subject = [
            cls.normalize_value(value)
            for value in (
                annot.get("Contents", ""),
                annot.get("NM"),
                annot.get("T"),
                # ``M`` takes precedence; ``ModDate`` is only a fallback.
                annot.get("M") or annot.get("ModDate"),
                annot.get("CreationDate"),
                annot.get("Subj"),
            )
        ]
        return cls(
            subject=subject,
            flags=flags,
            color=annot.get("C"),
            creation_date=creation_date,
            mod_date=mod_date,
            rect=resolve1(annot.get("Rect")),
            author=author,
            content=content,
            name=name,
        )


@dataclass
class Annotation:
    """Annotation record that can be built from either backend's data.

    Holds the text content, author, modification/creation dates and the
    bounding box produced by ``create_bbox_backend``.
    """

    content: str
    author: str
    mod_date: str
    creation_date: str
    bbox: Bbox

    @classmethod
    def from_pymupdf_annot(cls, annot: PyMuPDFAnnotation, page_orientation: PageOrientation):
        """Build an ``Annotation`` from a ``PyMuPDFAnnotation``.

        Args:
            annot: Snapshot whose ``info`` mapping supplies the keys
                ``content``, ``title``, ``creationDate`` and ``modDate`` and
                whose ``rect`` supplies the coordinates.
            page_orientation: Forwarded to ``create_bbox_backend``.

        Returns:
            A new ``Annotation``; text fields missing from ``info`` are set
            to the empty string.
        """
        info = annot.info
        bbox = create_bbox_backend(
            backend=Backend.PYMUPDF, coords=annot.rect, page_orientation=page_orientation
        )

        return cls(
            content=info.get("content", ''),
            # The "title" entry of ``info`` is what this class exposes as author.
            author=info.get("title", ''),
            # Same empty-string fallback as the other text fields, so the
            # attribute matches its declared ``str`` type.
            mod_date=info.get("modDate", ''),
            creation_date=info.get("creationDate", ''),
            bbox=bbox,
        )

    @classmethod
    def from_pdfminer_annot(
        cls, annot: PDFMinerAnnotation, page_orientation: PageOrientation
    ):
        """Build an ``Annotation`` from a ``PDFMinerAnnotation``.

        Args:
            annot: Record whose ``content``, ``author``, ``mod_date`` and
                ``creation_date`` are copied as-is and whose ``rect``
                supplies the coordinates.
            page_orientation: Forwarded to ``create_bbox_backend``.

        Returns:
            A new ``Annotation``.
        """
        bbox = create_bbox_backend(
            backend=Backend.PDFMINER, coords=annot.rect, page_orientation=page_orientation
        )

        return cls(
            content=annot.content,
            author=annot.author,
            mod_date=annot.mod_date,
            creation_date=annot.creation_date,
            bbox=bbox,
        )