Source code for pdfscraper.layout.annotations

from dataclasses import dataclass
from typing import Dict, List


from pdfscraper.layout.utils import Bbox, create_bbox_backend, Backend, PageOrientation, Rectangular


@dataclass
class PyMuPDFAnnotation:
    """Plain-data snapshot of the attributes of a PyMuPDF annotation.

    Each field stores, unconverted, the value of the corresponding attribute
    read from the annotation object in :meth:`from_annot`.
    """

    border: Dict
    colors: Dict
    flags: int
    has_popup: bool
    info: Dict
    is_open: bool
    line_ends: tuple
    # Holds ``annot.next`` exactly as returned by the backend; it is NOT
    # converted to the unified ``Annotation`` class defined in this module.
    # NOTE(review): presumably another PyMuPDF annotation or None - confirm.
    next_annotation: "fitz.fitz.Annot"
    opacity: float
    popup_rect: tuple
    popup_xref: int
    rect: tuple
    # Filled from ``annot.type``.
    anno_type: tuple
    vertices: list
    xref: int

    @classmethod
    def from_annot(cls, annot: "fitz.fitz.Annot"):
        """Build an instance by copying attributes off a PyMuPDF annotation.

        Args:
            annot: Object exposing the attributes ``border``, ``colors``,
                ``flags``, ``has_popup``, ``info``, ``is_open``,
                ``line_ends``, ``next``, ``opacity``, ``popup_rect``,
                ``popup_xref``, ``rect``, ``type``, ``vertices``, ``xref``.

        Returns:
            A new ``PyMuPDFAnnotation`` holding those values.
        """
        return cls(
            border=annot.border,
            colors=annot.colors,
            flags=annot.flags,
            has_popup=annot.has_popup,
            info=annot.info,
            is_open=annot.is_open,
            line_ends=annot.line_ends,
            next_annotation=annot.next,
            opacity=annot.opacity,
            popup_rect=annot.popup_rect,
            popup_xref=annot.popup_xref,
            rect=annot.rect,
            anno_type=annot.type,
            vertices=annot.vertices,
            xref=annot.xref,
        )


@dataclass
class PDFMinerAnnotation:
    """Annotation fields pulled out of a pdfminer annotation dictionary.

    Built by :meth:`from_annot` from the dictionary keys ``Subj``, ``F``,
    ``C``, ``CreationDate``, ``M``/``ModDate``, ``NM``, ``T``, ``Rect`` and
    ``Contents``.
    """

    subject: str
    flags: int
    color: List
    creation_date: str
    mod_date: str
    name: str
    author: str
    rect: List
    content: str

    # Class-scope import: it has no annotation, so it is a plain class
    # attribute (``cls.pdfminer``) and not a dataclass field. The submodules
    # are imported explicitly because a bare ``import pdfminer`` only binds
    # the package and does not guarantee that ``pdfminer.utils`` and
    # ``pdfminer.pdftypes`` are loaded.
    import pdfminer.pdftypes
    import pdfminer.utils

    @classmethod
    def normalize_value(cls, s):
        """Decode *s* with ``pdfminer.utils.decode_text`` when it is truthy.

        Falsy values (``None``, empty values) are returned unchanged.
        """
        if s:
            return cls.pdfminer.utils.decode_text(s)
        return s

    @classmethod
    def from_annot(cls, annot: Dict):
        """Build an instance from a pdfminer annotation dictionary.

        Args:
            annot: Mapping with the PDF annotation entries. Missing keys are
                tolerated: text entries stay ``None`` (``Contents`` falls
                back to ``""``) and ``F`` falls back to ``0``.

        Returns:
            A new ``PDFMinerAnnotation``. ``content``, ``name``, ``author``,
            ``mod_date``, ``creation_date`` and ``subject`` are passed
            through :meth:`normalize_value`; ``rect`` is passed through
            ``pdfminer.pdftypes.resolve1``.
        """
        # /F is optional in an annotation dictionary; int(None) used to raise
        # TypeError here. Fall back to 0, the default the PDF spec assigns.
        raw_flags = annot.get("F")
        flags = int(raw_flags) if raw_flags is not None else 0

        # The modification date is looked up under "M" first, then "ModDate".
        raw_mod_date = annot.get("M") or annot.get("ModDate")
        rect = cls.pdfminer.pdftypes.resolve1(annot.get("Rect"))

        return cls(
            subject=cls.normalize_value(annot.get("Subj")),
            flags=flags,
            color=annot.get("C"),
            creation_date=cls.normalize_value(annot.get("CreationDate")),
            mod_date=cls.normalize_value(raw_mod_date),
            rect=rect,
            author=cls.normalize_value(annot.get("T")),
            content=cls.normalize_value(annot.get("Contents", "")),
            name=cls.normalize_value(annot.get("NM")),
        )


@dataclass
class Annotation:
    """Backend-independent annotation record.

    Holds the text content, author, modification/creation dates and a
    bounding box produced by ``create_bbox_backend``.
    """

    content: str
    author: str
    mod_date: str
    creation_date: str
    bbox: Bbox

    @classmethod
    def from_pymupdf_annot(
        cls, annot: PyMuPDFAnnotation, page_orientation: PageOrientation
    ):
        """Build an ``Annotation`` from a ``PyMuPDFAnnotation``.

        Reads ``content``, ``title``, ``creationDate`` and ``modDate`` from
        ``annot.info``. The first three default to ``""`` when absent;
        ``modDate`` has no default, so ``mod_date`` is ``None`` when the key
        is missing.

        Args:
            annot: Source annotation; its ``info`` dict and ``rect`` are used.
            page_orientation: Forwarded to ``create_bbox_backend``.
        """
        info = annot.info
        content = info.get("content", '')
        author = info.get("title", '')
        creation_date = info.get("creationDate", '')
        mod_date = info.get("modDate")
        bbox = create_bbox_backend(
            backend=Backend.PYMUPDF,
            coords=annot.rect,
            page_orientation=page_orientation,
        )
        return cls(
            content=content,
            author=author,
            mod_date=mod_date,
            creation_date=creation_date,
            bbox=bbox,
        )

    @classmethod
    def from_pdfminer_annot(
        cls, annot: PDFMinerAnnotation, page_orientation: PageOrientation
    ):
        """Build an ``Annotation`` from a ``PDFMinerAnnotation``.

        Text fields are copied as-is; the bounding box is created from
        ``annot.rect`` via ``create_bbox_backend`` with the PDFMINER backend.

        Args:
            annot: Source annotation.
            page_orientation: Forwarded to ``create_bbox_backend``.
        """
        bbox = create_bbox_backend(
            backend=Backend.PDFMINER,
            coords=annot.rect,
            page_orientation=page_orientation,
        )
        return cls(
            content=annot.content,
            author=annot.author,
            mod_date=annot.mod_date,
            creation_date=annot.creation_date,
            bbox=bbox,
        )