from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from pdfscraper.layout.utils import Bbox, create_bbox_backend, Backend, PageOrientation, Rectangular
@dataclass
class PyMuPDFAnnotation:
    """Plain snapshot of the attributes read from a PyMuPDF annotation.

    Each field holds the value of the same-named attribute on the source
    annotation object, except ``next_annotation`` (read from ``annot.next``)
    and ``anno_type`` (read from ``annot.type``).
    """

    # NOTE(review): field order follows the keyword order used in
    # ``from_annot``; the annotated types (other than ``next_annotation``)
    # are inferred from the attribute names -- TODO confirm against the
    # PyMuPDF ``Annot`` API.
    border: Dict
    colors: Dict
    flags: int
    has_popup: bool
    info: Dict
    is_open: bool
    line_ends: Any
    next_annotation: "Annotation"
    opacity: float
    popup_rect: Any
    popup_xref: int
    rect: Any
    anno_type: Any
    vertices: Optional[List]
    xref: int

    @classmethod
    def from_annot(cls, annot: "fitz.fitz.Annot"):
        """Build an instance by copying attributes off ``annot``.

        Args:
            annot: Object exposing the attributes ``border``, ``colors``,
                ``flags``, ``has_popup``, ``info``, ``is_open``,
                ``line_ends``, ``opacity``, ``next``, ``popup_rect``,
                ``popup_xref``, ``rect``, ``type``, ``vertices`` and
                ``xref``.

        Returns:
            A new instance holding those values unchanged.
        """
        return cls(
            border=annot.border,
            colors=annot.colors,
            flags=annot.flags,
            has_popup=annot.has_popup,
            info=annot.info,
            is_open=annot.is_open,
            line_ends=annot.line_ends,
            # Stored under different names than the source attributes.
            next_annotation=annot.next,
            opacity=annot.opacity,
            popup_rect=annot.popup_rect,
            popup_xref=annot.popup_xref,
            rect=annot.rect,
            anno_type=annot.type,
            vertices=annot.vertices,
            xref=annot.xref,
        )
@dataclass
class PDFMinerAnnotation:
    """Values extracted from a pdfminer annotation dictionary.

    ``from_annot`` reads the dictionary keys ``Subj``, ``F``, ``C``,
    ``CreationDate``, ``M``/``ModDate``, ``Rect``, ``T``, ``Contents`` and
    ``NM`` and stores them under the field names below.
    """

    # Imported in the class body so the package remains reachable as
    # ``cls.pdfminer``. The submodules are imported explicitly because a bare
    # ``import pdfminer`` does not guarantee that ``pdfminer.utils`` and
    # ``pdfminer.pdftypes`` are loaded.
    import pdfminer.pdftypes
    import pdfminer.utils

    # NOTE(review): field order follows the keyword order used in
    # ``from_annot``; the annotated types are inferred from how the values
    # are produced there -- TODO confirm.
    subject: Optional[str]
    flags: Optional[int]
    color: Any
    creation_date: Optional[str]
    mod_date: Optional[str]
    rect: Any
    author: Optional[str]
    content: Optional[str]
    name: Optional[str]

    @classmethod
    def normalize_value(cls, s):
        """Decode ``s`` with pdfminer's ``decode_text`` when it is truthy.

        Falsy values (``None``, empty strings/bytes) are returned unchanged.
        """
        # presumably ``s`` is a raw PDF byte string here -- verify against caller
        if s:
            return cls.pdfminer.utils.decode_text(s)
        return s

    @classmethod
    def from_annot(cls, annot: Dict):
        """Build an instance from a pdfminer annotation dictionary.

        Args:
            annot: Mapping of annotation keys to values.

        Returns:
            A new instance. Text entries are passed through
            ``normalize_value``; ``flags`` is ``None`` when the ``F`` key
            is absent.
        """
        # ``F`` may be missing; int(None) would raise TypeError.
        raw_flags = annot.get("F")
        flags = int(raw_flags) if raw_flags is not None else None

        # ``Rect`` may be an indirect reference, hence resolve1.
        rect = cls.pdfminer.pdftypes.resolve1(annot.get("Rect"))

        text_values = (
            annot.get("Contents", ""),
            annot.get("NM"),
            annot.get("T"),
            annot.get("M") or annot.get("ModDate"),
            annot.get("CreationDate"),
            annot.get("Subj"),
        )
        content, name, author, mod_date, creation_date, subject = [
            cls.normalize_value(value) for value in text_values
        ]

        return cls(
            subject=subject,
            flags=flags,
            color=annot.get("C"),
            creation_date=creation_date,
            mod_date=mod_date,
            rect=rect,
            author=author,
            content=content,
            name=name,
        )
@dataclass
class Annotation:
    """Backend-independent annotation: text metadata plus a bounding box.

    Instances are built from a backend-specific annotation object through
    ``from_pymupdf_annot`` or ``from_pdfminer_annot``.
    """

    # NOTE(review): field order follows the keyword order used by the two
    # factory methods; types are inferred from the values passed there --
    # TODO confirm (in particular that ``create_bbox_backend`` returns a
    # ``Bbox``).
    content: Optional[str]
    author: Optional[str]
    mod_date: Optional[str]
    creation_date: Optional[str]
    bbox: Bbox

    @classmethod
    def from_pymupdf_annot(cls, annot: PyMuPDFAnnotation, page_orientation: PageOrientation):
        """Build an annotation from a ``PyMuPDFAnnotation``.

        Args:
            annot: Source annotation; its ``info`` mapping supplies the text
                metadata and its ``rect`` supplies the coordinates.
            page_orientation: Forwarded to ``create_bbox_backend``.

        Returns:
            A new instance. ``content``, ``author`` and ``creation_date``
            default to ``''`` when missing from ``info``; ``mod_date``
            defaults to ``None``.
        """
        info = annot.info
        bbox = create_bbox_backend(
            backend=Backend.PYMUPDF, coords=annot.rect, page_orientation=page_orientation
        )
        return cls(
            content=info.get("content", ''),
            author=info.get("title", ''),
            mod_date=info.get("modDate"),
            creation_date=info.get("creationDate", ''),
            bbox=bbox,
        )

    @classmethod
    def from_pdfminer_annot(
        cls, annot: PDFMinerAnnotation, page_orientation: PageOrientation
    ):
        """Build an annotation from a ``PDFMinerAnnotation``.

        Args:
            annot: Source annotation; its ``content``, ``author``,
                ``mod_date`` and ``creation_date`` are copied as-is and its
                ``rect`` supplies the coordinates.
            page_orientation: Forwarded to ``create_bbox_backend``.

        Returns:
            A new instance.
        """
        bbox = create_bbox_backend(
            backend=Backend.PDFMINER, coords=annot.rect, page_orientation=page_orientation
        )
        return cls(
            content=annot.content,
            author=annot.author,
            mod_date=annot.mod_date,
            creation_date=annot.creation_date,
            bbox=bbox,
        )