Source code for pdfscraper.document

from dataclasses import dataclass
from typing import List, Any

from pdfscraper.layout.utils import Orientation
from pdfscraper.page import Page


@dataclass
class Document:
    """A PDF document represented as a list of pages plus its backend handle.

    Instances are normally built with one of the alternate constructors,
    :meth:`from_pymupdf` or :meth:`from_pdfminer`.
    """

    # One Page per page produced by the backend, in iteration order.
    pages: List[Page]
    # Backend document handle: the object returned by ``fitz.open`` when built
    # via ``from_pymupdf``; ``None`` when built via ``from_pdfminer``.
    doc: Any
    # The orientation that was handed to every Page constructor.
    orientation: Orientation

    @staticmethod
    def _coerce_path(path) -> str:
        """Return *path* as a ``str`` file-system path.

        Accepts ``str`` and ``os.PathLike`` objects (e.g. ``pathlib.Path``).

        Raises:
            TypeError: if *path* is any other type. Previously such input made
                the constructors fall through and silently return ``None``.
        """
        import os

        if isinstance(path, (str, os.PathLike)):
            # fsdecode resolves __fspath__ and decodes bytes-based paths,
            # so the backends always receive a plain str.
            return os.fsdecode(path)
        raise TypeError(
            f"path must be a str or os.PathLike, not {type(path).__name__}"
        )

    @classmethod
    def from_pymupdf(
        cls,
        path,
        # NOTE: this default is evaluated once, when the class body runs, and
        # the same Orientation object is reused by every call that omits it.
        orientation: Orientation = Orientation.create(
            bottom_is_zero=False, left_is_zero=True
        ),
    ) -> "Document":
        """Open the PDF at *path* with PyMuPDF and build a Document from it.

        Args:
            path: location of the PDF file, as ``str`` or ``os.PathLike``.
            orientation: passed unchanged to ``Page.from_pymupdf`` for every
                page and stored on the resulting Document.

        Returns:
            A Document whose ``doc`` attribute is the opened ``fitz`` document
            (it is left open and kept on the instance).

        Raises:
            TypeError: if *path* is neither ``str`` nor ``os.PathLike``.
        """
        filename = cls._coerce_path(path)

        # Imported lazily so PyMuPDF is only required when this backend is used.
        import fitz

        doc = fitz.open(filename)
        return cls(
            pages=[
                Page.from_pymupdf(page, orientation=orientation) for page in doc
            ],
            doc=doc,
            orientation=orientation,
        )

    @classmethod
    def from_pdfminer(
        cls,
        path,
        # NOTE: this default is evaluated once, when the class body runs, and
        # the same Orientation object is reused by every call that omits it.
        orientation: Orientation = Orientation.create(
            bottom_is_zero=False, left_is_zero=True
        ),
    ) -> "Document":
        """Read the PDF at *path* with pdfminer and build a Document from it.

        Args:
            path: location of the PDF file, as ``str`` or ``os.PathLike``.
            orientation: passed unchanged to ``Page.from_pdfminer`` for every
                page and stored on the resulting Document.

        Returns:
            A Document whose ``doc`` attribute is ``None``; no backend handle
            is retained for this backend.

        Raises:
            TypeError: if *path* is neither ``str`` nor ``os.PathLike``.
        """
        filename = cls._coerce_path(path)

        # Imported lazily so pdfminer is only required when this backend is used.
        from pdfminer.high_level import extract_pages

        return cls(
            pages=[
                Page.from_pdfminer(page, orientation=orientation)
                for page in extract_pages(filename)
            ],
            doc=None,
            orientation=orientation,
        )

    def create_sections(self):
        """Placeholder: not implemented yet; does nothing and returns ``None``."""
        pass