pdfscraper.page

Module Contents

Classes

Page

PageSection

class pdfscraper.page.Page(words, drawings, images, raw_object, blocks)[source]
Parameters
property sorted: List[pdfscraper.layout.text.TextLine][source]
Return type

List[pdfscraper.layout.text.TextLine]

property sorted_lines: Optional[SortedTextlines][source]
Return type

Optional[SortedTextlines]

__repr__()[source]

Return repr(self).

Return type

str

select(condition)[source]

Find content matching condition.

Parameters

condition (Callable) – the condition that content must match to be selected.

Return type

Page

__add__(other, other_position_delta=None)[source]

Create a new page by summing the objects of this page and another page. To concatenate the pages vertically or horizontally, move all objects of the other page by the specified delta.

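Parameters

other – the other page whose objects are added to this page's objects.

other_position_delta – optional delta by which all objects of the other page are moved (defaults to None).
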
Returns

a new Page

Return type

Page

static _split_sequence_by_condition(seq, condition)[source]
split(condition)[source]
Parameters

condition (Callable) –

take_screenshot(area, output_path)[source]
Parameters

area (Tuple[float, float, float, float]) –

classmethod from_pymupdf(page, orientation=None)[source]
Parameters
Return type

Page

classmethod from_pdfminer(page, orientation=None)[source]
Parameters
Return type

Page

class pdfscraper.page.PageSection(words, drawings, images, raw_object, blocks)[source]

Bases: Page

Parameters
words :List[pdfscraper.layout.text.Word] [source]
drawings :List [source]
images :List [source]
condition :str [source]
parent :Page [source]
name :str = [source]