# Source code for pdfscraper.layout.text

# from __future__ import annotations

import itertools
from typing import Union, List, Tuple, Callable

import unicodedata

from pdfscraper.layout.utils import Bbox, create_bbox_backend, Backend, PageOrientation, Rectangular, group_objs
from pdfscraper.layout.utils import (
    get_leftmost,
    get_rightmost,
    get_topmost,
    get_bottommost,
)


class TextLine(Rectangular):
    """
    A horizontal line of text.

    The bounding box spans from ``(x0, y0)`` of the first word to
    ``(x1, y1)`` of the last word. A line without words gets the placeholder
    box ``Bbox(-1, -1, -1, -1)``.
    """

    def __init__(self, words):
        self.words = words
        if words:
            first_box, last_box = words[0].bbox, words[-1].bbox
            self.bbox = Bbox(first_box.x0, first_box.y0, last_box.x1, last_box.y1)
        else:
            self.bbox = Bbox(-1, -1, -1, -1)

    @property
    def text(self):
        """Return the string forms of the words joined by single spaces."""
        return " ".join(str(word) for word in self.words)

    def __getitem__(self, key):
        """Index (or slice) directly into the underlying word sequence."""
        return self.words[key]

    def __bool__(self):
        """A line is truthy only when it holds at least one word."""
        return bool(self.words)

    def __str__(self):
        return self.text

    def __repr__(self):
        # Build the word list first and interpolate everything in one f-string,
        # so no part of the already-formatted text is re-read as a %-template.
        words = ",\n ".join(repr(word) for word in self.words)
        return f"TextLine(bbox={self.bbox}, words=\n[{words}])"

    def __contains__(self, text):
        """Substring test against the joined text of the line."""
        return text in self.text
class SortedTextlines:
    """
    A set of words together with the text lines they are grouped into.

    ``origin`` is an opaque value supplied by the creator; it is stored as-is
    and passed on to every object derived from this one.
    """

    def __init__(self, textlines: List["TextLine"], words, origin=None):
        self.textlines = textlines
        self.words = words
        self.origin = origin

    def select(self, condition: Callable, retain_empty_lines=False) -> "SortedTextlines":
        """
        Find content matching condition.

        :param condition: predicate called with a single word; words for which
            it returns a truthy value are kept
        :param retain_empty_lines: when false (the default), lines left with
            no matching words are dropped from the result
        :return: a new SortedTextlines holding the filtered words and lines,
            with the same ``origin``
        """
        words = [word for word in self.words if condition(word)]
        textlines = [
            TextLine([word for word in textline if condition(word)])
            for textline in self.textlines
        ]
        if not retain_empty_lines:
            textlines = [line for line in textlines if line]
        return SortedTextlines(textlines=textlines, words=words, origin=self.origin)

    def resort(self):
        """
        Regroup ``self.words`` into lines using ``group_objs``.

        :return: a new SortedTextlines over the same words, with the same
            ``origin`` (kept so that it behaves like ``select``)
        """
        textlines = [TextLine(line) for line in group_objs(self.words)]
        return SortedTextlines(textlines=textlines, words=self.words, origin=self.origin)

    def __repr__(self) -> str:
        return "Textlines: %s" % "".join(repr(line) + "\n" for line in self.textlines)
class Word(Rectangular):
    """
    A text string representing one word.

    It's generated from a line of text by splitting on a space.
    Two words are equal when both their text and their bbox are equal; the
    hash is derived from ``repr``, which is built from those same two fields.
    """

    # __slots__ = ("text", "bbox", "font", "size", "color")
    def __init__(
        self,
        bbox: Bbox,
        text: str = "",
        font: str = "",
        size: str = "",
        color=None,
        normalize_text=False,
    ):
        """
        :param bbox: bounding box of the word
        :param text: the word's text
        :param font: font name
        :param size: font size
        :param color: text color, if known
        :param normalize_text: when true, soft hyphens (U+00AD) are replaced
            by ``-`` and the text is NFKD-normalized
        """
        if normalize_text:
            text = unicodedata.normalize("NFKD", text.replace("\xad", "-"))
        self.text = text
        self.bbox = bbox
        self.font = font
        self.size = size
        self.color = color

    def __hash__(self) -> int:
        return hash(repr(self))

    def __repr__(self) -> str:
        return f'Word(text="{self.text}",bbox={self.bbox})'

    def __eq__(self, other) -> bool:
        try:
            other_key = (other.text, other.bbox)
        except AttributeError:
            # Not word-like: let Python try the reflected comparison and fall
            # back to "not equal" instead of raising from ``==``.
            return NotImplemented
        return (self.text, self.bbox) == other_key

    def __str__(self) -> str:
        return self.text
class Span(Rectangular):
    """
    A collection of words.

    Both constructors split the characters into alternating runs of
    whitespace (space or non-breaking space) and non-whitespace. Every run,
    whitespace runs included, becomes one ``Word``, which is why ``text``
    joins the words with an empty separator.
    """

    __slots__ = ("words", "bbox")

    def __init__(self, bbox: Bbox, words: List[Word] = None):
        # Fall back to an empty list so ``text`` and ``repr`` work for a span
        # created without words.
        self.words = words if words is not None else []
        self.bbox = bbox

    @property
    def text(self):
        """Return the concatenated text of all words (no separator added)."""
        return "".join(word.text for word in self.words)

    def __repr__(self):
        return "Span <%s> %s" % ([round(i) for i in self.bbox], self.words)

    @classmethod
    def from_pymupdf(cls, span: dict, page_orientation: PageOrientation) -> "Span":
        """
        Convert a PyMuPDF span dictionary into a Span.

        @param span: dict providing ``"chars"`` (each with ``"c"`` and
            ``"bbox"``), ``"font"``, ``"size"`` and ``"color"``
        @param page_orientation: forwarded to ``create_bbox_backend``
        """
        runs = [
            list(group)
            for _, group in itertools.groupby(
                span["chars"], key=lambda char: char["c"] not in (" ", "\xa0")
            )
        ]
        new_words = []
        for run in runs:
            first_box, last_box = run[0]["bbox"], run[-1]["bbox"]
            x0, y0 = get_leftmost(first_box), get_topmost(first_box)
            x1, y1 = get_rightmost(last_box), get_bottommost(last_box)
            text = "".join(char["c"] for char in run)
            # mupdf has top as zero and left as zero by default
            bbox = create_bbox_backend(
                backend=Backend.PYMUPDF,
                coords=(x0, y0, x1, y1),
                page_orientation=page_orientation,
            )
            new_words.append(
                Word(
                    text=text,
                    bbox=bbox,
                    font=span["font"],
                    size=span["size"],
                    color=span["color"],
                    normalize_text=True,
                )
            )
        return cls(words=new_words, bbox=get_span_bbox(new_words))

    @classmethod
    def from_pdfminer(cls, span: List["pdfminer.layout.LTChar"], page_orientation: PageOrientation) -> "Span":
        """
        Convert a list of pdfminer characters into a Span.

        Split a list by space into Words. ``LTAnno`` items are ignored: they
        are inserted by pdfminer's layout analysis and have no coordinates or
        font, so they cannot contribute to a word's box.

        @param span: list of characters
        @param page_orientation: forwarded to ``create_bbox_backend``
        """
        # Import the submodule explicitly; a bare ``import pdfminer`` does not
        # guarantee that ``pdfminer.layout`` is loaded.
        from pdfminer.layout import LTAnno

        runs = [
            list(group)
            for _, group in itertools.groupby(
                span, key=lambda char: char.get_text() not in (" ", "\xa0")
            )
        ]
        new_words = []
        for run in runs:
            chars = [char for char in run if not isinstance(char, LTAnno)]
            if not chars:
                continue
            first, last = chars[0], chars[-1]
            # pdfminer has the zero of the y axis at the bottom of the page;
            # the backend is passed along so the bbox helper can account for
            # it (NOTE(review): conversion happens in create_bbox_backend,
            # which is not visible here - confirm).
            bbox = create_bbox_backend(
                backend=Backend.PDFMINER,
                coords=(first.x0, first.y0, last.x1, last.y1),
                page_orientation=page_orientation,
            )
            new_words.append(
                Word(
                    text="".join(char.get_text() for char in chars),
                    bbox=bbox,
                    font=first.fontname,
                    size=first.size,
                    color=None,
                    normalize_text=True,
                )
            )
        return cls(words=new_words, bbox=get_span_bbox(new_words))
class Line:
    """
    A bounding box together with the spans it contains.
    """

    __slots__ = ("bbox", "spans")

    def __init__(self, bbox: "Bbox", spans):
        self.bbox = bbox
        self.spans = spans

    def __repr__(self):
        # Wrap in a 1-tuple: a bare ``% self.spans`` would unpack a tuple of
        # spans as separate format arguments and fail or misprint.
        return "Line: %s" % (self.spans,)

    @property
    def text(self):
        """Return the text of all spans joined by single spaces."""
        return " ".join(span.text for span in self.spans)
class Block:
    """
    A collection of lines with a bounding box.
    """

    __slots__ = ("bbox", "lines")

    def __init__(self, bbox: "Bbox", lines):
        self.bbox = bbox
        self.lines = lines

    def __repr__(self):
        # Wrap in a 1-tuple so a tuple of lines is printed as a whole instead
        # of being unpacked as %-format arguments.
        return "Block: %s" % (self.lines,)
def get_span_bbox(span: List) -> Bbox:
    """
    Calculate bounding box for a span.

    Both ``x0`` and ``x1`` (and both ``y0`` and ``y1``) of every item are
    considered for each extreme, so the result does not depend on which
    corner of an item's box holds the smaller coordinate.

    :param span: items exposing a ``bbox`` with ``x0``, ``y0``, ``x1``, ``y1``
    :return: Bbox whose ``x0``/``y0`` are the smallest and ``x1``/``y1`` the
        largest coordinates found; for an empty span, the placeholder
        ``Bbox(-1, -1, -1, -1)`` that ``TextLine`` also uses for empty input
    """
    boxes = [item.bbox for item in span]
    if not boxes:
        # min()/max() would raise an unhelpful ValueError on empty input.
        return Bbox(x0=-1, y0=-1, x1=-1, y1=-1)
    xs = [x for box in boxes for x in (box.x0, box.x1)]
    ys = [y for box in boxes for y in (box.y0, box.y1)]
    return Bbox(x0=min(xs), y0=min(ys), x1=max(xs), y1=max(ys))
def line2str(line: List["Word"]) -> str:
    """Return the string forms of the items in ``line`` joined by single spaces."""
    pieces = (str(item) for item in line)
    return " ".join(pieces)