# Source code for elm.web.document

# -*- coding: utf-8 -*-
"""ELM Web Document class definitions"""
import re
from abc import ABC, abstractmethod
from copy import deepcopy
from functools import cached_property
import logging

import pandas as pd

from elm.utilities.parse import (
    combine_pages,
    clean_headers,
    html_to_text,
    remove_blank_pages,
    format_html_tables,
    read_pdf,
    read_pdf_ocr,
    replace_common_pdf_conversion_chars,
    replace_multi_dot_lines,
    remove_empty_lines_or_page_footers,
)


logger = logging.getLogger(__name__)



# [docs]
class BaseDocument(ABC):
    """Base ELM web document representation

    Purpose:
        Track document content and perform minor processing on it.
    Responsibilities:
        1. Store "raw" document text.
        2. Compute "cleaned" text, which combines pages, strips HTML,
           and formats tables.
        3. Track pages and other document metadata.
    Key Relationships:
        Created by :class:`~elm.web.file_loader.AsyncFileLoader` and
        used all over ordinance code.

    .. end desc
    """

    def __init__(self, pages, attrs=None):
        """

        Parameters
        ----------
        pages : iterable
            Iterable of strings, where each string is a page of a
            document. The pages are passed through
            :func:`~elm.utilities.parse.remove_blank_pages` before being
            stored.
        attrs : dict, optional
            Optional dict containing metadata for the document.
            By default, ``None``, which is stored as an empty dict.
        """
        self.pages = remove_blank_pages(pages)
        self.attrs = attrs or {}

    def __repr__(self):
        header = (f"{self.__class__.__name__} with {len(self.pages):,} "
                  "pages\nAttrs:")
        if not self.attrs:
            return f"{header} None"

        # Keys are converted to ``str`` up front so that non-string keys
        # (e.g. ints) neither break the width computation nor the
        # right-aligned formatting below. DataFrames are summarized by
        # their row count instead of being rendered in full.
        rows = []
        for key, value in self.attrs.items():
            if isinstance(value, pd.DataFrame):
                value = f"DataFrame with {len(value):,} rows"
            rows.append((str(key), value))

        # Right-align all keys to the widest one (plus a small margin)
        indent = max(len(key) for key, __ in rows) + 2
        attrs = "\n".join([f"{key:>{indent}}:\t{value}"
                           for key, value in rows])
        return f"{header}\n{attrs}"

    @property
    def empty(self):
        """bool: ``True`` if no line of the cleaned text is longer than
        10 characters and contains at least one ASCII letter."""
        return not any(_non_empty_pages(self.text.split("\n")))

    @cached_property
    def raw_pages(self):
        """list: List of (a limited count of) raw pages

        An empty list is returned without consulting the subclass
        implementation if the document has no pages.
        """
        if not self.pages:
            return []

        return self._raw_pages()

    @cached_property
    def text(self):
        """str: Cleaned text from document

        An empty string is returned without consulting the subclass
        implementation if the document has no pages.
        """
        if not self.pages:
            return ""

        return self._cleaned_text()

    @abstractmethod
    def _raw_pages(self):
        """Get raw pages from document"""
        raise NotImplementedError(
            "This document does not implement a raw pages extraction function"
        )

    @abstractmethod
    def _cleaned_text(self):
        """Compute cleaned text from document"""
        raise NotImplementedError(
            "This document does not implement a pages cleaning function"
        )

    @property
    @abstractmethod
    def WRITE_KWARGS(self):
        """dict: Dict of kwargs to pass to `open` when writing this doc."""
        raise NotImplementedError

    @property
    @abstractmethod
    def FILE_EXTENSION(self):
        """str: Cleaned document file extension."""
        raise NotImplementedError




# [docs]
class PDFDocument(BaseDocument):
    """ELM web PDF document"""

    CLEAN_HEADER_KWARGS = {
        "char_thresh": 0.6,
        "page_thresh": 0.8,
        "split_on": "\n",
        "iheaders": [0, 1, 3, -3, -2, -1],
    }
    """Default :func:`~elm.utilities.parse.clean_headers` arguments"""
    WRITE_KWARGS = {"mode": "wb"}
    FILE_EXTENSION = "pdf"

    def __init__(
        self,
        pages,
        attrs=None,
        percent_raw_pages_to_keep=25,
        max_raw_pages=18,
        num_end_pages_to_keep=2,
        clean_header_kwargs=None,
    ):
        """

        Parameters
        ----------
        pages : iterable
            Iterable of strings, where each string is a page of a
            document.
        attrs : dict, optional
            Optional dict containing metadata for the document.
            By default, ``None``.
        percent_raw_pages_to_keep : int, optional
            Percent of "raw" pages to keep. Useful for extracting info
            from headers/footers of a doc, which are normally stripped
            to form the "clean" text. By default, ``25``.
        max_raw_pages : int, optional
            Upper limit on the number of raw pages kept from the start
            of the document. The stored value is capped at the number
            of pages in the document. The total raw page count never
            exceeds this value + `num_end_pages_to_keep`.
            By default, ``18``.
        num_end_pages_to_keep : int, optional
            Number of additional pages to keep from the end of the
            document. This can be useful to extract more meta info.
            The total raw page count never exceeds this value +
            `max_raw_pages`. By default, ``2``.
        clean_header_kwargs : dict, optional
            Optional dictionary of keyword-value pair arguments to pass
            to the :func:`~elm.utilities.parse.clean_headers`
            function. These override the class-level
            ``CLEAN_HEADER_KWARGS`` defaults. By default, ``None``.
        """
        super().__init__(pages, attrs=attrs)
        self.percent_raw_pages_to_keep = percent_raw_pages_to_keep
        self.max_raw_pages = min(len(self.pages), max_raw_pages)
        self.num_end_pages_to_keep = num_end_pages_to_keep

        # Start from a private copy of the class defaults so that
        # per-instance overrides never leak into other documents
        header_kwargs = deepcopy(self.CLEAN_HEADER_KWARGS)
        if clean_header_kwargs:
            header_kwargs.update(clean_header_kwargs)
        self.clean_header_kwargs = header_kwargs

    @cached_property
    def num_raw_pages_to_keep(self):
        """int: Number of raw pages to keep from PDF document"""
        fraction = self.percent_raw_pages_to_keep / 100
        requested = int(fraction * len(self.pages))
        # Always ask for at least one page, but never more than the cap
        at_least_one = max(1, requested)
        return min(self.max_raw_pages, at_least_one)

    @cached_property
    def _last_page_index(self):
        """int: last page index (determines how many end pages to include)"""
        # Non-positive count of pages not already covered by the leading
        # raw pages, expressed as a negative offset from the end
        uncovered = self.num_raw_pages_to_keep - len(self.pages)
        tail_start = max(-self.num_end_pages_to_keep, uncovered)
        return min(0, tail_start)

    def _cleaned_text(self):
        """Compute cleaned text from document"""
        # Work on a copy so the stored pages are left untouched
        stripped = clean_headers(
            deepcopy(self.pages), **self.clean_header_kwargs
        )
        cleaned = replace_common_pdf_conversion_chars(combine_pages(stripped))
        cleaned = replace_multi_dot_lines(cleaned)
        return remove_empty_lines_or_page_footers(cleaned)

    # fmt: off
    def _raw_pages(self):
        """Get raw pages from document"""
        kept = list(self.pages[:self.num_raw_pages_to_keep])
        tail_start = self._last_page_index
        if not tail_start:
            # Index 0 means there are no extra end pages to append
            return kept

        kept.extend(self.pages[tail_start:])
        return kept


    # [docs]
    @classmethod
    def from_file(cls, fp, **init_kwargs):
        """Initialize a PDFDocument object from a .pdf file on disk. This
        method will try to use pdftotext (a poppler utility) and then
        OCR with pytesseract.

        Parameters
        ----------
        fp : str
            filepath to .pdf on disk
        init_kwargs : dict
            Optional kwargs for PDFDocument Initialization

        Returns
        -------
        out : PDFDocument
            Initialized PDFDocument class from input fp

        Raises
        ------
        RuntimeError
            If neither the direct text extraction nor the OCR fallback
            yields at least one page with more than 10 characters that
            contains a letter.
        """
        # Read the file a single time; both extraction attempts below
        # operate on the same bytes.
        with open(fp, 'rb') as f:
            pdf_bytes = f.read()

        pages = list(_non_empty_pages(read_pdf(pdf_bytes)))
        if pages:
            return cls(pages, **init_kwargs)

        # No page passed the non-empty filter (more than 10 chars and at
        # least one letter), so fall back to OCR. Typical scanned
        # document only has weird ascii per page.
        pages = list(_non_empty_pages(read_pdf_ocr(pdf_bytes)))
        if not pages:
            msg = f'Could not get text from pdf: {fp}'
            logger.error(msg)
            raise RuntimeError(msg)

        return cls(pages, **init_kwargs)





# [docs]
class HTMLDocument(BaseDocument):
    """ELM web HTML document"""

    HTML_TABLE_TO_MARKDOWN_KWARGS = {
        "floatfmt": ".5f",
        "index": True,
        "tablefmt": "psql",
    }
    """Default :func:`~elm.utilities.parse.format_html_tables` arguments"""
    WRITE_KWARGS = {"mode": "w", "encoding": "utf-8"}
    FILE_EXTENSION = "txt"

    def __init__(
        self,
        pages,
        attrs=None,
        html_table_to_markdown_kwargs=None,
        ignore_html_links=True,
        text_splitter=None,
    ):
        """

        Parameters
        ----------
        pages : iterable
            Iterable of strings, where each string is a page of a
            document.
        attrs : dict, optional
            Optional dict containing metadata for the document.
            By default, ``None``.
        html_table_to_markdown_kwargs : dict, optional
            Optional dictionary of keyword-value pair arguments to pass
            to the :func:`~elm.utilities.parse.format_html_tables`
            function. These override the class-level
            ``HTML_TABLE_TO_MARKDOWN_KWARGS`` defaults.
            By default, ``None``.
        ignore_html_links : bool, optional
            Option to ignore link in HTML text during parsing.
            By default, ``True``.
        text_splitter : obj, optional
            Instance of an object that implements a `split_text` method.
            The method should take text as input (str) and return a list
            of text chunks. The pages are joined with a blank line and
            passed through this splitter to create the raw pages for
            this document. Langchain's text splitters should work for
            this input. By default, ``None``, which means the original
            pages input becomes the raw pages attribute.
        """
        super().__init__(pages, attrs=attrs)
        self.text_splitter = text_splitter
        self.ignore_html_links = ignore_html_links

        # Start from a private copy of the class defaults so that
        # per-instance overrides never leak into other documents
        table_kwargs = deepcopy(self.HTML_TABLE_TO_MARKDOWN_KWARGS)
        if html_table_to_markdown_kwargs:
            table_kwargs.update(html_table_to_markdown_kwargs)
        self.html_table_to_markdown_kwargs = table_kwargs

    def _cleaned_text(self):
        """Compute cleaned text from document"""
        plain = html_to_text(combine_pages(self.pages), self.ignore_html_links)
        return format_html_tables(plain, **self.html_table_to_markdown_kwargs)

    def _raw_pages(self):
        """Get raw pages from document"""
        splitter = self.text_splitter
        if splitter is not None:
            joined = "\n\n".join(self.pages)
            return splitter.split_text(joined)

        # Without a splitter the stored pages are handed back as-is
        return self.pages



def _non_empty_pages(pages):
    """Lazily keep pages that have more than 10 chars and contain at
    least one ASCII letter"""

    def _has_content(page):
        # A page needs both a letter and enough length to count
        return re.search('[a-zA-Z]', page) and len(page) > 10

    return filter(_has_content, pages)