# Source code for cardinal_pythonlib.pdf

#!/usr/bin/env python
# cardinal_pythonlib/pdf.py

"""
===============================================================================

    Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of cardinal_pythonlib.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        https://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.

===============================================================================

**Support functions to generate (and serve) PDFs.**

"""

import getpass
import io
import logging
import os
from pprint import pformat
import shutil
import sys
import tempfile
from typing import Any, Dict, Iterable, Union

from pypdf import PdfReader, PdfWriter
from semantic_version import Version

from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler


# =============================================================================
# Conditional/optional imports
# =============================================================================

log = get_brace_style_log_with_null_handler(__name__)

# Pre-bind every optional engine name to None, so that each name exists at
# module level even when its import below is skipped or fails.
pdfkit = None
xhtml2pdf = None
weasyprint = None

# Preference 1
try:
    log.debug("trying pdfkit...")
    # noinspection PyPackageRequirements
    import pdfkit  # sudo apt-get install wkhtmltopdf; sudo pip install pdfkit

    log.debug("pdfkit: loaded")
except ImportError:
    pdfkit = None
    log.debug("pdfkit: failed to load")

# The remaining engines are only attempted if pdfkit could not be imported.
# Consequently, when pdfkit is present, xhtml2pdf and weasyprint stay None
# (and are reported as unavailable) regardless of whether they are installed.
if pdfkit:
    log.debug("pdfkit found, so skipping other PDF rendering engines")
else:
    try:
        # noinspection PyPackageRequirements
        import xhtml2pdf  # pip install xhtml2pdf

        # noinspection PyPackageRequirements
        import xhtml2pdf.document  # pip install xhtml2pdf

        log.debug("xhtml2pdf: loaded")
    except ImportError:
        xhtml2pdf = None
        log.debug("xhtml2pdf: failed to load")

    try:
        log.debug("trying weasyprint...")
        # noinspection PyPackageRequirements
        import weasyprint

        log.debug("weasyprint: loaded")
    except ImportError:
        weasyprint = None
        log.debug("weasyprint: failed to load")

# =============================================================================
# Onwards
# =============================================================================

# Import-time guard: importing this module raises RuntimeError if none of the
# three engines could be imported.
if not any([xhtml2pdf, weasyprint, pdfkit]):
    raise RuntimeError(
        "No PDF engine (xhtml2pdf, weasyprint, pdfkit) "
        "available; can't load"
    )


class Processors:
    """
    Namespace of string constants, one per PDF processor that this module
    knows about.

    The values are plain strings and are what the ``processor`` arguments
    elsewhere in this module expect.
    """

    # One constant per supported rendering engine.
    XHTML2PDF = "xhtml2pdf"
    WEASYPRINT = "weasyprint"
    PDFKIT = "pdfkit"


# Location of the ``wkhtmltopdf`` executable as found by shutil.which(), which
# yields None when no such executable is on the search path.
_WKHTMLTOPDF_FILENAME = shutil.which("wkhtmltopdf")

# Default processor: the first engine, in order of preference, that was
# successfully imported above.
_DEFAULT_PROCESSOR = (
    Processors.PDFKIT  # the best
    if pdfkit
    else Processors.WEASYPRINT  # imperfect tables
    if weasyprint
    else Processors.XHTML2PDF  # simple/slow
)


# =============================================================================
# PdfPlan
# =============================================================================


class PdfPlan(object):
    """
    Class to describe a PDF on disk or the information required to create the
    PDF from HTML.
    """

    def __init__(
        self,
        # HTML mode
        is_html: bool = False,
        html: str = None,
        header_html: str = None,
        footer_html: str = None,
        wkhtmltopdf_filename: str = None,
        wkhtmltopdf_options: Dict[str, Any] = None,
        # Filename mode
        is_filename: bool = False,
        filename: str = None,
    ):
        """
        Args:
            is_html: use HTML mode?
            html: for HTML mode, the main HTML
            header_html: for HTML mode, an optional page header (in HTML)
            footer_html: for HTML mode, an optional page footer (in HTML)
            wkhtmltopdf_filename: filename of the ``wkhtmltopdf`` executable
            wkhtmltopdf_options: options for ``wkhtmltopdf``
            is_filename: use file mode?
            filename: for file mode, the filename of the existing PDF on disk

        Use either ``is_html`` or ``is_filename``, not both.

        Raises:
            AssertionError: if ``is_html`` and ``is_filename`` are both set
                or both unset
        """
        if is_html == is_filename:
            # Raised explicitly (rather than via an ``assert`` statement) so
            # that the check is not stripped when Python runs with ``-O``.
            raise AssertionError("Specify is_html XOR is_filename")
        self.is_html = is_html
        # is_html options:
        self.html = html
        self.header_html = header_html
        self.footer_html = footer_html
        self.wkhtmltopdf_filename = wkhtmltopdf_filename
        self.wkhtmltopdf_options = wkhtmltopdf_options

        self.is_filename = is_filename
        # is_filename options
        self.filename = filename

    def add_to_writer(
        self, writer: PdfWriter, start_recto: bool = True
    ) -> None:
        """
        Add the PDF described by this class to a PDF writer.

        In HTML mode, the HTML is rendered to a PDF in memory and appended.
        In filename mode, the pages of the PDF on disk are appended.

        Args:
            writer: a :class:`pypdf.PdfWriter`
            start_recto: start a new right-hand page?

        Raises:
            AssertionError: if the object is in neither HTML nor filename
                mode
        """
        if self.is_html:
            pdf = get_pdf_from_html(
                html=self.html,
                header_html=self.header_html,
                footer_html=self.footer_html,
                wkhtmltopdf_filename=self.wkhtmltopdf_filename,
                wkhtmltopdf_options=self.wkhtmltopdf_options,
            )
            append_memory_pdf_to_writer(pdf, writer, start_recto=start_recto)
        elif self.is_filename:
            if start_recto and len(writer.pages) % 2 != 0:
                # Pad with a blank page so that the new PDF starts on a
                # right-hand (odd-numbered) page.
                writer.add_blank_page()
            # Read the file into memory and close it straight away, rather
            # than handing the reader an open file object that nothing ever
            # closes. The reader then works from the in-memory copy, so it
            # does not depend on the file remaining open.
            with open(self.filename, "rb") as pdf_file:
                pdf_stream = io.BytesIO(pdf_file.read())
            writer.append_pages_from_reader(PdfReader(pdf_stream))
        else:
            raise AssertionError("PdfPlan: shouldn't get here!")


# =============================================================================
# Ancillary functions for PDFs
# =============================================================================
def assert_processor_available(processor: str) -> None:
    """
    Check that the named PDF processor is both recognized and importable.

    Args:
        processor: a PDF processor type from :class:`Processors`

    Raises:
        AssertionError: if ``processor`` is not one of the known types
        RuntimeError: if the processor is known, but its library was not
            imported
    """
    # (processor name, imported module or None, complaint if unavailable)
    known_engines = (
        (
            Processors.XHTML2PDF,
            xhtml2pdf,
            "rnc_pdf: xhtml2pdf requested, but not available",
        ),
        (
            Processors.WEASYPRINT,
            weasyprint,
            "rnc_pdf: Weasyprint requested, but not available",
        ),
        (
            Processors.PDFKIT,
            pdfkit,
            "rnc_pdf: pdfkit requested, but not available",
        ),
    )
    for engine_name, engine_module, unavailable_msg in known_engines:
        if processor == engine_name:
            if not engine_module:
                raise RuntimeError(unavailable_msg)
            return
    # Fell through: not a processor name that we know about.
    raise AssertionError(
        "rnc_pdf.set_pdf_processor: invalid PDF processor specified"
    )


def get_default_fix_pdfkit_encoding_bug() -> bool:
    """
    Decide whether the ``pdfkit`` encoding-bug workaround should be applied
    when the caller expresses no preference.

    Returns:
        ``True`` only if ``pdfkit`` was imported and its version is exactly
        0.5.0; ``False`` otherwise (including when ``pdfkit`` is absent).
    """
    if pdfkit is None:
        # No pdfkit, so there is nothing to work around.
        return False
    # noinspection PyUnresolvedReferences
    installed_version = Version(pdfkit.__version__)
    buggy_version = Version("0.5.0")
    return bool(installed_version == buggy_version)


def make_pdf_from_html(
    # Mandatory parameters:
    on_disk: bool,
    html: str,
    # Disk options:
    output_path: str = None,
    # Shared options:
    header_html: str = None,
    footer_html: str = None,
    wkhtmltopdf_filename: str = _WKHTMLTOPDF_FILENAME,
    wkhtmltopdf_options: Dict[str, Any] = None,
    file_encoding: str = "utf-8",
    debug_options: bool = False,
    debug_content: bool = False,
    debug_wkhtmltopdf_args: bool = True,
    fix_pdfkit_encoding_bug: bool = None,
    processor: str = _DEFAULT_PROCESSOR,
) -> Union[bytes, bool]:
    """
    Takes HTML and either returns a PDF in memory or makes one on disk.

    For preference, uses ``wkhtmltopdf`` (with ``pdfkit``):

    - faster than ``xhtml2pdf``
    - tables not buggy like ``Weasyprint``
    - however, doesn't support CSS Paged Media, so we have the
      ``header_html`` and ``footer_html`` options to allow you to pass
      appropriate HTML content to serve as the header/footer (rather than
      passing it within the main HTML).

    Args:
        on_disk: make file on disk (rather than returning it in memory)?
        html: main HTML
        output_path: if ``on_disk``, the output filename
        header_html: optional page header, as HTML (used only by the
            ``pdfkit`` processor)
        footer_html: optional page footer, as HTML (used only by the
            ``pdfkit`` processor)
        wkhtmltopdf_filename: filename of the ``wkhtmltopdf`` executable
        wkhtmltopdf_options: options for ``wkhtmltopdf``; the dictionary
            passed in is copied, not modified
        file_encoding: encoding to use when writing the header/footer to disk
        debug_options: log ``wkhtmltopdf`` config/options passed to
            ``pdfkit``?
        debug_content: log the main/header/footer HTML?
        debug_wkhtmltopdf_args: log the final command-line arguments that
            will be used by ``pdfkit`` when it calls ``wkhtmltopdf``?
        fix_pdfkit_encoding_bug: attempt to work around bug in e.g.
            ``pdfkit==0.5.0`` by encoding ``wkhtmltopdf_filename`` to UTF-8
            before passing it to ``pdfkit``? If you pass ``None`` here, then
            a default value is used, from
            :func:`get_default_fix_pdfkit_encoding_bug`.
        processor: a PDF processor type from :class:`Processors`

    Returns:
        If ``on_disk`` is false: the PDF binary as a ``bytes`` object.
        If ``on_disk`` is true: ``True`` for the ``xhtml2pdf`` and
        ``weasyprint`` processors; for ``pdfkit``, whatever its ``to_pdf()``
        call returns.

    Raises:
        AssertionError: if bad ``processor``
        RuntimeError: if requested processor is unavailable
    """
    # Work on a private copy: header/footer filenames may be added below, and
    # the caller's dictionary must not be altered.
    if wkhtmltopdf_options:
        wkhtmltopdf_options = wkhtmltopdf_options.copy()
    else:
        wkhtmltopdf_options = {}  # type: Dict[str, Any]
    assert_processor_available(processor)

    if debug_content:
        log.debug("html: {}", html)
        log.debug("header_html: {}", header_html)
        log.debug("footer_html: {}", footer_html)

    if fix_pdfkit_encoding_bug is None:
        fix_pdfkit_encoding_bug = get_default_fix_pdfkit_encoding_bug()

    if processor == Processors.XHTML2PDF:
        if on_disk:
            with open(output_path, mode="wb") as outfile:
                # noinspection PyUnresolvedReferences
                xhtml2pdf.document.pisaDocument(html, outfile)
            return True
        else:
            memfile = io.BytesIO()
            # noinspection PyUnresolvedReferences
            xhtml2pdf.document.pisaDocument(html, memfile)
            # ... returns a document, but we don't use it, so we don't store it
            # to stop pychecker complaining
            # http://xhtml2pdf.appspot.com/static/pisa-en.html
            memfile.seek(0)
            return memfile.read()
            # https://stackoverflow.com/questions/3310584

    elif processor == Processors.WEASYPRINT:
        if on_disk:
            # write_pdf() with a target writes the file and returns None, so
            # report success explicitly (consistent with the xhtml2pdf
            # branch) rather than returning that None.
            # noinspection PyUnresolvedReferences
            weasyprint.HTML(string=html).write_pdf(output_path)
            return True
        else:
            # http://ampad.de/blog/generating-pdfs-django/
            # noinspection PyUnresolvedReferences
            return weasyprint.HTML(string=html).write_pdf()

    elif processor == Processors.PDFKIT:
        # Config:
        if not wkhtmltopdf_filename:
            config = None
        else:
            if fix_pdfkit_encoding_bug:  # needs to be True for pdfkit==0.5.0
                log.debug(
                    "Attempting to fix bug in pdfkit (e.g. version 0.5.0)"
                    " by encoding wkhtmltopdf_filename to UTF-8"
                )
                # noinspection PyUnresolvedReferences
                config = pdfkit.configuration(
                    wkhtmltopdf=wkhtmltopdf_filename.encode("utf-8")
                )
                # the bug is that pdfkit.pdfkit.PDFKit.__init__ will attempt to
                # decode the string in its configuration object;
                # https://github.com/JazzCore/python-pdfkit/issues/32
            else:
                # noinspection PyUnresolvedReferences
                config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_filename)
        # Temporary files that a subprocess can read:
        #   https://stackoverflow.com/questions/15169101
        # wkhtmltopdf requires its HTML files to have ".html" extensions:
        #   https://stackoverflow.com/questions/5776125
        h_filename = None
        f_filename = None
        try:
            if header_html:
                h_fd, h_filename = tempfile.mkstemp(suffix=".html")
                try:
                    os.write(h_fd, header_html.encode(file_encoding))
                finally:
                    # Close the descriptor even if encoding/writing fails.
                    os.close(h_fd)
                wkhtmltopdf_options["header-html"] = h_filename
            if footer_html:
                f_fd, f_filename = tempfile.mkstemp(suffix=".html")
                try:
                    os.write(f_fd, footer_html.encode(file_encoding))
                finally:
                    # Close the descriptor even if encoding/writing fails.
                    os.close(f_fd)
                wkhtmltopdf_options["footer-html"] = f_filename
            if debug_options:
                log.debug("wkhtmltopdf config: {!r}", config)
                log.debug(
                    "wkhtmltopdf_options: {}", pformat(wkhtmltopdf_options)
                )
            # noinspection PyUnresolvedReferences
            kit = pdfkit.pdfkit.PDFKit(
                html,
                "string",
                configuration=config,
                options=wkhtmltopdf_options,
            )

            if on_disk:
                path = output_path
            else:
                path = None
            # With "path=None", the to_pdf() function directly returns
            # stdout from a subprocess.Popen().communicate() call (see
            # pdfkit.py). Since universal_newlines is not set, stdout will
            # be bytes in Python 3.

            if debug_wkhtmltopdf_args:
                # getpass.getuser() can raise when the user name cannot be
                # determined; that must not abort PDF generation merely for
                # the sake of a debug message.
                try:
                    current_user = getpass.getuser()
                except (ImportError, KeyError, OSError):
                    current_user = None
                log.debug("Probable current user: {!r}", current_user)
                log.debug(
                    "wkhtmltopdf arguments will be: {!r}",
                    kit.command(path=path),
                )

            return kit.to_pdf(path=path)

        finally:
            # Always remove the temporary header/footer files.
            if h_filename:
                os.remove(h_filename)
            if f_filename:
                os.remove(f_filename)

    else:
        raise AssertionError("Unknown PDF engine")


def get_pdf_from_html(
    html: str,
    header_html: str = None,
    footer_html: str = None,
    wkhtmltopdf_filename: str = _WKHTMLTOPDF_FILENAME,
    wkhtmltopdf_options: Dict[str, Any] = None,
    file_encoding: str = "utf-8",
    debug_options: bool = False,
    debug_content: bool = False,
    debug_wkhtmltopdf_args: bool = True,
    fix_pdfkit_encoding_bug: bool = None,
    processor: str = _DEFAULT_PROCESSOR,
) -> bytes:
    """
    Renders HTML to a PDF held in memory.

    This is :func:`make_pdf_from_html` with ``on_disk=False``; all other
    arguments are passed through unchanged, so see that function for their
    meaning.

    Returns:
        the PDF binary as a ``bytes`` object
    """
    # In-memory mode: no output path is involved.
    return make_pdf_from_html(
        on_disk=False,
        html=html,
        header_html=header_html,
        footer_html=footer_html,
        wkhtmltopdf_filename=wkhtmltopdf_filename,
        wkhtmltopdf_options=wkhtmltopdf_options,
        file_encoding=file_encoding,
        debug_options=debug_options,
        debug_content=debug_content,
        debug_wkhtmltopdf_args=debug_wkhtmltopdf_args,
        fix_pdfkit_encoding_bug=fix_pdfkit_encoding_bug,
        processor=processor,
    )


def pdf_from_html(
    html: str,
    header_html: str = None,
    footer_html: str = None,
    wkhtmltopdf_filename: str = _WKHTMLTOPDF_FILENAME,
    wkhtmltopdf_options: Dict[str, Any] = None,
    file_encoding: str = "utf-8",
    debug_options: bool = False,
    debug_content: bool = False,
    fix_pdfkit_encoding_bug: bool = True,
    processor: str = _DEFAULT_PROCESSOR,
) -> bytes:
    """
    Legacy alias for :func:`get_pdf_from_html` (q.v.).

    Every argument is forwarded by name. Note that, here,
    ``fix_pdfkit_encoding_bug`` defaults to ``True``.

    Returns:
        whatever :func:`get_pdf_from_html` returns (the PDF, as ``bytes``)
    """
    forwarded = dict(
        html=html,
        header_html=header_html,
        footer_html=footer_html,
        wkhtmltopdf_filename=wkhtmltopdf_filename,
        wkhtmltopdf_options=wkhtmltopdf_options,
        file_encoding=file_encoding,
        debug_options=debug_options,
        debug_content=debug_content,
        fix_pdfkit_encoding_bug=fix_pdfkit_encoding_bug,
        processor=processor,
    )
    return get_pdf_from_html(**forwarded)


def make_pdf_on_disk_from_html(
    html: str,
    output_path: str,
    header_html: str = None,
    footer_html: str = None,
    wkhtmltopdf_filename: str = _WKHTMLTOPDF_FILENAME,
    wkhtmltopdf_options: Dict[str, Any] = None,
    file_encoding: str = "utf-8",
    debug_options: bool = False,
    debug_content: bool = False,
    debug_wkhtmltopdf_args: bool = True,
    fix_pdfkit_encoding_bug: bool = None,
    processor: str = _DEFAULT_PROCESSOR,
) -> bool:
    """
    Renders HTML to a PDF file at ``output_path``.

    This is :func:`make_pdf_from_html` with ``on_disk=True``; all other
    arguments are passed through unchanged, so see that function for their
    meaning.

    Returns:
        success?
    """
    # On-disk mode: the PDF goes to output_path.
    return make_pdf_from_html(
        on_disk=True,
        output_path=output_path,
        html=html,
        header_html=header_html,
        footer_html=footer_html,
        wkhtmltopdf_filename=wkhtmltopdf_filename,
        wkhtmltopdf_options=wkhtmltopdf_options,
        file_encoding=file_encoding,
        debug_options=debug_options,
        debug_content=debug_content,
        debug_wkhtmltopdf_args=debug_wkhtmltopdf_args,
        fix_pdfkit_encoding_bug=fix_pdfkit_encoding_bug,
        processor=processor,
    )


def pdf_from_writer(writer: PdfWriter) -> bytes:
    """
    Returns the binary PDF produced by a pypdf writer object.

    Args:
        writer: the writer, whose ``write()`` method is called with an
            in-memory binary stream

    Returns:
        everything that the writer wrote to that stream, as ``bytes``
    """
    buffer = io.BytesIO()
    writer.write(buffer)
    # The whole buffer, irrespective of the current stream position.
    return buffer.getvalue()


def serve_pdf_to_stdout(pdf: bytes) -> None:
    """
    Serves a PDF to ``stdout`` (for web servers).

    Writes a ``Content-Type: application/pdf`` header, followed by a blank
    line, and then the PDF to ``stdout``.

    Args:
        pdf: the PDF, as ``bytes``

    See:

    - https://stackoverflow.com/questions/312230/proper-mime-type-for-pdf-files
    - https://www.askapache.com/htaccess/pdf-cookies-headers-rewrites.html
    - https://stackoverflow.com/questions/2374427
    """
    # print("Content-type: text/plain\n")  # for debugging
    print("Content-Type: application/pdf\n")
    # The header went through the text layer of stdout, which may still be
    # holding it in its own buffer. The PDF goes directly to the underlying
    # binary layer, so flush the text layer first; otherwise the PDF body
    # could be emitted before the header.
    sys.stdout.flush()
    # https://stackoverflow.com/questions/908331/how-to-write-binary-data-to-stdout-in-python-3  # noqa: E501
    sys.stdout.buffer.write(pdf)
    sys.stdout.buffer.flush()


def make_pdf_writer() -> PdfWriter:
    """
    Builds a new pypdf writer (constructed with no arguments) and returns it.
    """
    new_writer = PdfWriter()
    return new_writer


def append_memory_pdf_to_writer(
    input_pdf: bytes, writer: PdfWriter, start_recto: bool = True
) -> None:
    """
    Adds an in-memory PDF to the end of a pypdf writer.

    Does nothing if ``input_pdf`` is empty (or otherwise false).

    Args:
        input_pdf: the PDF, as ``bytes``
        writer: the writer
        start_recto: start a new right-hand page?
    """
    if input_pdf:
        if start_recto and len(writer.pages) % 2:
            # Odd number of pages so far: pad with a blank one, so that the
            # new PDF starts on a right-hand page.
            writer.add_blank_page()
            # ... suitable for double-sided printing
        writer.append(PdfReader(io.BytesIO(input_pdf)))


def append_pdf(input_pdf: bytes, output_writer: PdfWriter):
    """
    Legacy interface: appends a PDF (as ``bytes``) to a pypdf writer, by
    calling :func:`append_memory_pdf_to_writer` with its default
    ``start_recto`` setting.

    Args:
        input_pdf: the PDF, as ``bytes``
        output_writer: the writer to append to
    """
    append_memory_pdf_to_writer(input_pdf, output_writer)


# =============================================================================
# Serve concatenated PDFs
# =============================================================================
# Two ways in principle to do this:
# (1) Load data from each PDF into memory; concatenate; serve the result.
# (2) With each PDF on disk, create a temporary file (e.g. with pdftk),
#     serve the result (e.g. in one go), then delete the temporary file.
#     This may be more memory-efficient.
#     However, there can be problems:
#       https://stackoverflow.com/questions/7543452/how-to-launch-a-pdftk-subprocess-while-in-wsgi  # noqa: E501
# Others' examples:
#   https://gist.github.com/zyegfryed/918403
#   https://gist.github.com/grantmcconnaughey/ce90a689050c07c61c96
#   https://stackoverflow.com/questions/3582414/removing-tmp-file-after-return-httpresponse-in-django  # noqa: E501
def get_concatenated_pdf_from_disk(
    filenames: Iterable[str], start_recto: bool = True
) -> bytes:
    """
    Joins PDF files from disk, in order, into a single in-memory binary PDF.

    Empty (false) filenames are skipped.

    Args:
        filenames: iterable of filenames of PDFs to concatenate
        start_recto: start a new right-hand page for each new PDF?

    Returns:
        concatenated PDF, as ``bytes``
    """
    # https://stackoverflow.com/questions/17104926/pypdf-merging-multiple-pdf-files-into-one-pdf  # noqa: E501
    # https://en.wikipedia.org/wiki/Recto_and_verso
    # PdfMerger deprecated as of pypdf==5.0.0; use PdfWriter instead.
    # - https://pypdf.readthedocs.io/en/stable/modules/PdfMerger.html
    # - https://pypdf.readthedocs.io/en/stable/modules/PdfWriter.html
    combined = PdfWriter()
    for pdf_filename in filter(None, filenames):
        if start_recto and len(combined.pages) % 2 != 0:
            # Pad to an even page count, so the next PDF starts recto.
            combined.add_blank_page()
        combined.append(pdf_filename)
    return pdf_from_writer(combined)


def get_concatenated_pdf_in_memory(
    pdf_plans: Iterable[PdfPlan], start_recto: bool = True
) -> bytes:
    """
    Joins the PDFs described by a series of plans, in order, into a single
    in-memory binary PDF.

    Args:
        pdf_plans: iterable of :class:`PdfPlan` objects
        start_recto: start a new right-hand page for each new PDF?

    Returns:
        concatenated PDF, as ``bytes``
    """
    combined = PdfWriter()
    for plan in pdf_plans:
        # Each plan knows how to add itself (from HTML or from disk).
        plan.add_to_writer(combined, start_recto=start_recto)
    return pdf_from_writer(combined)


# =============================================================================
# Main -- to enable logging for imports, for debugging
# =============================================================================

if __name__ == "__main__":
    # When run as a script: set up basic logging and make this module's log
    # report at debug level.
    logging.basicConfig()
    log.setLevel(logging.DEBUG)