Replace pdftoppm logic with PyMuPDF (native python)

Use PyMuPDF (AGPL-licensed) within the container conversion to replace the pdf conversion to RGB. This massively simplifies the code since PyMuPDF is a native python library.
2025-04-28 18:02:38 +02:00 · 2023-11-13 15:20:07 +00:00 · 2023-11-13 15:20:07 +00:00 · 327ab8791f
commit 327ab8791f
parent e923ac0788
2 changed files with 20 additions and 80 deletions
--- a/5
+++ b/5
@ -7,7 +7,6 @@ ARG H2ORESTART_CHECKSUM=5db816a1e57b510456633f55e693cb5ef3675ef8b35df4f31c90ab9d
 RUN apk --no-cache -U upgrade && \
    apk --no-cache add \
    ghostscript \
    graphicsmagick \
    libreoffice \
    openjdk8 \
    poppler-utils \
@ -17,6 +16,10 @@ RUN apk --no-cache -U upgrade && \
    tesseract-ocr \
    font-noto-cjk
 RUN apk --no-cache add --virtual .builddeps g++ gcc make python3-dev py3-pip \
     && pip install --upgrade PyMuPDF \
     && apk del .builddeps  # FIXME freeze w/ hashes
 # Download the trained models from the latest GitHub release of Tesseract, and
 # store them under /usr/share/tessdata. This is basically what distro packages
 # do under the hood.
--- a/dangerzone/conversion/doc_to_pixels.py
+++ b/dangerzone/conversion/doc_to_pixels.py
@ -15,6 +15,7 @@ import shutil
 import sys
 from typing import Dict, List, Optional
 import fitz
 import magic
 from . import errors
@ -267,87 +268,23 @@ class DocumentToPixels(DangerzoneConverter):
        # Get a more precise timeout, based on the number of pages
        timeout = self.calculate_timeout(size, num_pages)
-
+        percentage_per_page = 45.0 / num_pages
        async def pdftoppm_progress_callback(line: bytes) -> None:
            """Function called for every line the 'pdftoppm' command outputs
            Sample pdftoppm output:
                $ pdftoppm sample.pdf  /tmp/safe -progress
                1 4 /tmp/safe-1.ppm
                2 4 /tmp/safe-2.ppm
                3 4 /tmp/safe-3.ppm
                4 4 /tmp/safe-4.ppm
            Each successful line is in the format "{page} {page_num} {ppm_filename}"
            """
            try:
                (page_str, num_pages_str, _) = line.decode().split()
                num_pages = int(num_pages_str)
                page = int(page_str)
            except ValueError as e:
                # Ignore all non-progress related output, since pdftoppm sends
                # everything to stderr and thus, errors can't be distinguished
                # easily. We rely instead on the exit code.
                return
            percentage_per_page = 45.0 / num_pages
            self.percentage += percentage_per_page
            self.update_progress(f"Converting page {page}/{num_pages} to pixels")
            zero_padding = "0" * (len(num_pages_str) - len(page_str))
            ppm_filename = f"{page_base}-{zero_padding}{page}.ppm"
            rgb_filename = f"{page_base}-{page}.rgb"
            width_filename = f"{page_base}-{page}.width"
            height_filename = f"{page_base}-{page}.height"
            filename_base = f"{page_base}-{page}"
            with open(ppm_filename, "rb") as f:
                # NOTE: PPM files have multiple ways of writing headers.
                # For our specific case we parse it expecting the header format that ppmtopdf produces
                # More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html
                # Read the header
                header = f.readline().decode().strip()
                if header != "P6":
                    raise errors.PDFtoPPMInvalidHeader()
                # Save the width and height
                dims = f.readline().decode().strip()
                width, height = dims.split()
                await self.write_page_width(int(width), width_filename)
                await self.write_page_height(int(height), height_filename)
                maxval = int(f.readline().decode().strip())
                # Check that the depth is 8
                if maxval != 255:
                    raise errors.PDFtoPPMInvalidDepth()
                data = f.read()
            # Save pixel data
            await self.write_page_data(data, rgb_filename)
            # Delete the ppm file
            os.remove(ppm_filename)
        page_base = "/tmp/page"
        doc = fitz.open(pdf_filename)
        for page in doc:
            # TODO check if page.number is doc-controlled
            page_num = page.number + 1  # pages start in 1
            rgb_filename = f"{page_base}-{page_num}.rgb"
            width_filename = f"{page_base}-{page_num}.width"
            height_filename = f"{page_base}-{page_num}.height"
-        await self.run_command(
+            self.percentage += percentage_per_page
-            [
+            self.update_progress(f"Converting page {page_num}/{num_pages} to pixels")
-                "pdftoppm",
+            pix = page.get_pixmap(dpi=150)
-                pdf_filename,
+            rgb_buf = pix.samples_mv
-                page_base,
+            await self.write_page_width(pix.width, width_filename)
-                "-progress",
+            await self.write_page_height(pix.height, height_filename)
-            ],
+            await self.write_page_data(rgb_buf, rgb_filename)
            error_message="Conversion from PDF to PPM failed",
            timeout_message=(
                f"Error converting from PDF to PPM, pdftoppm timed out after {timeout}"
                " seconds"
            ),
            stderr_callback=pdftoppm_progress_callback,
            timeout=timeout,
        )
        final_files = (
            glob.glob("/tmp/page-*.rgb")