Replace pdftoppm logic with PyMuPDF (native python)

Use PyMuPDF (AGPL-licensed) in the container's conversion step to replace the pdftoppm-based conversion of PDF pages to RGB. This massively simplifies the code, since PyMuPDF is a native Python library.
2025-04-28 18:02:38 +02:00 · 2023-11-13 15:20:07 +00:00 · 2023-11-13 15:20:07 +00:00 · 327ab8791f
commit 327ab8791f
parent e923ac0788
2 changed files with 20 additions and 80 deletions
--- a/5
+++ b/5
@ -7,7 +7,6 @@ ARG H2ORESTART_CHECKSUM=5db816a1e57b510456633f55e693cb5ef3675ef8b35df4f31c90ab9d
 RUN apk --no-cache -U upgrade && \
    apk --no-cache add \
    ghostscript \
-    graphicsmagick \
    libreoffice \
    openjdk8 \
    poppler-utils \
@ -17,6 +16,10 @@ RUN apk --no-cache -U upgrade && \
    tesseract-ocr \
    font-noto-cjk

+RUN apk --no-cache add --virtual .builddeps g++ gcc make python3-dev py3-pip \
+     && pip install --upgrade PyMuPDF \
+     && apk del .builddeps  # FIXME freeze w/ hashes
+
 # Download the trained models from the latest GitHub release of Tesseract, and
 # store them under /usr/share/tessdata. This is basically what distro packages
 # do under the hood.
--- a/dangerzone/conversion/doc_to_pixels.py
+++ b/dangerzone/conversion/doc_to_pixels.py
@ -15,6 +15,7 @@ import shutil
 import sys
 from typing import Dict, List, Optional

+import fitz
 import magic

 from . import errors
@ -267,87 +268,23 @@ class DocumentToPixels(DangerzoneConverter):

        # Get a more precise timeout, based on the number of pages
        timeout = self.calculate_timeout(size, num_pages)
-
-        async def pdftoppm_progress_callback(line: bytes) -> None:
-            """Function called for every line the 'pdftoppm' command outputs
-
-            Sample pdftoppm output:
-
-                $ pdftoppm sample.pdf  /tmp/safe -progress
-                1 4 /tmp/safe-1.ppm
-                2 4 /tmp/safe-2.ppm
-                3 4 /tmp/safe-3.ppm
-                4 4 /tmp/safe-4.ppm
-
-            Each successful line is in the format "{page} {page_num} {ppm_filename}"
-            """
-            try:
-                (page_str, num_pages_str, _) = line.decode().split()
-                num_pages = int(num_pages_str)
-                page = int(page_str)
-            except ValueError as e:
-                # Ignore all non-progress related output, since pdftoppm sends
-                # everything to stderr and thus, errors can't be distinguished
-                # easily. We rely instead on the exit code.
-                return
-
-            percentage_per_page = 45.0 / num_pages
-            self.percentage += percentage_per_page
-            self.update_progress(f"Converting page {page}/{num_pages} to pixels")
-
-            zero_padding = "0" * (len(num_pages_str) - len(page_str))
-            ppm_filename = f"{page_base}-{zero_padding}{page}.ppm"
-            rgb_filename = f"{page_base}-{page}.rgb"
-            width_filename = f"{page_base}-{page}.width"
-            height_filename = f"{page_base}-{page}.height"
-            filename_base = f"{page_base}-{page}"
-
-            with open(ppm_filename, "rb") as f:
-                # NOTE: PPM files have multiple ways of writing headers.
-                # For our specific case we parse it expecting the header format that ppmtopdf produces
-                # More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html
-
-                # Read the header
-                header = f.readline().decode().strip()
-                if header != "P6":
-                    raise errors.PDFtoPPMInvalidHeader()
-
-                # Save the width and height
-                dims = f.readline().decode().strip()
-                width, height = dims.split()
-                await self.write_page_width(int(width), width_filename)
-                await self.write_page_height(int(height), height_filename)
-
-                maxval = int(f.readline().decode().strip())
-                # Check that the depth is 8
-                if maxval != 255:
-                    raise errors.PDFtoPPMInvalidDepth()
-
-                data = f.read()
-
-            # Save pixel data
-            await self.write_page_data(data, rgb_filename)
-
-            # Delete the ppm file
-            os.remove(ppm_filename)
-
+        percentage_per_page = 45.0 / num_pages
        page_base = "/tmp/page"
+        doc = fitz.open(pdf_filename)
+        for page in doc:
+            # TODO check if page.number is doc-controlled
+            page_num = page.number + 1  # pages start at 1
+            rgb_filename = f"{page_base}-{page_num}.rgb"
+            width_filename = f"{page_base}-{page_num}.width"
+            height_filename = f"{page_base}-{page_num}.height"

-        await self.run_command(
-            [
-                "pdftoppm",
-                pdf_filename,
-                page_base,
-                "-progress",
-            ],
-            error_message="Conversion from PDF to PPM failed",
-            timeout_message=(
-                f"Error converting from PDF to PPM, pdftoppm timed out after {timeout}"
-                " seconds"
-            ),
-            stderr_callback=pdftoppm_progress_callback,
-            timeout=timeout,
-        )
+            self.percentage += percentage_per_page
+            self.update_progress(f"Converting page {page_num}/{num_pages} to pixels")
+            pix = page.get_pixmap(dpi=150)
+            rgb_buf = pix.samples_mv
+            await self.write_page_width(pix.width, width_filename)
+            await self.write_page_height(pix.height, height_filename)
+            await self.write_page_data(rgb_buf, rgb_filename)

        final_files = (
            glob.glob("/tmp/page-*.rgb")