PDFunite: fix too many open files

For large PDFs (1200+ pages), the pdfunite command would fail on some systems
(e.g. Qubes) because it was invoked with 1024+ files at once, exceeding the
open file limit (`ulimit -n`).

The fix splits the merging into batches: each batch is merged into an
accumulator PDF, which is then passed in as the first input of the next
batch's merge.
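
A minimal standalone sketch of this pattern (the batch size of 50, the helper
name, and the page-N.pdf / safe-output.pdf layout are illustrative assumptions,
not taken verbatim from this commit):

import os
import subprocess

PAGE_BATCH_SIZE = 50  # assumed batch size, small enough to stay under `ulimit -n`


def merge_in_batches(tempdir: str, num_pages: int) -> str:
    """Merge page-1.pdf .. page-<num_pages>.pdf into safe-output.pdf, one batch at a time."""
    accumulator = f"{tempdir}/safe-output.pdf"           # PDF which accumulates the pages so far
    accumulator_temp = f"{tempdir}/safe-output_tmp.pdf"  # output of the current pdfunite call
    for first_page in range(1, num_pages + 1, PAGE_BATCH_SIZE):
        last_page = min(first_page + PAGE_BATCH_SIZE - 1, num_pages)
        args = ["pdfunite"]
        if first_page > 1:
            args.append(accumulator)                     # previously merged pages go first
        args += [f"{tempdir}/page-{page}.pdf" for page in range(first_page, last_page + 1)]
        args.append(accumulator_temp)                    # pdfunite's last argument is the output file
        subprocess.run(args, check=True)
        os.rename(accumulator_temp, accumulator)
    return accumulator

Each pdfunite call therefore opens at most one batch of pages plus the
accumulator and the output file, staying well below the open file limit.
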
deeplow 2023-11-02 15:00:10 +00:00
parent ebfed4ecda
commit f7190e3876
3 changed files with 38 additions and 18 deletions

dangerzone/conversion/common.py

@@ -56,6 +56,14 @@ def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
         yield (first_page, last_page)
 
 
+def get_batch_timeout(timeout: Optional[float], num_pages: int) -> Optional[float]:
+    if timeout is None:
+        return None
+    else:
+        num_batches = int(num_pages / PAGE_BATCH_SIZE)
+        return timeout / num_batches
+
+
 class DangerzoneConverter:
     def __init__(self, progress_callback: Optional[Callable] = None) -> None:
         self.percentage: float = 0.0

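The new helper spreads the overall document timeout evenly across batches. A
worked example of the arithmetic, with illustrative numbers and assuming a
PAGE_BATCH_SIZE of 50:

# Illustrative numbers only, assuming PAGE_BATCH_SIZE == 50: a 1200-page document
# gives int(1200 / 50) = 24 batches, so a 600-second overall timeout becomes
# 600 / 24 = 25 seconds for each batch.
assert get_batch_timeout(None, 1200) is None     # no overall timeout -> no per-batch timeout
assert get_batch_timeout(600.0, 1200) == 25.0
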
dangerzone/conversion/doc_to_pixels.py

@@ -23,6 +23,7 @@ from .common import (
     PAGE_BATCH_SIZE,
     DangerzoneConverter,
     batch_iterator,
+    get_batch_timeout,
     running_on_qubes,
 )
 
@@ -283,10 +284,7 @@ class DocumentToPixels(DangerzoneConverter):
 
         # Get a more precise timeout, based on the number of pages
         timeout = self.calculate_timeout(size, num_pages)
-        if timeout is None:
-            timeout_per_batch = None
-        else:
-            timeout_per_batch = timeout / (int(num_pages / PAGE_BATCH_SIZE) + 1)
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         for first_page, last_page in batch_iterator(num_pages):
             # XXX send data from the previous loop's conversion to
             # always be able to process and send data at the same time

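Both converters now drive their loops with batch_iterator, which yields
inclusive (first_page, last_page) ranges covering the document. A small
illustration, again assuming a PAGE_BATCH_SIZE of 50:

# Assuming PAGE_BATCH_SIZE == 50, a 120-page document is processed in three batches;
# list(batch_iterator(120)) would give [(1, 50), (51, 100), (101, 120)].
for first_page, last_page in batch_iterator(120):
    print(first_page, last_page)
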
dangerzone/conversion/pixels_to_pdf.py

@@ -13,7 +13,12 @@ import shutil
 import sys
 from typing import Optional
 
-from .common import DangerzoneConverter, running_on_qubes
+from .common import (
+    DangerzoneConverter,
+    batch_iterator,
+    get_batch_timeout,
+    running_on_qubes,
+)
 
 
 class PixelsToPDF(DangerzoneConverter):
@@ -89,20 +94,29 @@ class PixelsToPDF(DangerzoneConverter):
         timeout = self.calculate_timeout(total_size, num_pages)
 
         # Merge pages into a single PDF
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         self.update_progress(f"Merging {num_pages} pages into a single PDF")
+        for first_page, last_page in batch_iterator(num_pages):
             args = ["pdfunite"]
-        for page in range(1, num_pages + 1):
+            accumulator = f"{tempdir}/safe-output.pdf"  # PDF which accumulates pages
+            accumulator_temp = f"{tempdir}/safe-output_tmp.pdf"
+            if first_page > 1:  # Append at the beginning
+                args.append(accumulator)
+            for page in range(first_page, last_page + 1):
                 args.append(f"{tempdir}/page-{page}.pdf")
-        args.append(f"{tempdir}/safe-output.pdf")
+            args.append(accumulator_temp)
             await self.run_command(
                 args,
                 error_message="Merging pages into a single PDF failed",
                 timeout_message=(
                     "Error merging pages into a single PDF, pdfunite timed out after"
-                f" {timeout} seconds"
+                    f" {timeout_per_batch} seconds"
                 ),
-            timeout=timeout,
+                timeout=timeout_per_batch,
             )
+            for page in range(first_page, last_page + 1):
+                os.remove(f"{tempdir}/page-{page}.pdf")
+            os.rename(accumulator_temp, accumulator)
 
         self.percentage += 2