PDFunite: fix too many open files

For large PDFs (1200+ pages), the pdfunite command would fail on some systems
(e.g. Qubes) because it was invoked with 1024+ files at once, exceeding the
open file limit (`ulimit -n`).

The fix splits the merging into batches: each batch is merged into an
accumulator PDF, which is then passed in as the first input of the next
batch's merge.
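
A minimal standalone sketch of this pattern (the batch size of 50, the helper
name, and the page-N.pdf / safe-output.pdf layout are illustrative assumptions,
not taken verbatim from this commit):

import os
import subprocess

PAGE_BATCH_SIZE = 50  # assumed batch size, small enough to stay under `ulimit -n`


def merge_in_batches(tempdir: str, num_pages: int) -> str:
    """Merge page-1.pdf .. page-<num_pages>.pdf into safe-output.pdf, one batch at a time."""
    accumulator = f"{tempdir}/safe-output.pdf"           # PDF which accumulates the pages so far
    accumulator_temp = f"{tempdir}/safe-output_tmp.pdf"  # output of the current pdfunite call
    for first_page in range(1, num_pages + 1, PAGE_BATCH_SIZE):
        last_page = min(first_page + PAGE_BATCH_SIZE - 1, num_pages)
        args = ["pdfunite"]
        if first_page > 1:
            args.append(accumulator)                     # previously merged pages go first
        args += [f"{tempdir}/page-{page}.pdf" for page in range(first_page, last_page + 1)]
        args.append(accumulator_temp)                    # pdfunite's last argument is the output file
        subprocess.run(args, check=True)
        os.rename(accumulator_temp, accumulator)
    return accumulator

Each pdfunite call therefore opens at most one batch of pages plus the
accumulator and the output file, staying well below the open file limit.
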
deeplow 2023-11-02 15:00:10 +00:00
parent ebfed4ecda
commit f7190e3876
3 changed files with 38 additions and 18 deletions

dangerzone/conversion/common.py

@@ -56,6 +56,14 @@ def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
         yield (first_page, last_page)
 
 
+def get_batch_timeout(timeout: Optional[float], num_pages: int) -> Optional[float]:
+    if timeout is None:
+        return None
+    else:
+        num_batches = int(num_pages / PAGE_BATCH_SIZE)
+        return timeout / num_batches
+
+
 class DangerzoneConverter:
     def __init__(self, progress_callback: Optional[Callable] = None) -> None:
         self.percentage: float = 0.0

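The new helper spreads the overall document timeout evenly across batches. A
worked example of the arithmetic, with illustrative numbers and assuming a
PAGE_BATCH_SIZE of 50:

# Illustrative numbers only, assuming PAGE_BATCH_SIZE == 50: a 1200-page document
# gives int(1200 / 50) = 24 batches, so a 600-second overall timeout becomes
# 600 / 24 = 25 seconds for each batch.
assert get_batch_timeout(None, 1200) is None     # no overall timeout -> no per-batch timeout
assert get_batch_timeout(600.0, 1200) == 25.0
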
dangerzone/conversion/doc_to_pixels.py

@@ -23,6 +23,7 @@ from .common import (
     PAGE_BATCH_SIZE,
     DangerzoneConverter,
     batch_iterator,
+    get_batch_timeout,
     running_on_qubes,
 )
 
@@ -283,10 +284,7 @@ class DocumentToPixels(DangerzoneConverter):
 
         # Get a more precise timeout, based on the number of pages
         timeout = self.calculate_timeout(size, num_pages)
-        if timeout is None:
-            timeout_per_batch = None
-        else:
-            timeout_per_batch = timeout / (int(num_pages / PAGE_BATCH_SIZE) + 1)
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         for first_page, last_page in batch_iterator(num_pages):
             # XXX send data from the previous loop's conversion to
             # always be able to process and send data at the same time

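Both converters now drive their loops with batch_iterator, which yields
inclusive (first_page, last_page) ranges covering the document. A small
illustration, again assuming a PAGE_BATCH_SIZE of 50:

# Assuming PAGE_BATCH_SIZE == 50, a 120-page document is processed in three batches;
# list(batch_iterator(120)) would give [(1, 50), (51, 100), (101, 120)].
for first_page, last_page in batch_iterator(120):
    print(first_page, last_page)
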
dangerzone/conversion/pixels_to_pdf.py

@@ -13,7 +13,12 @@ import shutil
 import sys
 from typing import Optional
 
-from .common import DangerzoneConverter, running_on_qubes
+from .common import (
+    DangerzoneConverter,
+    batch_iterator,
+    get_batch_timeout,
+    running_on_qubes,
+)
 
 
 class PixelsToPDF(DangerzoneConverter):
@@ -89,20 +94,29 @@ class PixelsToPDF(DangerzoneConverter):
         timeout = self.calculate_timeout(total_size, num_pages)
 
         # Merge pages into a single PDF
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         self.update_progress(f"Merging {num_pages} pages into a single PDF")
+        for first_page, last_page in batch_iterator(num_pages):
             args = ["pdfunite"]
-        for page in range(1, num_pages + 1):
+            accumulator = f"{tempdir}/safe-output.pdf"  # PDF which accumulates pages
+            accumulator_temp = f"{tempdir}/safe-output_tmp.pdf"
+            if first_page > 1:  # Append at the beginning
+                args.append(accumulator)
+            for page in range(first_page, last_page + 1):
                 args.append(f"{tempdir}/page-{page}.pdf")
-        args.append(f"{tempdir}/safe-output.pdf")
+            args.append(accumulator_temp)
             await self.run_command(
                 args,
                 error_message="Merging pages into a single PDF failed",
                 timeout_message=(
                     "Error merging pages into a single PDF, pdfunite timed out after"
-                f" {timeout} seconds"
+                    f" {timeout_per_batch} seconds"
                 ),
-            timeout=timeout,
+                timeout=timeout_per_batch,
             )
+            for page in range(first_page, last_page + 1):
+                os.remove(f"{tempdir}/page-{page}.pdf")
+            os.rename(accumulator_temp, accumulator)
 
         self.percentage += 2