Mirror of https://github.com/freedomofpress/dangerzone.git
PDFunite: fix too many open files
For large PDFs (1,200+ pages), the pdfunite command would fail on some systems (e.g. Qubes), because it was invoked with more than 1024 file arguments, exceeding the open-file limit (`ulimit -n`). This fix splits the merge into batches, accumulating the results in a single PDF that is then merged with the next batch.
parent ebfed4ecda
commit f7190e3876

3 changed files with 38 additions and 18 deletions
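The gist of the fix, as a minimal standalone sketch before the diff itself: pages are merged in fixed-size batches, and from the second batch on the accumulated output is passed to pdfunite as the first input, so each invocation opens at most a batch's worth of files. Here BATCH_SIZE, merge_pdfs, and the synchronous subprocess.run call are illustrative assumptions, not the commit's actual code (which is asynchronous and shown below):

    # Minimal sketch of the batched-merge idea; not the commit's actual code.
    import os
    import subprocess
    from typing import List

    BATCH_SIZE = 50  # assumed value, comfortably below typical `ulimit -n` limits

    def merge_pdfs(pages: List[str], output: str) -> None:
        """Merge the PDF files in `pages` into `output`, BATCH_SIZE at a time."""
        tmp = output + ".tmp"
        for i in range(0, len(pages), BATCH_SIZE):
            batch = pages[i : i + BATCH_SIZE]
            # From the second batch on, prepend the accumulator to keep page order.
            args = ["pdfunite"] + ([output] if i > 0 else []) + batch + [tmp]
            subprocess.run(args, check=True)
            os.replace(tmp, output)  # merged result becomes the new accumulator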
common.py

@@ -56,6 +56,14 @@ def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
         yield (first_page, last_page)
 
 
+def get_batch_timeout(timeout: Optional[float], num_pages: int) -> Optional[float]:
+    if timeout is None:
+        return None
+    else:
+        num_batches = int(num_pages / PAGE_BATCH_SIZE) + 1
+        return timeout / num_batches
+
+
 class DangerzoneConverter:
     def __init__(self, progress_callback: Optional[Callable] = None) -> None:
         self.percentage: float = 0.0
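A quick illustration of the helper's behavior, assuming PAGE_BATCH_SIZE = 50 (the constant's definition is not part of this diff). The + 1 counts the final partial batch and keeps the division safe for documents shorter than one batch:

    # Illustrative only; PAGE_BATCH_SIZE = 50 is an assumed value.
    from typing import Optional

    PAGE_BATCH_SIZE = 50

    def get_batch_timeout(timeout: Optional[float], num_pages: int) -> Optional[float]:
        if timeout is None:
            return None
        num_batches = int(num_pages / PAGE_BATCH_SIZE) + 1
        return timeout / num_batches

    print(get_batch_timeout(600.0, 1200))  # 600 / 25 = 24.0 seconds per batch
    print(get_batch_timeout(600.0, 10))    # 600 / 1 = 600.0 (single batch)
    print(get_batch_timeout(None, 1200))   # None (no timeout configured)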
doc_to_pixels.py

@@ -23,6 +23,7 @@ from .common import (
     PAGE_BATCH_SIZE,
     DangerzoneConverter,
     batch_iterator,
+    get_batch_timeout,
     running_on_qubes,
 )
 
@@ -283,10 +284,7 @@ class DocumentToPixels(DangerzoneConverter):
         # Get a more precise timeout, based on the number of pages
         timeout = self.calculate_timeout(size, num_pages)
 
-        if timeout is None:
-            timeout_per_batch = None
-        else:
-            timeout_per_batch = timeout / (int(num_pages / PAGE_BATCH_SIZE) + 1)
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         for first_page, last_page in batch_iterator(num_pages):
             # XXX send data from the previous loop's conversion to
             # always be able to process and send data at the same time
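For reference, batch_iterator's body is not part of this diff; a plausible implementation, consistent with its signature and the yield shown in the first hunk (again assuming PAGE_BATCH_SIZE = 50), makes the loop above easy to follow:

    # Assumed implementation, consistent with the signature in the first hunk.
    from typing import Generator, Tuple

    PAGE_BATCH_SIZE = 50  # assumed value

    def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
        for first_page in range(1, num_pages + 1, PAGE_BATCH_SIZE):
            last_page = min(first_page + PAGE_BATCH_SIZE - 1, num_pages)
            yield (first_page, last_page)

    print(list(batch_iterator(120)))  # [(1, 50), (51, 100), (101, 120)]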
pixels_to_pdf.py

@@ -13,7 +13,12 @@ import shutil
 import sys
 from typing import Optional
 
-from .common import DangerzoneConverter, running_on_qubes
+from .common import (
+    DangerzoneConverter,
+    batch_iterator,
+    get_batch_timeout,
+    running_on_qubes,
+)
 
 
 class PixelsToPDF(DangerzoneConverter):
@@ -89,20 +94,29 @@ class PixelsToPDF(DangerzoneConverter):
         timeout = self.calculate_timeout(total_size, num_pages)
 
         # Merge pages into a single PDF
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         self.update_progress(f"Merging {num_pages} pages into a single PDF")
-        args = ["pdfunite"]
-        for page in range(1, num_pages + 1):
-            args.append(f"{tempdir}/page-{page}.pdf")
-        args.append(f"{tempdir}/safe-output.pdf")
-        await self.run_command(
-            args,
-            error_message="Merging pages into a single PDF failed",
-            timeout_message=(
-                "Error merging pages into a single PDF, pdfunite timed out after"
-                f" {timeout} seconds"
-            ),
-            timeout=timeout,
-        )
+        for first_page, last_page in batch_iterator(num_pages):
+            args = ["pdfunite"]
+            accumulator = f"{tempdir}/safe-output.pdf"  # PDF which accumulates pages
+            accumulator_temp = f"{tempdir}/safe-output_tmp.pdf"
+            if first_page > 1:  # Append at the beginning
+                args.append(accumulator)
+            for page in range(first_page, last_page + 1):
+                args.append(f"{tempdir}/page-{page}.pdf")
+            args.append(accumulator_temp)
+            await self.run_command(
+                args,
+                error_message="Merging pages into a single PDF failed",
+                timeout_message=(
+                    "Error merging pages into a single PDF, pdfunite timed out after"
+                    f" {timeout_per_batch} seconds"
+                ),
+                timeout=timeout_per_batch,
+            )
+            for page in range(first_page, last_page + 1):
+                os.remove(f"{tempdir}/page-{page}.pdf")
+            os.rename(accumulator_temp, accumulator)
 
         self.percentage += 2
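Two details of the new merge loop are worth noting. Because the accumulator is itself an input to pdfunite from the second batch on, each batch writes to accumulator_temp and only then renames it over accumulator, so pdfunite never reads and writes the same file. And since each batch's page files are removed as soon as they are merged, the number of files in the temporary directory shrinks as the merge progresses.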