mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-05-04 20:51:49 +02:00
PDFunite: fix too many open files
In large (1200+ page) PDFs the PDFunite command would fail on some systems (e.g. Qubes), because it would be called with 1024+ files, exceeding the open-file limit (`ulimit -n`). This solution splits the merging into batches, accumulating the results in a single PDF and then merging it with the next batch.
This commit is contained in:
parent
ebfed4ecda
commit
f7190e3876
3 changed files with 38 additions and 18 deletions
|
@ -56,6 +56,14 @@ def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
|
||||||
yield (first_page, last_page)
|
yield (first_page, last_page)
|
||||||
|
|
||||||
|
|
||||||
|
def get_batch_timeout(timeout: Optional[float], num_pages: int) -> Optional[float]:
    """Split the total conversion timeout evenly across page batches.

    Pages are processed in batches of PAGE_BATCH_SIZE, so each external
    command invocation should only get a proportional share of the overall
    time budget rather than the whole of it.

    :param timeout: total timeout for the whole document, or None for no limit
    :param num_pages: number of pages in the document
    :returns: the per-batch timeout, or None when no timeout was requested
    """
    if timeout is None:
        return None
    # "+ 1" accounts for the final partial batch and, crucially, prevents a
    # ZeroDivisionError when num_pages < PAGE_BATCH_SIZE (in which case
    # int(num_pages / PAGE_BATCH_SIZE) == 0). This matches the inline
    # computation this helper replaced:
    #   timeout / (int(num_pages / PAGE_BATCH_SIZE) + 1)
    num_batches = int(num_pages / PAGE_BATCH_SIZE) + 1
    return timeout / num_batches
|
||||||
|
|
||||||
|
|
||||||
class DangerzoneConverter:
|
class DangerzoneConverter:
|
||||||
def __init__(self, progress_callback: Optional[Callable] = None) -> None:
|
def __init__(self, progress_callback: Optional[Callable] = None) -> None:
|
||||||
self.percentage: float = 0.0
|
self.percentage: float = 0.0
|
||||||
|
|
|
@ -23,6 +23,7 @@ from .common import (
|
||||||
PAGE_BATCH_SIZE,
|
PAGE_BATCH_SIZE,
|
||||||
DangerzoneConverter,
|
DangerzoneConverter,
|
||||||
batch_iterator,
|
batch_iterator,
|
||||||
|
get_batch_timeout,
|
||||||
running_on_qubes,
|
running_on_qubes,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -283,10 +284,7 @@ class DocumentToPixels(DangerzoneConverter):
|
||||||
# Get a more precise timeout, based on the number of pages
|
# Get a more precise timeout, based on the number of pages
|
||||||
timeout = self.calculate_timeout(size, num_pages)
|
timeout = self.calculate_timeout(size, num_pages)
|
||||||
|
|
||||||
if timeout is None:
|
timeout_per_batch = get_batch_timeout(timeout, num_pages)
|
||||||
timeout_per_batch = None
|
|
||||||
else:
|
|
||||||
timeout_per_batch = timeout / (int(num_pages / PAGE_BATCH_SIZE) + 1)
|
|
||||||
for first_page, last_page in batch_iterator(num_pages):
|
for first_page, last_page in batch_iterator(num_pages):
|
||||||
# XXX send data from the previous loop's conversion to
|
# XXX send data from the previous loop's conversion to
|
||||||
# always be able to process and send data at the same time
|
# always be able to process and send data at the same time
|
||||||
|
|
|
@ -13,7 +13,12 @@ import shutil
|
||||||
import sys
|
import sys
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from .common import DangerzoneConverter, running_on_qubes
|
from .common import (
|
||||||
|
DangerzoneConverter,
|
||||||
|
batch_iterator,
|
||||||
|
get_batch_timeout,
|
||||||
|
running_on_qubes,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class PixelsToPDF(DangerzoneConverter):
|
class PixelsToPDF(DangerzoneConverter):
|
||||||
|
@ -89,20 +94,29 @@ class PixelsToPDF(DangerzoneConverter):
|
||||||
timeout = self.calculate_timeout(total_size, num_pages)
|
timeout = self.calculate_timeout(total_size, num_pages)
|
||||||
|
|
||||||
# Merge pages into a single PDF
|
# Merge pages into a single PDF
|
||||||
|
timeout_per_batch = get_batch_timeout(timeout, num_pages)
|
||||||
self.update_progress(f"Merging {num_pages} pages into a single PDF")
|
self.update_progress(f"Merging {num_pages} pages into a single PDF")
|
||||||
args = ["pdfunite"]
|
for first_page, last_page in batch_iterator(num_pages):
|
||||||
for page in range(1, num_pages + 1):
|
args = ["pdfunite"]
|
||||||
args.append(f"{tempdir}/page-{page}.pdf")
|
accumulator = f"{tempdir}/safe-output.pdf" # PDF which accumulates pages
|
||||||
args.append(f"{tempdir}/safe-output.pdf")
|
accumulator_temp = f"{tempdir}/safe-output_tmp.pdf"
|
||||||
await self.run_command(
|
if first_page > 1: # Append at the beginning
|
||||||
args,
|
args.append(accumulator)
|
||||||
error_message="Merging pages into a single PDF failed",
|
for page in range(first_page, last_page + 1):
|
||||||
timeout_message=(
|
args.append(f"{tempdir}/page-{page}.pdf")
|
||||||
"Error merging pages into a single PDF, pdfunite timed out after"
|
args.append(accumulator_temp)
|
||||||
f" {timeout} seconds"
|
await self.run_command(
|
||||||
),
|
args,
|
||||||
timeout=timeout,
|
error_message="Merging pages into a single PDF failed",
|
||||||
)
|
timeout_message=(
|
||||||
|
"Error merging pages into a single PDF, pdfunite timed out after"
|
||||||
|
f" {timeout_per_batch} seconds"
|
||||||
|
),
|
||||||
|
timeout=timeout_per_batch,
|
||||||
|
)
|
||||||
|
for page in range(first_page, last_page + 1):
|
||||||
|
os.remove(f"{tempdir}/page-{page}.pdf")
|
||||||
|
os.rename(accumulator_temp, accumulator)
|
||||||
|
|
||||||
self.percentage += 2
|
self.percentage += 2
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue