mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-05-04 20:51:49 +02:00
Process PDF->RGB in groups of 50 pages
PDFtoPPM was producing RGB files faster than they were being consumed. Since the RGB files were only removed after they were sent, this was causing /tmp on the server to fill up. This solution consists of processing and sending images in chunks of 50 pages. It is slightly inefficient, since it can't process and send data simultaneously; that will be solved in a future commit. Fixes #574
This commit is contained in:
parent
53115b3ffa
commit
3046cb7b8b
2 changed files with 78 additions and 62 deletions
|
@ -10,12 +10,14 @@ import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from typing import Callable, Dict, List, Optional, Tuple, Union
|
from typing import Callable, Dict, Generator, List, Optional, Tuple, Union
|
||||||
|
|
||||||
TIMEOUT_PER_PAGE: float = 30 # (seconds)
|
TIMEOUT_PER_PAGE: float = 30 # (seconds)
|
||||||
TIMEOUT_PER_MB: float = 30 # (seconds)
|
TIMEOUT_PER_MB: float = 30 # (seconds)
|
||||||
TIMEOUT_MIN: float = 60 # (seconds)
|
TIMEOUT_MIN: float = 60 # (seconds)
|
||||||
|
|
||||||
|
PAGE_BATCH_SIZE = 50 # number of pages converted and sent per batch
|
||||||
|
|
||||||
|
|
||||||
def running_on_qubes() -> bool:
|
def running_on_qubes() -> bool:
|
||||||
# https://www.qubes-os.org/faq/#what-is-the-canonical-way-to-detect-qubes-vm
|
# https://www.qubes-os.org/faq/#what-is-the-canonical-way-to-detect-qubes-vm
|
||||||
|
@ -44,6 +46,16 @@ def calculate_timeout(size: float, pages: Optional[float] = None) -> float:
|
||||||
return timeout
|
return timeout
|
||||||
|
|
||||||
|
|
||||||
|
def batch_iterator(
    num_pages: int, batch_size: Optional[int] = None
) -> Generator[Tuple[int, int], None, None]:
    """Iterate over consecutive page ranges of at most ``batch_size`` pages.

    Page numbers are 1-based and both ends of each yielded range are
    inclusive. Every page in ``1..num_pages`` is covered exactly once.

    Args:
        num_pages: Total number of pages in the document.
        batch_size: Maximum number of pages per batch. Defaults to
            PAGE_BATCH_SIZE when omitted.

    Yields:
        ``(first_page, last_page)`` tuples, in ascending order. Nothing is
        yielded when ``num_pages`` is less than 1.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size is None:
        batch_size = PAGE_BATCH_SIZE
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for first_page in range(1, num_pages + 1, batch_size):
        # Clamp the end of the range instead of guessing which batch is the
        # last one: the previous `first_page + size >= num_pages` test fired
        # one batch too early when `num_pages % size == 1`, producing an
        # oversized batch followed by a duplicate of the final page.
        last_page = min(first_page + batch_size - 1, num_pages)
        yield (first_page, last_page)
|
||||||
|
|
||||||
|
|
||||||
class DangerzoneConverter:
|
class DangerzoneConverter:
|
||||||
def __init__(self, progress_callback: Optional[Callable] = None) -> None:
|
def __init__(self, progress_callback: Optional[Callable] = None) -> None:
|
||||||
self.percentage: float = 0.0
|
self.percentage: float = 0.0
|
||||||
|
|
|
@ -19,7 +19,14 @@ from typing import Dict, List, Optional
|
||||||
import magic
|
import magic
|
||||||
|
|
||||||
from . import errors
|
from . import errors
|
||||||
from .common import DangerzoneConverter, running_on_qubes
|
from .common import (
|
||||||
|
PAGE_BATCH_SIZE,
|
||||||
|
DangerzoneConverter,
|
||||||
|
batch_iterator,
|
||||||
|
running_on_qubes,
|
||||||
|
)
|
||||||
|
|
||||||
|
PAGE_BASE = "/tmp/page"
|
||||||
|
|
||||||
|
|
||||||
class DocumentToPixels(DangerzoneConverter):
|
class DocumentToPixels(DangerzoneConverter):
|
||||||
|
@ -276,39 +283,70 @@ class DocumentToPixels(DangerzoneConverter):
|
||||||
# Get a more precise timeout, based on the number of pages
|
# Get a more precise timeout, based on the number of pages
|
||||||
timeout = self.calculate_timeout(size, num_pages)
|
timeout = self.calculate_timeout(size, num_pages)
|
||||||
|
|
||||||
async def pdftoppm_progress_callback(line: bytes) -> None:
|
if timeout is None:
|
||||||
"""Function called for every line the 'pdftoppm' command outputs
|
timeout_per_batch = None
|
||||||
|
else:
|
||||||
|
timeout_per_batch = timeout / (int(num_pages / PAGE_BATCH_SIZE) + 1)
|
||||||
|
for first_page, last_page in batch_iterator(num_pages):
|
||||||
|
await self.pdf_to_rgb(first_page, last_page, pdf_filename, timeout_per_batch)
|
||||||
|
await self.send_rgb_files(first_page, last_page, num_pages)
|
||||||
|
|
||||||
Sample pdftoppm output:
|
final_files = (
|
||||||
|
glob.glob("/tmp/page-*.rgb")
|
||||||
|
+ glob.glob("/tmp/page-*.width")
|
||||||
|
+ glob.glob("/tmp/page-*.height")
|
||||||
|
)
|
||||||
|
|
||||||
$ pdftoppm sample.pdf /tmp/safe -progress
|
# XXX: Sanity check to avoid situations like #560.
|
||||||
1 4 /tmp/safe-1.ppm
|
if not running_on_qubes() and len(final_files) != 3 * num_pages:
|
||||||
2 4 /tmp/safe-2.ppm
|
raise errors.PageCountMismatch()
|
||||||
3 4 /tmp/safe-3.ppm
|
|
||||||
4 4 /tmp/safe-4.ppm
|
|
||||||
|
|
||||||
Each successful line is in the format "{page} {page_num} {ppm_filename}"
|
# Move converted files into /tmp/dangerzone
|
||||||
"""
|
for filename in final_files:
|
||||||
try:
|
shutil.move(filename, "/tmp/dangerzone")
|
||||||
(page_str, num_pages_str, _) = line.decode().split()
|
|
||||||
num_pages = int(num_pages_str)
|
|
||||||
page = int(page_str)
|
|
||||||
except ValueError as e:
|
|
||||||
# Ignore all non-progress related output, since pdftoppm sends
|
|
||||||
# everything to stderr and thus, errors can't be distinguished
|
|
||||||
# easily. We rely instead on the exit code.
|
|
||||||
return
|
|
||||||
|
|
||||||
|
self.update_progress("Converted document to pixels")
|
||||||
|
|
||||||
|
async def pdf_to_rgb(
    self,
    first_page: int,
    last_page: int,
    pdf_filename: str,
    timeout: Optional[float],
) -> None:
    """Run ``pdftoppm`` on one page range of a PDF.

    The command is given PAGE_BASE as its output prefix and is restricted
    to the pages ``first_page``..``last_page`` via the ``-f``/``-l`` options.

    Args:
        first_page: Value passed to pdftoppm's ``-f`` option.
        last_page: Value passed to pdftoppm's ``-l`` option.
        pdf_filename: Path of the PDF file handed to pdftoppm.
        timeout: Timeout forwarded to ``self.run_command``; it is also
            interpolated into the timeout error message.
    """
    page_range = ["-f", str(first_page), "-l", str(last_page)]
    command = ["pdftoppm", pdf_filename, PAGE_BASE, "-progress", *page_range]
    timed_out_text = (
        f"Error converting from PDF to PPM, pdftoppm timed out after {timeout}"
        " seconds"
    )
    await self.run_command(
        command,
        error_message="Conversion from PDF to PPM failed",
        timeout_message=timed_out_text,
        timeout=timeout,
    )
|
||||||
|
|
||||||
|
async def send_rgb_files(
|
||||||
|
self, first_page: int, last_page: int, num_pages: int
|
||||||
|
) -> None:
|
||||||
|
for page in range(first_page, last_page + 1):
|
||||||
percentage_per_page = 45.0 / num_pages
|
percentage_per_page = 45.0 / num_pages
|
||||||
self.percentage += percentage_per_page
|
self.percentage += percentage_per_page
|
||||||
self.update_progress(f"Converting page {page}/{num_pages} to pixels")
|
self.update_progress(f"Converting pages {page}/{num_pages} to pixels")
|
||||||
|
|
||||||
zero_padding = "0" * (len(num_pages_str) - len(page_str))
|
zero_padding = "0" * (len(str(num_pages)) - len(str(page)))
|
||||||
ppm_filename = f"{page_base}-{zero_padding}{page}.ppm"
|
ppm_filename = f"{PAGE_BASE}-{zero_padding}{page}.ppm"
|
||||||
rgb_filename = f"{page_base}-{page}.rgb"
|
rgb_filename = f"{PAGE_BASE}-{page}.rgb"
|
||||||
width_filename = f"{page_base}-{page}.width"
|
width_filename = f"{PAGE_BASE}-{page}.width"
|
||||||
height_filename = f"{page_base}-{page}.height"
|
height_filename = f"{PAGE_BASE}-{page}.height"
|
||||||
filename_base = f"{page_base}-{page}"
|
filename_base = f"{PAGE_BASE}-{page}"
|
||||||
|
|
||||||
with open(ppm_filename, "rb") as f:
|
with open(ppm_filename, "rb") as f:
|
||||||
# NOTE: PPM files have multiple ways of writing headers.
|
# NOTE: PPM files have multiple ways of writing headers.
|
||||||
|
@ -339,40 +377,6 @@ class DocumentToPixels(DangerzoneConverter):
|
||||||
# Delete the ppm file
|
# Delete the ppm file
|
||||||
os.remove(ppm_filename)
|
os.remove(ppm_filename)
|
||||||
|
|
||||||
page_base = "/tmp/page"
|
|
||||||
|
|
||||||
await self.run_command(
|
|
||||||
[
|
|
||||||
"pdftoppm",
|
|
||||||
pdf_filename,
|
|
||||||
page_base,
|
|
||||||
"-progress",
|
|
||||||
],
|
|
||||||
error_message="Conversion from PDF to PPM failed",
|
|
||||||
timeout_message=(
|
|
||||||
f"Error converting from PDF to PPM, pdftoppm timed out after {timeout}"
|
|
||||||
" seconds"
|
|
||||||
),
|
|
||||||
stderr_callback=pdftoppm_progress_callback,
|
|
||||||
timeout=timeout,
|
|
||||||
)
|
|
||||||
|
|
||||||
final_files = (
|
|
||||||
glob.glob("/tmp/page-*.rgb")
|
|
||||||
+ glob.glob("/tmp/page-*.width")
|
|
||||||
+ glob.glob("/tmp/page-*.height")
|
|
||||||
)
|
|
||||||
|
|
||||||
# XXX: Sanity check to avoid situations like #560.
|
|
||||||
if not running_on_qubes() and len(final_files) != 3 * num_pages:
|
|
||||||
raise errors.PageCountMismatch()
|
|
||||||
|
|
||||||
# Move converted files into /tmp/dangerzone
|
|
||||||
for filename in final_files:
|
|
||||||
shutil.move(filename, "/tmp/dangerzone")
|
|
||||||
|
|
||||||
self.update_progress("Converted document to pixels")
|
|
||||||
|
|
||||||
async def install_libreoffice_ext(self, libreoffice_ext: str) -> None:
|
async def install_libreoffice_ext(self, libreoffice_ext: str) -> None:
|
||||||
self.update_progress(f"Installing LibreOffice extension '{libreoffice_ext}'")
|
self.update_progress(f"Installing LibreOffice extension '{libreoffice_ext}'")
|
||||||
unzip_args = [
|
unzip_args = [
|
||||||
|
|
Loading…
Reference in a new issue