From c15cbd65c684109e0a09c9063e81af5adc5ba8cc Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 31 Oct 2023 13:20:43 +0000 Subject: [PATCH] Save RGB as PNG immediately to save space Storing all RGB files on the host was leading to a fast-filling `/tmp`. This solution essentially converts all the RGB files to PNGs (which are compressed) saving valuable space in the process. This conversion is made with the Pillow (PIL) module, without the need for any external dependencies. Fixes #526 --- dangerzone/conversion/errors.py | 5 ++ dangerzone/conversion/pixels_to_pdf.py | 54 +++++----------------- dangerzone/isolation_provider/base.py | 22 ++++++++- dangerzone/isolation_provider/container.py | 25 +++++++++- dangerzone/isolation_provider/qubes.py | 15 +++--- 5 files changed, 66 insertions(+), 55 deletions(-) diff --git a/dangerzone/conversion/errors.py b/dangerzone/conversion/errors.py index 2aeb38d..6f5753c 100644 --- a/dangerzone/conversion/errors.py +++ b/dangerzone/conversion/errors.py @@ -104,6 +104,11 @@ class PDFtoPPMInvalidDepth(PDFtoPPMException): error_message = "Error converting PDF to Pixels (Invalid PPM depth)" +class PPMtoPNGError(ConversionException): + error_code = ERROR_SHIFT + 55 + error_message = "Document page could not be reassembled from individual pixels" + + class InterruptedConversion(ConversionException): """Protocol received num of bytes different than expected""" diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 56b1d66..c60ee81 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -18,33 +18,22 @@ from .common import DangerzoneConverter, running_on_qubes class PixelsToPDF(DangerzoneConverter): async def convert( - self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None + self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = "/tmp" ) -> None: self.percentage = 50.0 - if tempdir is None: - tempdir = "/tmp" - num_pages =
len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb")) + num_pages = len(glob.glob(f"{tempdir}/page-*.png")) total_size = 0.0 # Convert RGB files to PDF files percentage_per_page = 45.0 / num_pages for page in range(1, num_pages + 1): - filename_base = f"{tempdir}/dangerzone/page-{page}" - rgb_filename = f"{filename_base}.rgb" - width_filename = f"{filename_base}.width" - height_filename = f"{filename_base}.height" - png_filename = f"{tempdir}/page-{page}.png" - ocr_filename = f"{tempdir}/page-{page}" - pdf_filename = f"{tempdir}/page-{page}.pdf" - - with open(width_filename) as f: - width = f.read().strip() - with open(height_filename) as f: - height = f.read().strip() + filename_base = f"{tempdir}/page-{page}" + png_filename = f"{filename_base}.png" + pdf_filename = f"{filename_base}.pdf" # The first few operations happen on a per-page basis. - page_size = os.path.getsize(filename_base + ".rgb") / 1024**2 + page_size = os.path.getsize(png_filename) / 1024**2 total_size += page_size timeout = self.calculate_timeout(page_size, 1) @@ -52,29 +41,11 @@ class PixelsToPDF(DangerzoneConverter): self.update_progress( f"Converting page {page}/{num_pages} from pixels to searchable PDF" ) - await self.run_command( - [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"png:{png_filename}", - ], - error_message=f"Page {page}/{num_pages} conversion to PNG failed", - timeout_message=( - "Error converting pixels to PNG, convert timed out after" - f" {timeout} seconds" - ), - timeout=timeout, - ) await self.run_command( [ "tesseract", png_filename, - ocr_filename, + filename_base, "-l", ocr_lang, "--dpi", @@ -97,11 +68,7 @@ class PixelsToPDF(DangerzoneConverter): [ "gm", "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", + f"png:{png_filename}", f"pdf:{pdf_filename}", ], error_message=f"Page {page}/{num_pages} conversion to PDF failed", @@ -112,6 +79,9 @@ class PixelsToPDF(DangerzoneConverter): 
timeout=timeout, ) + # remove PNG file when it is no longer needed + os.remove(png_filename) + self.percentage += percentage_per_page # Next operations apply to the all the pages, so we need to recalculate the @@ -165,7 +135,7 @@ async def main() -> int: converter = PixelsToPDF() try: - await converter.convert(ocr_lang) + await converter.convert(ocr_lang, tempdir="/tmp/dangerzone") error_code = 0 # Success! except (RuntimeError, TimeoutError, ValueError) as e: diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 6265146..9efd439 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -1,11 +1,14 @@ +import io import logging import subprocess from abc import ABC, abstractmethod +from pathlib import Path from typing import Callable, Optional from colorama import Fore, Style +from PIL import Image, UnidentifiedImageError -from ..conversion.errors import ConversionException +from ..conversion import errors from ..document import Document from ..util import replace_control_chars @@ -37,7 +40,7 @@ class IsolationProvider(ABC): document.mark_as_converting() try: success = self._convert(document, ocr_lang) - except ConversionException as e: + except errors.ConversionException as e: success = False self.print_progress_trusted(document, True, str(e), 0) except Exception as e: @@ -101,6 +104,21 @@ class IsolationProvider(ABC): armor_end = DOC_TO_PIXELS_LOG_END return armor_start + conversion_string + armor_end + def convert_pixels_to_png( + self, tempdir: str, page: int, width: int, height: int, rgb_data: bytes + ) -> None: + """ + Reconstruct PPM files and save as PNG to save space + """ + ppm_header = f"P6\n{width} {height}\n255\n".encode() + ppm_data = io.BytesIO(ppm_header + rgb_data) + png_path = Path(tempdir) / f"page-{page}.png" + + try: + Image.open(ppm_data).save(png_path, "PNG") + except UnidentifiedImageError as e: + raise errors.PPMtoPNGError() from e + # From global_common: diff 
--git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 61d641b..9235c7f 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -1,3 +1,4 @@ +import glob import gzip import json import logging @@ -11,7 +12,7 @@ import sys import tempfile from typing import Any, Callable, List, Optional, Tuple -from ..conversion.errors import exception_from_error_code +from ..conversion import errors from ..document import Document from ..util import ( get_resource_path, @@ -304,11 +305,31 @@ class Container(IsolationProvider): f"Conversion output (doc to pixels):\n{self.sanitize_conversion_str(untrusted_log)}" ) + num_pages = len(glob.glob(f"{pixel_dir}/page-*.rgb")) + for page in range(1, num_pages + 1): + filename_base = f"{pixel_dir}/page-{page}" + rgb_filename = f"{filename_base}.rgb" + width_filename = f"{filename_base}.width" + height_filename = f"{filename_base}.height" + with open(width_filename) as f: + width = int(f.read().strip()) + with open(height_filename) as f: + height = int(f.read().strip()) + with open(rgb_filename, "rb") as rgb_f: + untrusted_pixels = rgb_f.read() + self.convert_pixels_to_png( + str(pixel_dir), page, width, height, rgb_data=untrusted_pixels + ) + + os.remove(rgb_filename) + os.remove(width_filename) + os.remove(height_filename) + if ret != 0: log.error("documents-to-pixels failed") # XXX Reconstruct exception from error code - raise exception_from_error_code(ret) # type: ignore [misc] + raise errors.exception_from_error_code(ret) # type: ignore [misc] else: # TODO: validate convert to pixels output diff --git a/dangerzone/isolation_provider/qubes.py b/dangerzone/isolation_provider/qubes.py index e06b79d..9dc15e3 100644 --- a/dangerzone/isolation_provider/qubes.py +++ b/dangerzone/isolation_provider/qubes.py @@ -76,7 +76,6 @@ class Qubes(IsolationProvider): ) -> bool: success = False - Path(f"{tempdir}/dangerzone").mkdir() percentage = 0.0 with 
open(document.input_filename, "rb") as f: @@ -122,13 +121,9 @@ class Qubes(IsolationProvider): timeout=sw.remaining, ) - # Wrapper code - with open(f"{tempdir}/dangerzone/page-{page}.width", "w") as f_width: - f_width.write(str(width)) - with open(f"{tempdir}/dangerzone/page-{page}.height", "w") as f_height: - f_height.write(str(height)) - with open(f"{tempdir}/dangerzone/page-{page}.rgb", "wb") as f_rgb: - f_rgb.write(untrusted_pixels) + self.convert_pixels_to_png( + tempdir, page, width, height, rgb_data=untrusted_pixels + ) percentage += percentage_per_page @@ -165,7 +160,9 @@ class Qubes(IsolationProvider): ) log.info(text) - shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename) + shutil.move( + Path(tempdir) / "safe-output-compressed.pdf", document.output_filename + ) success = True return success