diff --git a/dangerzone/conversion/errors.py b/dangerzone/conversion/errors.py index 2aeb38d..6f5753c 100644 --- a/dangerzone/conversion/errors.py +++ b/dangerzone/conversion/errors.py @@ -104,6 +104,11 @@ class PDFtoPPMInvalidDepth(PDFtoPPMException): error_message = "Error converting PDF to Pixels (Invalid PPM depth)" +class PPMtoPNGError(ConversionException): + error_code = ERROR_SHIFT + 55 + error_message = "Document page could not be reassembled from individual pixels" + + class InterruptedConversion(ConversionException): """Protocol received num of bytes different than expected""" diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 56b1d66..c60ee81 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -18,33 +18,22 @@ from .common import DangerzoneConverter, running_on_qubes class PixelsToPDF(DangerzoneConverter): async def convert( - self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None + self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = "/tmp" ) -> None: self.percentage = 50.0 - if tempdir is None: - tempdir = "/tmp" - num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb")) + num_pages = len(glob.glob(f"{tempdir}/page-*.png")) total_size = 0.0 # Convert RGB files to PDF files percentage_per_page = 45.0 / num_pages for page in range(1, num_pages + 1): - filename_base = f"{tempdir}/dangerzone/page-{page}" - rgb_filename = f"{filename_base}.rgb" - width_filename = f"{filename_base}.width" - height_filename = f"{filename_base}.height" - png_filename = f"{tempdir}/page-{page}.png" - ocr_filename = f"{tempdir}/page-{page}" - pdf_filename = f"{tempdir}/page-{page}.pdf" - - with open(width_filename) as f: - width = f.read().strip() - with open(height_filename) as f: - height = f.read().strip() + filename_base = f"{tempdir}/page-{page}" + png_filename = f"{filename_base}.png" + pdf_filename = f"{filename_base}.pdf" # The first few operations happen on a per-page basis. - page_size = os.path.getsize(filename_base + ".rgb") / 1024**2 + page_size = os.path.getsize(png_filename) / 1024**2 total_size += page_size timeout = self.calculate_timeout(page_size, 1) @@ -52,29 +41,11 @@ class PixelsToPDF(DangerzoneConverter): self.update_progress( f"Converting page {page}/{num_pages} from pixels to searchable PDF" ) - await self.run_command( - [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"png:{png_filename}", - ], - error_message=f"Page {page}/{num_pages} conversion to PNG failed", - timeout_message=( - "Error converting pixels to PNG, convert timed out after" - f" {timeout} seconds" - ), - timeout=timeout, - ) await self.run_command( [ "tesseract", png_filename, - ocr_filename, + filename_base, "-l", ocr_lang, "--dpi", @@ -97,11 +68,7 @@ class PixelsToPDF(DangerzoneConverter): [ "gm", "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", + f"png:{png_filename}", f"pdf:{pdf_filename}", ], error_message=f"Page {page}/{num_pages} conversion to PDF failed", @@ -112,6 +79,9 @@ class PixelsToPDF(DangerzoneConverter): timeout=timeout, ) + # remove PNG file when it is no longer needed + os.remove(png_filename) + self.percentage += percentage_per_page # Next operations apply to the all the pages, so we need to recalculate the @@ -165,7 +135,7 @@ async def main() -> int: converter = PixelsToPDF() try: - await converter.convert(ocr_lang) + await converter.convert(ocr_lang, tempdir="/tmp/dangerzone") error_code = 0 # Success! except (RuntimeError, TimeoutError, ValueError) as e: diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 6265146..9efd439 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -1,11 +1,14 @@ +import io import logging import subprocess from abc import ABC, abstractmethod +from pathlib import Path from typing import Callable, Optional from colorama import Fore, Style +from PIL import Image, UnidentifiedImageError -from ..conversion.errors import ConversionException +from ..conversion import errors from ..document import Document from ..util import replace_control_chars @@ -37,7 +40,7 @@ class IsolationProvider(ABC): document.mark_as_converting() try: success = self._convert(document, ocr_lang) - except ConversionException as e: + except errors.ConversionException as e: success = False self.print_progress_trusted(document, True, str(e), 0) except Exception as e: @@ -101,6 +104,21 @@ class IsolationProvider(ABC): armor_end = DOC_TO_PIXELS_LOG_END return armor_start + conversion_string + armor_end + def convert_pixels_to_png( + self, tempdir: str, page: int, width: int, height: int, rgb_data: bytes + ) -> None: + """ + Reconstruct PPM files and save as PNG to save space + """ + ppm_header = f"P6\n{width} {height}\n255\n".encode() + ppm_data = io.BytesIO(ppm_header + rgb_data) + png_path = Path(tempdir) / f"page-{page}.png" + + try: + Image.open(ppm_data).save(png_path, "PNG") + except UnidentifiedImageError as e: + raise errors.PPMtoPNGError() from e + # From global_common: diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 61d641b..9235c7f 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -1,3 +1,4 @@ +import glob import gzip import json import logging @@ -11,7 +12,7 @@ import sys import tempfile from typing import Any, Callable, List, Optional, Tuple -from ..conversion.errors import exception_from_error_code +from ..conversion import errors from ..document import Document from ..util import ( get_resource_path, @@ -304,11 +305,31 @@ class Container(IsolationProvider): f"Conversion output (doc to pixels):\n{self.sanitize_conversion_str(untrusted_log)}" ) + num_pages = len(glob.glob(f"{pixel_dir}/page-*.rgb")) + for page in range(1, num_pages + 1): + filename_base = f"{pixel_dir}/page-{page}" + rgb_filename = f"{filename_base}.rgb" + width_filename = f"{filename_base}.width" + height_filename = f"{filename_base}.height" + with open(width_filename) as f: + width = int(f.read().strip()) + with open(height_filename) as f: + height = int(f.read().strip()) + with open(rgb_filename, "rb") as rgb_f: + untrusted_pixels = rgb_f.read() + self.convert_pixels_to_png( + str(pixel_dir), page, width, height, rgb_data=untrusted_pixels + ) + + os.remove(rgb_filename) + os.remove(width_filename) + os.remove(height_filename) + if ret != 0: log.error("documents-to-pixels failed") # XXX Reconstruct exception from error code - raise exception_from_error_code(ret) # type: ignore [misc] + raise errors.exception_from_error_code(ret) # type: ignore [misc] else: # TODO: validate convert to pixels output diff --git a/dangerzone/isolation_provider/qubes.py b/dangerzone/isolation_provider/qubes.py index e06b79d..9dc15e3 100644 --- a/dangerzone/isolation_provider/qubes.py +++ b/dangerzone/isolation_provider/qubes.py @@ -76,7 +76,6 @@ class Qubes(IsolationProvider): ) -> bool: success = False - Path(f"{tempdir}/dangerzone").mkdir() percentage = 0.0 with open(document.input_filename, "rb") as f: @@ -122,13 +121,9 @@ class Qubes(IsolationProvider): timeout=sw.remaining, ) - # Wrapper code - with open(f"{tempdir}/dangerzone/page-{page}.width", "w") as f_width: - f_width.write(str(width)) - with open(f"{tempdir}/dangerzone/page-{page}.height", "w") as f_height: - f_height.write(str(height)) - with open(f"{tempdir}/dangerzone/page-{page}.rgb", "wb") as f_rgb: - f_rgb.write(untrusted_pixels) + self.convert_pixels_to_png( + tempdir, page, width, height, rgb_data=untrusted_pixels + ) percentage += percentage_per_page @@ -165,7 +160,9 @@ class Qubes(IsolationProvider): ) log.info(text) - shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename) + shutil.move( + Path(tempdir) / "safe-output-compressed.pdf", document.output_filename + ) success = True return success