From 137f21da8d6d92a519b4d32bacdfdbe8ab6da22e Mon Sep 17 00:00:00 2001 From: Alex Pyrgiotis Date: Thu, 14 Mar 2024 10:43:19 +0200 Subject: [PATCH] Perform on-host pixels to PDF conversion Extend the base isolation provider to immediately convert each page to a PDF, and optionally use OCR. In contrast with the way we did things previously, there are no longer two separate stages (document to pixels, pixels to PDF). We now handle each page individually, for two main reasons: 1. We don't want to buffer pixel data, either on disk or in memory, since it takes a lot of space, and can potentially leave traces. 2. We can perform these operations in parallel, saving time. This is more evident when OCR is not used, where the times to convert a page to pixels, and then back to a PDF, are comparable. --- dangerzone/isolation_provider/base.py | 79 +++++++++++++++++++++------ 1 file changed, 63 insertions(+), 16 deletions(-) diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 0a46ce0..b66162b 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -10,12 +10,13 @@ from abc import ABC, abstractmethod from pathlib import Path from typing import IO, Callable, Iterator, Optional +import fitz from colorama import Fore, Style from ..conversion import errors -from ..conversion.common import INT_BYTES +from ..conversion.common import DEFAULT_DPI, INT_BYTES from ..document import Document -from ..util import replace_control_chars +from ..util import get_tessdata_dir, replace_control_chars log = logging.getLogger(__name__) @@ -111,8 +112,7 @@ class IsolationProvider(ABC): with tempfile.TemporaryDirectory() as t: Path(f"{t}/pixels").mkdir() with self.doc_to_pixels_proc(document) as conversion_proc: - self.doc_to_pixels(document, t, conversion_proc) - self.pixels_to_pdf(document, t, ocr_lang) + self._convert(document, t, ocr_lang, conversion_proc) document.mark_as_safe() if document.archive_after_conversion: 
document.archive() @@ -126,8 +126,42 @@ class IsolationProvider(ABC): self.print_progress(document, True, str(e), 0) document.mark_as_failed() - def doc_to_pixels( - self, document: Document, tempdir: str, p: subprocess.Popen + def _pixels_to_pdf( + self, + untrusted_data: bytes, + untrusted_width: int, + untrusted_height: int, + ocr_lang: Optional[str], + ) -> fitz.Document: + """Convert a byte array of RGB pixels into a PDF page, optionally with OCR.""" + pixmap = fitz.Pixmap( + fitz.Colorspace(fitz.CS_RGB), + untrusted_width, + untrusted_height, + untrusted_data, + False, + ) + pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI) + + if ocr_lang: # OCR the document + page_pdf_bytes = pixmap.pdfocr_tobytes( + compress=True, + language=ocr_lang, + tessdata=get_tessdata_dir(), + ) + else: # Don't OCR + page_doc = fitz.Document() + page_doc.insert_file(pixmap) + page_pdf_bytes = page_doc.tobytes(deflate_images=True) + + return fitz.open("pdf", page_pdf_bytes) + + def _convert( + self, + document: Document, + tempdir: str, + ocr_lang: Optional[str], + p: subprocess.Popen, ) -> None: percentage = 0.0 with open(document.input_filename, "rb") as f: @@ -142,10 +176,13 @@ class IsolationProvider(ABC): n_pages = read_int(p.stdout) if n_pages == 0 or n_pages > errors.MAX_PAGES: raise errors.MaxPagesException() - percentage_per_page = 49.0 / n_pages + step = 100 / n_pages / 2 + + safe_doc = fitz.Document() for page in range(1, n_pages + 1): text = f"Converting page {page}/{n_pages} to pixels" + percentage += step self.print_progress(document, False, text, percentage) width = read_int(p.stdout) @@ -161,21 +198,31 @@ class IsolationProvider(ABC): num_pixels, ) - # Wrapper code - with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width: - f_width.write(str(width)) - with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height: - f_height.write(str(height)) - with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb: - f_rgb.write(untrusted_pixels) + if ocr_lang: + text = 
( + f"Converting page {page}/{n_pages} from pixels to" + " searchable PDF" + ) + else: + text = f"Converting page {page}/{n_pages} from pixels to PDF" + percentage += step + self.print_progress(document, False, text, percentage) - percentage += percentage_per_page + page_pdf = self._pixels_to_pdf( + untrusted_pixels, + width, + height, + ocr_lang, + ) + safe_doc.insert_pdf(page_pdf) # Ensure nothing else is read after all bitmaps are obtained p.stdout.close() + safe_doc.save(document.output_filename) + # TODO handle leftover code input - text = "Converted document to pixels" + text = "Converted document" self.print_progress(document, False, text, percentage) if getattr(sys, "dangerzone_dev", False):