Perform on-host pixels to PDF conversion

Extend the base isolation provider to immediately convert each page to a PDF, and optionally use OCR. In contrast with the way we did things previously, there are no longer two separate stages (document to pixels, pixels to PDF). We now handle each page individually, for two main reasons: 1. We don't want to buffer pixel data, either on disk or in memory, since it takes a lot of space and can potentially leave traces. 2. We can perform these operations in parallel, saving time. This is more evident when OCR is not used, where the times to convert a page to pixels and then back to a PDF are comparable.
2025-04-29 02:12:36 +02:00 · 2024-03-14 10:43:19 +02:00 · 2024-03-14 10:43:19 +02:00 · 137f21da8d
commit 137f21da8d
parent cde8ee70bb
1 changed files with 63 additions and 16 deletions
--- a/dangerzone/isolation_provider/base.py
+++ b/dangerzone/isolation_provider/base.py
@ -10,12 +10,13 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import IO, Callable, Iterator, Optional
 import fitz
 from colorama import Fore, Style
 from ..conversion import errors
-from ..conversion.common import INT_BYTES
+from ..conversion.common import DEFAULT_DPI, INT_BYTES
 from ..document import Document
-from ..util import replace_control_chars
+from ..util import get_tessdata_dir, replace_control_chars
 log = logging.getLogger(__name__)
@ -111,8 +112,7 @@ class IsolationProvider(ABC):
            with tempfile.TemporaryDirectory() as t:
                Path(f"{t}/pixels").mkdir()
                with self.doc_to_pixels_proc(document) as conversion_proc:
-                    self.doc_to_pixels(document, t, conversion_proc)
+                    self._convert(document, t, ocr_lang, conversion_proc)
                self.pixels_to_pdf(document, t, ocr_lang)
            document.mark_as_safe()
            if document.archive_after_conversion:
                document.archive()
@ -126,8 +126,42 @@ class IsolationProvider(ABC):
            self.print_progress(document, True, str(e), 0)
            document.mark_as_failed()
-    def doc_to_pixels(
+    def _pixels_to_pdf(
-        self, document: Document, tempdir: str, p: subprocess.Popen
+        self,
        untrusted_data: bytes,
        untrusted_width: int,
        untrusted_height: int,
        ocr_lang: Optional[str],
    ) -> fitz.Document:
        """Convert a byte array of RGB pixels into a PDF page, optionally with OCR."""
        pixmap = fitz.Pixmap(
            fitz.Colorspace(fitz.CS_RGB),
            untrusted_width,
            untrusted_height,
            untrusted_data,
            False,
        )
        pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)
        if ocr_lang:  # OCR the document
            page_pdf_bytes = pixmap.pdfocr_tobytes(
                compress=True,
                language=ocr_lang,
                tessdata=get_tessdata_dir(),
            )
        else:  # Don't OCR
            page_doc = fitz.Document()
            page_doc.insert_file(pixmap)
            page_pdf_bytes = page_doc.tobytes(deflate_images=True)
        return fitz.open("pdf", page_pdf_bytes)
    def _convert(
        self,
        document: Document,
        tempdir: str,
        ocr_lang: Optional[str],
        p: subprocess.Popen,
    ) -> None:
        percentage = 0.0
        with open(document.input_filename, "rb") as f:
@ -142,10 +176,13 @@ class IsolationProvider(ABC):
            n_pages = read_int(p.stdout)
            if n_pages == 0 or n_pages > errors.MAX_PAGES:
                raise errors.MaxPagesException()
-            percentage_per_page = 49.0 / n_pages
+            step = 100 / n_pages / 2
            safe_doc = fitz.Document()
            for page in range(1, n_pages + 1):
                text = f"Converting page {page}/{n_pages} to pixels"
                percentage += step
                self.print_progress(document, False, text, percentage)
                width = read_int(p.stdout)
@ -161,21 +198,31 @@ class IsolationProvider(ABC):
                    num_pixels,
                )
-                # Wrapper code
+                if ocr_lang:
-                with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width:
+                    text = (
-                    f_width.write(str(width))
+                        f"Converting page {page}/{n_pages} from pixels to"
-                with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height:
+                        " searchable PDF"
-                    f_height.write(str(height))
+                    )
-                with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb:
+                else:
-                    f_rgb.write(untrusted_pixels)
+                    text = f"Converting page {page}/{n_pages} from pixels to PDF"
                percentage += step
                self.print_progress(document, False, text, percentage)
-                percentage += percentage_per_page
+                page_pdf = self._pixels_to_pdf(
                    untrusted_pixels,
                    width,
                    height,
                    ocr_lang,
                )
                safe_doc.insert_pdf(page_pdf)
        # Ensure nothing else is read after all bitmaps are obtained
        p.stdout.close()
        safe_doc.save(document.output_filename)
        # TODO handle leftover code input
-        text = "Converted document to pixels"
+        text = "Converted document"
        self.print_progress(document, False, text, percentage)
        if getattr(sys, "dangerzone_dev", False):