Perform on-host pixels to PDF conversion

Extend the base isolation provider to immediately convert each page to
a PDF, and optionally use OCR. In contrast with the way we did things
previously, there are no longer two separate stages (document to pixels,
pixels to PDF). We now handle each page individually, for two main
reasons:

1. We don't want to buffer pixel data, either on disk or in memory,
   since it takes a lot of space, and can potentially leave traces.
2. We can perform these operations in parallel, saving time. This is
   more evident when OCR is not used, where the times to convert a page
   to pixels and then back to a PDF are comparable.
This commit is contained in:
Alex Pyrgiotis 2024-03-14 10:43:19 +02:00
parent 08f5ef6558
commit e34c36f7bc
No known key found for this signature in database
GPG key ID: B6C15EBA0357C9AA
3 changed files with 71 additions and 27 deletions

View file

@ -5,17 +5,17 @@ import platform
import signal import signal
import subprocess import subprocess
import sys import sys
import tempfile
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path from pathlib import Path
from typing import IO, Callable, Iterator, Optional from typing import IO, Callable, Iterator, Optional
import fitz
from colorama import Fore, Style from colorama import Fore, Style
from ..conversion import errors from ..conversion import errors
from ..conversion.common import INT_BYTES from ..conversion.common import DEFAULT_DPI, INT_BYTES
from ..document import Document from ..document import Document
from ..util import replace_control_chars from ..util import get_tessdata_dir, replace_control_chars
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -108,11 +108,8 @@ class IsolationProvider(ABC):
self.progress_callback = progress_callback self.progress_callback = progress_callback
document.mark_as_converting() document.mark_as_converting()
try: try:
with tempfile.TemporaryDirectory() as t: with self.doc_to_pixels_proc(document) as conversion_proc:
Path(f"{t}/pixels").mkdir() self.convert_with_proc(document, ocr_lang, conversion_proc)
with self.doc_to_pixels_proc(document) as conversion_proc:
self.doc_to_pixels(document, t, conversion_proc)
self.pixels_to_pdf(document, t, ocr_lang)
document.mark_as_safe() document.mark_as_safe()
if document.archive_after_conversion: if document.archive_after_conversion:
document.archive() document.archive()
@ -126,8 +123,45 @@ class IsolationProvider(ABC):
self.print_progress(document, True, str(e), 0) self.print_progress(document, True, str(e), 0)
document.mark_as_failed() document.mark_as_failed()
def doc_to_pixels( def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes:
self, document: Document, tempdir: str, p: subprocess.Popen """Get a single page as pixels, OCR it, and return a PDF as bytes."""
return pixmap.pdfocr_tobytes(
compress=True,
language=ocr_lang,
tessdata=str(get_tessdata_dir()),
)
def pixels_to_pdf_page(
    self,
    untrusted_data: bytes,
    untrusted_width: int,
    untrusted_height: int,
    ocr_lang: Optional[str],
) -> fitz.Document:
    """Turn one page of raw RGB pixel data into a single-page PDF document.

    The pixel buffer and its dimensions are wrapped in a PyMuPDF pixmap whose
    resolution is set to ``DEFAULT_DPI`` on both axes. When ``ocr_lang`` is
    truthy the pixmap is handed to ``self.ocr_page``; otherwise it is inserted
    into a fresh, empty PyMuPDF document which is serialized with deflated
    images. Either way the resulting PDF bytes are re-opened and returned as
    a ``fitz.Document``.
    """
    rgb_colorspace = fitz.Colorspace(fitz.CS_RGB)
    # The trailing False is the pixmap's alpha flag in PyMuPDF's
    # (colorspace, width, height, samples, alpha) constructor.
    pixmap = fitz.Pixmap(
        rgb_colorspace,
        untrusted_width,
        untrusted_height,
        untrusted_data,
        False,
    )
    pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)

    if not ocr_lang:
        # No OCR requested: embed the pixmap as-is in a new document.
        plain_doc = fitz.Document()
        plain_doc.insert_file(pixmap)
        pdf_bytes = plain_doc.tobytes(deflate_images=True)
    else:
        # OCR requested: the helper returns the page as PDF bytes.
        pdf_bytes = self.ocr_page(pixmap, ocr_lang)

    return fitz.open("pdf", pdf_bytes)
def convert_with_proc(
self,
document: Document,
ocr_lang: Optional[str],
p: subprocess.Popen,
) -> None: ) -> None:
percentage = 0.0 percentage = 0.0
with open(document.input_filename, "rb") as f: with open(document.input_filename, "rb") as f:
@ -142,10 +176,15 @@ class IsolationProvider(ABC):
n_pages = read_int(p.stdout) n_pages = read_int(p.stdout)
if n_pages == 0 or n_pages > errors.MAX_PAGES: if n_pages == 0 or n_pages > errors.MAX_PAGES:
raise errors.MaxPagesException() raise errors.MaxPagesException()
percentage_per_page = 49.0 / n_pages step = 100 / n_pages
safe_doc = fitz.Document()
for page in range(1, n_pages + 1): for page in range(1, n_pages + 1):
text = f"Converting page {page}/{n_pages} to pixels" searchable = "searchable " if ocr_lang else ""
text = (
f"Converting page {page}/{n_pages} from pixels to {searchable}PDF"
)
self.print_progress(document, False, text, percentage) self.print_progress(document, False, text, percentage)
width = read_int(p.stdout) width = read_int(p.stdout)
@ -161,22 +200,27 @@ class IsolationProvider(ABC):
num_pixels, num_pixels,
) )
# Wrapper code page_pdf = self.pixels_to_pdf_page(
with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width: untrusted_pixels,
f_width.write(str(width)) width,
with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height: height,
f_height.write(str(height)) ocr_lang,
with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb: )
f_rgb.write(untrusted_pixels) safe_doc.insert_pdf(page_pdf)
percentage += percentage_per_page percentage += step
# Ensure nothing else is read after all bitmaps are obtained # Ensure nothing else is read after all bitmaps are obtained
p.stdout.close() p.stdout.close()
# Saving it with a different name first, because PyMuPDF cannot handle
# non-Unicode chars.
safe_doc.save(document.sanitized_output_filename)
os.replace(document.sanitized_output_filename, document.output_filename)
# TODO handle leftover code input # TODO handle leftover code input
text = "Converted document to pixels" text = "Successfully converted document"
self.print_progress(document, False, text, percentage) self.print_progress(document, False, text, 100)
if getattr(sys, "dangerzone_dev", False): if getattr(sys, "dangerzone_dev", False):
assert p.stderr assert p.stderr

View file

@ -29,7 +29,7 @@ class IsolationProviderTest:
p = provider.start_doc_to_pixels_proc(doc) p = provider.start_doc_to_pixels_proc(doc)
with pytest.raises(errors.ConverterProcException): with pytest.raises(errors.ConverterProcException):
provider.doc_to_pixels(doc, tmpdir, p) provider.convert_with_proc(doc, None, p)
assert provider.get_proc_exception(p) == errors.MaxPagesException assert provider.get_proc_exception(p) == errors.MaxPagesException
def test_max_pages_client_enforcement( def test_max_pages_client_enforcement(
@ -46,7 +46,7 @@ class IsolationProviderTest:
doc = Document(sample_doc) doc = Document(sample_doc)
p = provider.start_doc_to_pixels_proc(doc) p = provider.start_doc_to_pixels_proc(doc)
with pytest.raises(errors.MaxPagesException): with pytest.raises(errors.MaxPagesException):
provider.doc_to_pixels(doc, tmpdir, p) provider.convert_with_proc(doc, None, p)
def test_max_dimensions( def test_max_dimensions(
self, self,
@ -60,12 +60,12 @@ class IsolationProviderTest:
doc = Document(sample_bad_width) doc = Document(sample_bad_width)
p = provider.start_doc_to_pixels_proc(doc) p = provider.start_doc_to_pixels_proc(doc)
with pytest.raises(errors.MaxPageWidthException): with pytest.raises(errors.MaxPageWidthException):
provider.doc_to_pixels(doc, tmpdir, p) provider.convert_with_proc(doc, None, p)
doc = Document(sample_bad_height) doc = Document(sample_bad_height)
p = provider.start_doc_to_pixels_proc(doc) p = provider.start_doc_to_pixels_proc(doc)
with pytest.raises(errors.MaxPageHeightException): with pytest.raises(errors.MaxPageHeightException):
provider.doc_to_pixels(doc, tmpdir, p) provider.convert_with_proc(doc, None, p)
class IsolationProviderTermination: class IsolationProviderTermination:

View file

@ -79,7 +79,7 @@ class TestQubes(IsolationProviderTest):
) )
with pytest.raises(errors.ConverterProcException): with pytest.raises(errors.ConverterProcException):
doc = Document(sample_doc) doc = Document(sample_doc)
provider.doc_to_pixels(doc, tmpdir, proc) provider.convert_with_proc(doc, None, proc)
assert provider.get_proc_exception(proc) == errors.QubesQrexecFailed assert provider.get_proc_exception(proc) == errors.QubesQrexecFailed