diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py
index 0a46ce0..9bf980b 100644
--- a/dangerzone/isolation_provider/base.py
+++ b/dangerzone/isolation_provider/base.py
@@ -5,17 +5,17 @@ import platform
 import signal
 import subprocess
 import sys
-import tempfile
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import IO, Callable, Iterator, Optional
 
+import fitz
 from colorama import Fore, Style
 
 from ..conversion import errors
-from ..conversion.common import INT_BYTES
+from ..conversion.common import DEFAULT_DPI, INT_BYTES
 from ..document import Document
-from ..util import replace_control_chars
+from ..util import get_tessdata_dir, replace_control_chars
 
 log = logging.getLogger(__name__)
 
@@ -108,11 +108,8 @@ class IsolationProvider(ABC):
         self.progress_callback = progress_callback
         document.mark_as_converting()
         try:
-            with tempfile.TemporaryDirectory() as t:
-                Path(f"{t}/pixels").mkdir()
-                with self.doc_to_pixels_proc(document) as conversion_proc:
-                    self.doc_to_pixels(document, t, conversion_proc)
-                self.pixels_to_pdf(document, t, ocr_lang)
+            with self.doc_to_pixels_proc(document) as conversion_proc:
+                self.convert_with_proc(document, ocr_lang, conversion_proc)
             document.mark_as_safe()
             if document.archive_after_conversion:
                 document.archive()
@@ -126,8 +123,45 @@ class IsolationProvider(ABC):
             self.print_progress(document, True, str(e), 0)
             document.mark_as_failed()
 
-    def doc_to_pixels(
-        self, document: Document, tempdir: str, p: subprocess.Popen
+    def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes:
+        """Get a single page as pixels, OCR it, and return a PDF as bytes."""
+        return pixmap.pdfocr_tobytes(
+            compress=True,
+            language=ocr_lang,
+            tessdata=str(get_tessdata_dir()),
+        )
+
+    def pixels_to_pdf_page(
+        self,
+        untrusted_data: bytes,
+        untrusted_width: int,
+        untrusted_height: int,
+        ocr_lang: Optional[str],
+    ) -> fitz.Document:
+        """Convert a byte array of RGB pixels into a PDF page, optionally with OCR."""
+        pixmap = fitz.Pixmap(
+            fitz.Colorspace(fitz.CS_RGB),
+            untrusted_width,
+            untrusted_height,
+            untrusted_data,
+            False,
+        )
+        pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)
+
+        if ocr_lang:  # OCR the document
+            page_pdf_bytes = self.ocr_page(pixmap, ocr_lang)
+        else:  # Don't OCR
+            page_doc = fitz.Document()
+            page_doc.insert_file(pixmap)
+            page_pdf_bytes = page_doc.tobytes(deflate_images=True)
+
+        return fitz.open("pdf", page_pdf_bytes)
+
+    def convert_with_proc(
+        self,
+        document: Document,
+        ocr_lang: Optional[str],
+        p: subprocess.Popen,
     ) -> None:
         percentage = 0.0
         with open(document.input_filename, "rb") as f:
@@ -142,10 +176,15 @@ class IsolationProvider(ABC):
             n_pages = read_int(p.stdout)
             if n_pages == 0 or n_pages > errors.MAX_PAGES:
                 raise errors.MaxPagesException()
-            percentage_per_page = 49.0 / n_pages
+            step = 100 / n_pages
+
+            safe_doc = fitz.Document()
 
             for page in range(1, n_pages + 1):
-                text = f"Converting page {page}/{n_pages} to pixels"
+                searchable = "searchable " if ocr_lang else ""
+                text = (
+                    f"Converting page {page}/{n_pages} from pixels to {searchable}PDF"
+                )
                 self.print_progress(document, False, text, percentage)
 
                 width = read_int(p.stdout)
@@ -161,22 +200,27 @@ class IsolationProvider(ABC):
                     num_pixels,
                 )
 
-                # Wrapper code
-                with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width:
-                    f_width.write(str(width))
-                with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height:
-                    f_height.write(str(height))
-                with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb:
-                    f_rgb.write(untrusted_pixels)
+                page_pdf = self.pixels_to_pdf_page(
+                    untrusted_pixels,
+                    width,
+                    height,
+                    ocr_lang,
+                )
+                safe_doc.insert_pdf(page_pdf)
 
-                percentage += percentage_per_page
+                percentage += step
 
             # Ensure nothing else is read after all bitmaps are obtained
             p.stdout.close()
 
+            # Saving it with a different name first, because PyMuPDF cannot handle
+            # non-Unicode chars.
+            safe_doc.save(document.sanitized_output_filename)
+            os.replace(document.sanitized_output_filename, document.output_filename)
+
             # TODO handle leftover code input
-            text = "Converted document to pixels"
-            self.print_progress(document, False, text, percentage)
+            text = "Successfully converted document"
+            self.print_progress(document, False, text, 100)
 
             if getattr(sys, "dangerzone_dev", False):
                 assert p.stderr
diff --git a/tests/isolation_provider/base.py b/tests/isolation_provider/base.py
index abf4889..6ba76ce 100644
--- a/tests/isolation_provider/base.py
+++ b/tests/isolation_provider/base.py
@@ -29,7 +29,7 @@ class IsolationProviderTest:
 
         p = provider.start_doc_to_pixels_proc(doc)
         with pytest.raises(errors.ConverterProcException):
-            provider.doc_to_pixels(doc, tmpdir, p)
+            provider.convert_with_proc(doc, None, p)
             assert provider.get_proc_exception(p) == errors.MaxPagesException
 
     def test_max_pages_client_enforcement(
@@ -46,7 +46,7 @@ class IsolationProviderTest:
         doc = Document(sample_doc)
         p = provider.start_doc_to_pixels_proc(doc)
         with pytest.raises(errors.MaxPagesException):
-            provider.doc_to_pixels(doc, tmpdir, p)
+            provider.convert_with_proc(doc, None, p)
 
     def test_max_dimensions(
         self,
@@ -60,12 +60,12 @@ class IsolationProviderTest:
         doc = Document(sample_bad_width)
         p = provider.start_doc_to_pixels_proc(doc)
         with pytest.raises(errors.MaxPageWidthException):
-            provider.doc_to_pixels(doc, tmpdir, p)
+            provider.convert_with_proc(doc, None, p)
 
         doc = Document(sample_bad_height)
         p = provider.start_doc_to_pixels_proc(doc)
         with pytest.raises(errors.MaxPageHeightException):
-            provider.doc_to_pixels(doc, tmpdir, p)
+            provider.convert_with_proc(doc, None, p)
 
 
 class IsolationProviderTermination:
diff --git a/tests/isolation_provider/test_qubes.py b/tests/isolation_provider/test_qubes.py
index 77ea939..756f9be 100644
--- a/tests/isolation_provider/test_qubes.py
+++ b/tests/isolation_provider/test_qubes.py
@@ -79,7 +79,7 @@ class TestQubes(IsolationProviderTest):
         )
         with pytest.raises(errors.ConverterProcException):
             doc = Document(sample_doc)
-            provider.doc_to_pixels(doc, tmpdir, proc)
+            provider.convert_with_proc(doc, None, proc)
             assert provider.get_proc_exception(proc) == errors.QubesQrexecFailed
 
 