mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Perform on-host pixels to PDF conversion
Extend the base isolation provider to immediately convert each page to a PDF, and optionally use OCR. In contrast with the way we did things previously, there are no longer two separate stages (document to pixels, pixels to PDF). We now handle each page individually, for two main reasons: 1. We don't want to buffer pixel data, either on disk or in memory, since it takes a lot of space and can potentially leave traces. 2. We can perform these operations in parallel, saving time. This is more evident when OCR is not used, where the time to convert a page to pixels and the time to convert it back to a PDF are comparable.
This commit is contained in:
parent
08f5ef6558
commit
e34c36f7bc
3 changed files with 71 additions and 27 deletions
|
@ -5,17 +5,17 @@ import platform
|
|||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import IO, Callable, Iterator, Optional
|
||||
|
||||
import fitz
|
||||
from colorama import Fore, Style
|
||||
|
||||
from ..conversion import errors
|
||||
from ..conversion.common import INT_BYTES
|
||||
from ..conversion.common import DEFAULT_DPI, INT_BYTES
|
||||
from ..document import Document
|
||||
from ..util import replace_control_chars
|
||||
from ..util import get_tessdata_dir, replace_control_chars
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -108,11 +108,8 @@ class IsolationProvider(ABC):
|
|||
self.progress_callback = progress_callback
|
||||
document.mark_as_converting()
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as t:
|
||||
Path(f"{t}/pixels").mkdir()
|
||||
with self.doc_to_pixels_proc(document) as conversion_proc:
|
||||
self.doc_to_pixels(document, t, conversion_proc)
|
||||
self.pixels_to_pdf(document, t, ocr_lang)
|
||||
self.convert_with_proc(document, ocr_lang, conversion_proc)
|
||||
document.mark_as_safe()
|
||||
if document.archive_after_conversion:
|
||||
document.archive()
|
||||
|
@ -126,8 +123,45 @@ class IsolationProvider(ABC):
|
|||
self.print_progress(document, True, str(e), 0)
|
||||
document.mark_as_failed()
|
||||
|
||||
def doc_to_pixels(
|
||||
self, document: Document, tempdir: str, p: subprocess.Popen
|
||||
def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes:
    """OCR the pixels of one page and return the resulting PDF as bytes.

    The pixmap is passed to its own ``pdfocr_tobytes`` method with
    compression enabled, ``ocr_lang`` as the OCR language, and the
    directory returned by ``get_tessdata_dir()`` (stringified) as the
    tessdata location.
    """
    tessdata_dir = str(get_tessdata_dir())
    ocr_options = {
        "compress": True,
        "language": ocr_lang,
        "tessdata": tessdata_dir,
    }
    return pixmap.pdfocr_tobytes(**ocr_options)
|
||||
|
||||
def pixels_to_pdf_page(
    self,
    untrusted_data: bytes,
    untrusted_width: int,
    untrusted_height: int,
    ocr_lang: Optional[str],
) -> fitz.Document:
    """Turn a buffer of RGB pixels into a one-page PDF document.

    A pixmap is built from the untrusted buffer and dimensions and tagged
    with ``DEFAULT_DPI`` on both axes. When ``ocr_lang`` is truthy the page
    goes through ``self.ocr_page``; otherwise the pixmap is inserted into a
    fresh document and serialized with deflated images. In both cases the
    resulting PDF bytes are reopened and returned as a ``fitz.Document``.
    """
    rgb_colorspace = fitz.Colorspace(fitz.CS_RGB)
    # NOTE(review): the trailing False is presumably the "alpha" flag of the
    # Pixmap(colorspace, width, height, samples, alpha) constructor — confirm
    # against the PyMuPDF reference.
    pixmap = fitz.Pixmap(
        rgb_colorspace,
        untrusted_width,
        untrusted_height,
        untrusted_data,
        False,
    )
    pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)

    if not ocr_lang:
        # No OCR requested: wrap the raw pixmap in a new document.
        plain_doc = fitz.Document()
        plain_doc.insert_file(pixmap)
        pdf_bytes = plain_doc.tobytes(deflate_images=True)
    else:
        # OCR requested: let ocr_page produce the PDF bytes.
        pdf_bytes = self.ocr_page(pixmap, ocr_lang)

    return fitz.open("pdf", pdf_bytes)
|
||||
|
||||
def convert_with_proc(
|
||||
self,
|
||||
document: Document,
|
||||
ocr_lang: Optional[str],
|
||||
p: subprocess.Popen,
|
||||
) -> None:
|
||||
percentage = 0.0
|
||||
with open(document.input_filename, "rb") as f:
|
||||
|
@ -142,10 +176,15 @@ class IsolationProvider(ABC):
|
|||
n_pages = read_int(p.stdout)
|
||||
if n_pages == 0 or n_pages > errors.MAX_PAGES:
|
||||
raise errors.MaxPagesException()
|
||||
percentage_per_page = 49.0 / n_pages
|
||||
step = 100 / n_pages
|
||||
|
||||
safe_doc = fitz.Document()
|
||||
|
||||
for page in range(1, n_pages + 1):
|
||||
text = f"Converting page {page}/{n_pages} to pixels"
|
||||
searchable = "searchable " if ocr_lang else ""
|
||||
text = (
|
||||
f"Converting page {page}/{n_pages} from pixels to {searchable}PDF"
|
||||
)
|
||||
self.print_progress(document, False, text, percentage)
|
||||
|
||||
width = read_int(p.stdout)
|
||||
|
@ -161,22 +200,27 @@ class IsolationProvider(ABC):
|
|||
num_pixels,
|
||||
)
|
||||
|
||||
# Wrapper code
|
||||
with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width:
|
||||
f_width.write(str(width))
|
||||
with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height:
|
||||
f_height.write(str(height))
|
||||
with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb:
|
||||
f_rgb.write(untrusted_pixels)
|
||||
page_pdf = self.pixels_to_pdf_page(
|
||||
untrusted_pixels,
|
||||
width,
|
||||
height,
|
||||
ocr_lang,
|
||||
)
|
||||
safe_doc.insert_pdf(page_pdf)
|
||||
|
||||
percentage += percentage_per_page
|
||||
percentage += step
|
||||
|
||||
# Ensure nothing else is read after all bitmaps are obtained
|
||||
p.stdout.close()
|
||||
|
||||
# Saving it with a different name first, because PyMuPDF cannot handle
|
||||
# non-Unicode chars.
|
||||
safe_doc.save(document.sanitized_output_filename)
|
||||
os.replace(document.sanitized_output_filename, document.output_filename)
|
||||
|
||||
# TODO handle leftover code input
|
||||
text = "Converted document to pixels"
|
||||
self.print_progress(document, False, text, percentage)
|
||||
text = "Successfully converted document"
|
||||
self.print_progress(document, False, text, 100)
|
||||
|
||||
if getattr(sys, "dangerzone_dev", False):
|
||||
assert p.stderr
|
||||
|
|
|
@ -29,7 +29,7 @@ class IsolationProviderTest:
|
|||
|
||||
p = provider.start_doc_to_pixels_proc(doc)
|
||||
with pytest.raises(errors.ConverterProcException):
|
||||
provider.doc_to_pixels(doc, tmpdir, p)
|
||||
provider.convert_with_proc(doc, None, p)
|
||||
assert provider.get_proc_exception(p) == errors.MaxPagesException
|
||||
|
||||
def test_max_pages_client_enforcement(
|
||||
|
@ -46,7 +46,7 @@ class IsolationProviderTest:
|
|||
doc = Document(sample_doc)
|
||||
p = provider.start_doc_to_pixels_proc(doc)
|
||||
with pytest.raises(errors.MaxPagesException):
|
||||
provider.doc_to_pixels(doc, tmpdir, p)
|
||||
provider.convert_with_proc(doc, None, p)
|
||||
|
||||
def test_max_dimensions(
|
||||
self,
|
||||
|
@ -60,12 +60,12 @@ class IsolationProviderTest:
|
|||
doc = Document(sample_bad_width)
|
||||
p = provider.start_doc_to_pixels_proc(doc)
|
||||
with pytest.raises(errors.MaxPageWidthException):
|
||||
provider.doc_to_pixels(doc, tmpdir, p)
|
||||
provider.convert_with_proc(doc, None, p)
|
||||
|
||||
doc = Document(sample_bad_height)
|
||||
p = provider.start_doc_to_pixels_proc(doc)
|
||||
with pytest.raises(errors.MaxPageHeightException):
|
||||
provider.doc_to_pixels(doc, tmpdir, p)
|
||||
provider.convert_with_proc(doc, None, p)
|
||||
|
||||
|
||||
class IsolationProviderTermination:
|
||||
|
|
|
@ -79,7 +79,7 @@ class TestQubes(IsolationProviderTest):
|
|||
)
|
||||
with pytest.raises(errors.ConverterProcException):
|
||||
doc = Document(sample_doc)
|
||||
provider.doc_to_pixels(doc, tmpdir, proc)
|
||||
provider.convert_with_proc(doc, None, proc)
|
||||
assert provider.get_proc_exception(proc) == errors.QubesQrexecFailed
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue