Perform on-host pixels to PDF conversion

Extend the base isolation provider to immediately convert each page to
a PDF, and optionally use OCR. In contrast with the way we did things
previously, there are no longer two separate stages (document to pixels,
pixels to PDF). We now handle each page individually, for two main
reasons:

1. We don't want to buffer pixel data, either on disk or in memory,
   since they take a lot of space, and can potentially leave traces.
2. We can perform these operations in parallel, saving time. This is
   more evident when OCR is not used, where the times to convert a page
   to pixels and then back to a PDF are comparable.
This commit is contained in:
Alex Pyrgiotis 2024-03-14 10:43:19 +02:00
parent cde8ee70bb
commit 137f21da8d
No known key found for this signature in database
GPG key ID: B6C15EBA0357C9AA

View file

@ -10,12 +10,13 @@ from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO, Callable, Iterator, Optional
import fitz
from colorama import Fore, Style
from ..conversion import errors
from ..conversion.common import INT_BYTES
from ..conversion.common import DEFAULT_DPI, INT_BYTES
from ..document import Document
from ..util import replace_control_chars
from ..util import get_tessdata_dir, replace_control_chars
log = logging.getLogger(__name__)
@ -111,8 +112,7 @@ class IsolationProvider(ABC):
with tempfile.TemporaryDirectory() as t:
Path(f"{t}/pixels").mkdir()
with self.doc_to_pixels_proc(document) as conversion_proc:
self.doc_to_pixels(document, t, conversion_proc)
self.pixels_to_pdf(document, t, ocr_lang)
self._convert(document, t, ocr_lang, conversion_proc)
document.mark_as_safe()
if document.archive_after_conversion:
document.archive()
@ -126,8 +126,42 @@ class IsolationProvider(ABC):
self.print_progress(document, True, str(e), 0)
document.mark_as_failed()
def doc_to_pixels(
self, document: Document, tempdir: str, p: subprocess.Popen
def _pixels_to_pdf(
    self,
    untrusted_data: bytes,
    untrusted_width: int,
    untrusted_height: int,
    ocr_lang: Optional[str],
) -> fitz.Document:
    """Build a one-page PDF from a raw RGB pixel buffer.

    The buffer is wrapped in a pixmap tagged with ``DEFAULT_DPI``. When
    ``ocr_lang`` is truthy, the page is rendered through OCR for that
    language; otherwise the pixmap is embedded into a fresh document
    as-is. In both cases the serialized PDF bytes are reopened and
    returned as a ``fitz.Document``.
    """
    rgb_colorspace = fitz.Colorspace(fitz.CS_RGB)
    page_pixmap = fitz.Pixmap(
        rgb_colorspace,
        untrusted_width,
        untrusted_height,
        untrusted_data,
        False,  # alpha flag: the buffer carries no alpha channel
    )
    page_pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)

    if not ocr_lang:
        # Plain path: embed the pixmap without running OCR.
        plain_doc = fitz.Document()
        plain_doc.insert_file(page_pixmap)
        pdf_bytes = plain_doc.tobytes(deflate_images=True)
        return fitz.open("pdf", pdf_bytes)

    # OCR path: produce a searchable page for the requested language.
    pdf_bytes = page_pixmap.pdfocr_tobytes(
        compress=True,
        language=ocr_lang,
        tessdata=get_tessdata_dir(),
    )
    return fitz.open("pdf", pdf_bytes)
def _convert(
self,
document: Document,
tempdir: str,
ocr_lang: Optional[str],
p: subprocess.Popen,
) -> None:
percentage = 0.0
with open(document.input_filename, "rb") as f:
@ -142,10 +176,13 @@ class IsolationProvider(ABC):
n_pages = read_int(p.stdout)
if n_pages == 0 or n_pages > errors.MAX_PAGES:
raise errors.MaxPagesException()
percentage_per_page = 49.0 / n_pages
step = 100 / n_pages / 2
safe_doc = fitz.Document()
for page in range(1, n_pages + 1):
text = f"Converting page {page}/{n_pages} to pixels"
percentage += step
self.print_progress(document, False, text, percentage)
width = read_int(p.stdout)
@ -161,21 +198,31 @@ class IsolationProvider(ABC):
num_pixels,
)
# Wrapper code
with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width:
f_width.write(str(width))
with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height:
f_height.write(str(height))
with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb:
f_rgb.write(untrusted_pixels)
if ocr_lang:
text = (
f"Converting page {page}/{n_pages} from pixels to"
" searchable PDF"
)
else:
text = f"Converting page {page}/{n_pages} from pixels to PDF"
percentage += step
self.print_progress(document, False, text, percentage)
percentage += percentage_per_page
page_pdf = self._pixels_to_pdf(
untrusted_pixels,
width,
height,
ocr_lang,
)
safe_doc.insert_pdf(page_pdf)
# Ensure nothing else is read after all bitmaps are obtained
p.stdout.close()
safe_doc.save(document.output_filename)
# TODO handle leftover code input
text = "Converted document to pixels"
text = "Converted document"
self.print_progress(document, False, text, percentage)
if getattr(sys, "dangerzone_dev", False):