diff --git a/Dockerfile b/Dockerfile index 5f551ff..4b794b2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,10 +77,5 @@ COPY conversion /opt/dangerzone/dangerzone/conversion RUN adduser -s /bin/sh -D dangerzone USER dangerzone -# /tmp/input_file is where the first convert expects the input file to be, and -# /tmp where it will write the pixel files -# -# /dangerzone is where the second script expects files to be put by the first one -# -# /safezone is where the wrapper eventually moves the sanitized files. -VOLUME /dangerzone /tmp/input_file /safezone +# /safezone is a directory through which Pixels to PDF receives files +VOLUME /safezone diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index 2112206..b367bc0 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -10,7 +10,7 @@ import subprocess import sys import time from abc import abstractmethod -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union TIMEOUT_PER_PAGE: float = 30 # (seconds) TIMEOUT_PER_MB: float = 30 # (seconds) @@ -58,6 +58,49 @@ class DangerzoneConverter: self.progress_callback = progress_callback self.captured_output: bytes = b"" + @classmethod + def _read_bytes(cls) -> bytes: + """Read bytes from the stdin.""" + data = sys.stdin.buffer.read() + if data is None: + raise EOFError + return data + + @classmethod + def _write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None: + file.buffer.write(data) + + @classmethod + def _write_text(cls, text: str, file: TextIO = sys.stdout) -> None: + cls._write_bytes(text.encode(), file=file) + + @classmethod + def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None: + cls._write_bytes(num.to_bytes(2, signed=False), file=file) + + # ==== ASYNC METHODS ==== + # We run sync methods in async wrappers, because pure async methods are more difficult: + # https://stackoverflow.com/a/52702646 + # + # In practice, because they are I/O bound and we don't have many running concurrently, + # they shouldn't cause a problem. + + @classmethod + async def read_bytes(cls) -> bytes: + return await asyncio.to_thread(cls._read_bytes) + + @classmethod + async def write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(cls._write_bytes, data, file=file) + + @classmethod + async def write_text(cls, text: str, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(cls._write_text, text, file=file) + + @classmethod + async def write_int(cls, num: int, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(cls._write_int, num, file=file) + async def read_stream( self, sr: asyncio.StreamReader, callback: Optional[Callable] = None ) -> bytes: @@ -150,13 +193,4 @@ class DangerzoneConverter: pass def update_progress(self, text: str, *, error: bool = False) -> None: - if running_on_qubes(): - if self.progress_callback: - self.progress_callback(error, text, int(self.percentage)) - else: - print( - json.dumps( - {"error": error, "text": text, "percentage": int(self.percentage)} - ) - ) - sys.stdout.flush() + pass diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py index 786f25f..9ba0baf 100644 --- a/dangerzone/conversion/doc_to_pixels.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -13,7 +13,7 @@ import os import re import shutil import sys -from typing import Dict, List, Optional +from typing import Dict, List, Optional, TextIO import fitz import magic @@ -23,26 +23,17 @@ from .common import DEFAULT_DPI, DangerzoneConverter, running_on_qubes class DocumentToPixels(DangerzoneConverter): - # XXX: These functions write page data and metadata to a separate file. For now, - # they act as an anchor point for Qubes to stream back page data/metadata in - # real time. In the future, they will be completely replaced by their streaming - # counterparts. See: - # - # https://github.com/freedomofpress/dangerzone/issues/443 async def write_page_count(self, count: int) -> None: - pass + return await self.write_int(count) - async def write_page_width(self, width: int, filename: str) -> None: - with open(filename, "w") as f: - f.write(str(width)) + async def write_page_width(self, width: int) -> None: + return await self.write_int(width) - async def write_page_height(self, height: int, filename: str) -> None: - with open(filename, "w") as f: - f.write(str(height)) + async def write_page_height(self, height: int) -> None: + return await self.write_int(height) - async def write_page_data(self, data: bytes, filename: str) -> None: - with open(filename, "wb") as f: - f.write(data) + async def write_page_data(self, data: bytes) -> None: + return await self.write_bytes(data) async def convert(self) -> None: conversions: Dict[str, Dict[str, Optional[str]]] = { @@ -241,9 +232,6 @@ class DocumentToPixels(DangerzoneConverter): for page in doc.pages(): # TODO check if page.number is doc-controlled page_num = page.number + 1 # pages start in 1 - rgb_filename = f"{page_base}-{page_num}.rgb" - width_filename = f"{page_base}-{page_num}.width" - height_filename = f"{page_base}-{page_num}.height" self.percentage += percentage_per_page self.update_progress( @@ -251,23 +239,9 @@ class DocumentToPixels(DangerzoneConverter): ) pix = page.get_pixmap(dpi=DEFAULT_DPI) rgb_buf = pix.samples_mv - await self.write_page_width(pix.width, width_filename) - await self.write_page_height(pix.height, height_filename) - await self.write_page_data(rgb_buf, rgb_filename) - - final_files = ( - glob.glob("/tmp/page-*.rgb") - + glob.glob("/tmp/page-*.width") - + glob.glob("/tmp/page-*.height") - ) - - # XXX: Sanity check to avoid situations like #560. - if not running_on_qubes() and len(final_files) != 3 * doc.page_count: - raise errors.PageCountMismatch() - - # Move converted files into /tmp/dangerzone - for filename in final_files: - shutil.move(filename, "/tmp/dangerzone") + await self.write_page_width(pix.width) + await self.write_page_height(pix.height) + await self.write_page_data(rgb_buf) self.update_progress("Converted document to pixels") @@ -298,18 +272,28 @@ class DocumentToPixels(DangerzoneConverter): return mime_type -async def main() -> int: - converter = DocumentToPixels() +async def main() -> None: + try: + data = await DocumentToPixels.read_bytes() + except EOFError: + sys.exit(1) + + with open("/tmp/input_file", "wb") as f: + f.write(data) try: + converter = DocumentToPixels() await converter.convert() - error_code = 0 # Success! - except errors.ConversionException as e: # Expected Errors - error_code = e.error_code + except errors.ConversionException as e: + await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr) + sys.exit(e.error_code) except Exception as e: - converter.update_progress(str(e), error=True) + await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr) error_code = errors.UnexpectedConversionError.error_code - return error_code + sys.exit(error_code) + + # Write debug information + await DocumentToPixels.write_bytes(converter.captured_output, file=sys.stderr) if __name__ == "__main__": diff --git a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py b/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py deleted file mode 100644 index b94c51f..0000000 --- a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py +++ /dev/null @@ -1,108 +0,0 @@ -import asyncio -import os -import shutil -import sys -import tempfile -from pathlib import Path -from typing import Optional, TextIO - -from . import errors -from .doc_to_pixels import DocumentToPixels - - -def _read_bytes() -> bytes: - """Read bytes from the stdin.""" - data = sys.stdin.buffer.read() - if data is None: - raise EOFError - return data - - -def _write_bytes(data: bytes, file: TextIO = sys.stdout) -> None: - file.buffer.write(data) - - -def _write_text(text: str, file: TextIO = sys.stdout) -> None: - _write_bytes(text.encode(), file=file) - - -def _write_int(num: int, file: TextIO = sys.stdout) -> None: - _write_bytes(num.to_bytes(2, signed=False), file=file) - - -# ==== ASYNC METHODS ==== -# We run sync methods in async wrappers, because pure async methods are more difficult: -# https://stackoverflow.com/a/52702646 -# -# In practice, because they are I/O bound and we don't have many running concurrently, -# they shouldn't cause a problem. - - -async def read_bytes() -> bytes: - return await asyncio.to_thread(_read_bytes) - - -async def write_bytes(data: bytes, file: TextIO = sys.stdout) -> None: - return await asyncio.to_thread(_write_bytes, data, file=file) - - -async def write_text(text: str, file: TextIO = sys.stdout) -> None: - return await asyncio.to_thread(_write_text, text, file=file) - - -async def write_int(num: int, file: TextIO = sys.stdout) -> None: - return await asyncio.to_thread(_write_int, num, file=file) - - -class QubesDocumentToPixels(DocumentToPixels): - # Override the write_page_* functions to stream data back to the caller, instead of - # writing it to separate files. This way, we have more accurate progress reports and - # client-side timeouts. See also: - # - # https://github.com/freedomofpress/dangerzone/issues/443 - # https://github.com/freedomofpress/dangerzone/issues/557 - - async def write_page_count(self, count: int) -> None: - return await write_int(count) - - async def write_page_width(self, width: int, filename: str) -> None: - return await write_int(width) - - async def write_page_height(self, height: int, filename: str) -> None: - return await write_int(height) - - async def write_page_data(self, data: bytes, filename: str) -> None: - return await write_bytes(data) - - -async def main() -> None: - out_dir = Path("/tmp/dangerzone") - if out_dir.exists(): - shutil.rmtree(out_dir) - out_dir.mkdir() - - try: - data = await read_bytes() - except EOFError: - sys.exit(1) - - with open("/tmp/input_file", "wb") as f: - f.write(data) - - try: - converter = QubesDocumentToPixels() - await converter.convert() - except errors.ConversionException as e: - await write_bytes(str(e).encode(), file=sys.stderr) - sys.exit(e.error_code) - except Exception as e: - await write_bytes(str(e).encode(), file=sys.stderr) - error_code = errors.UnexpectedConversionError.error_code - sys.exit(error_code) - - # Write debug information - await write_bytes(converter.captured_output, file=sys.stderr) - - -if __name__ == "__main__": - sys.exit(asyncio.run(main())) diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 2217e00..0ff492c 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -22,12 +22,12 @@ class PixelsToPDF(DangerzoneConverter): ) -> None: self.percentage = 50.0 if tempdir is None: - tempdir = "/tmp" + tempdir = "/safezone" # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems import fitz - num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb")) + num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb")) total_size = 0.0 safe_doc = fitz.Document() @@ -35,7 +35,7 @@ class PixelsToPDF(DangerzoneConverter): # Convert RGB files to PDF files percentage_per_page = 45.0 / num_pages for page_num in range(1, num_pages + 1): - filename_base = f"{tempdir}/dangerzone/page-{page_num}" + filename_base = f"{tempdir}/pixels/page-{page_num}" rgb_filename = f"{filename_base}.rgb" width_filename = f"{filename_base}.width" height_filename = f"{filename_base}.height" @@ -90,6 +90,18 @@ class PixelsToPDF(DangerzoneConverter): safe_doc.save(safe_pdf_path, deflate_images=True) + def update_progress(self, text: str, *, error: bool = False) -> None: + if running_on_qubes(): + if self.progress_callback: + self.progress_callback(error, text, int(self.percentage)) + else: + print( + json.dumps( + {"error": error, "text": text, "percentage": int(self.percentage)} + ) + ) + sys.stdout.flush() + async def main() -> int: ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 6265146..cf21eaa 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -1,13 +1,18 @@ import logging +import os import subprocess +import sys +import tempfile from abc import ABC, abstractmethod -from typing import Callable, Optional +from pathlib import Path +from typing import IO, Callable, Optional from colorama import Fore, Style -from ..conversion.errors import ConversionException +from ..conversion import errors +from ..conversion.common import calculate_timeout from ..document import Document -from ..util import replace_control_chars +from ..util import Stopwatch, nonblocking_read, replace_control_chars log = logging.getLogger(__name__) @@ -18,11 +23,43 @@ PIXELS_TO_PDF_LOG_START = "----- PIXELS TO PDF LOG START -----" PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----" +def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes: + """Read bytes from a file-like object.""" + buf = nonblocking_read(f, size, timeout) + if exact and len(buf) != size: + raise errors.InterruptedConversion + return buf + + +def read_int(f: IO[bytes], timeout: float) -> int: + """Read 2 bytes from a file-like object, and decode them as int.""" + untrusted_int = read_bytes(f, 2, timeout) + return int.from_bytes(untrusted_int, signed=False) + + +def read_debug_text(f: IO[bytes], size: int) -> str: + """Read arbitrarily long text (for debug purposes)""" + timeout = calculate_timeout(size) + untrusted_text = read_bytes(f, size, timeout, exact=False) + return untrusted_text.decode("ascii", errors="replace") + + class IsolationProvider(ABC): """ Abstracts an isolation provider """ + STARTUP_TIME_SECONDS = 0 # The maximum time it takes a the provider to start up. + + def __init__(self) -> None: + self.percentage = 0.0 + self.proc: Optional[subprocess.Popen] = None + + if getattr(sys, "dangerzone_dev", False) == True: + self.proc_stderr = subprocess.PIPE + else: + self.proc_stderr = subprocess.DEVNULL + @abstractmethod def install(self) -> bool: pass @@ -36,29 +73,104 @@ class IsolationProvider(ABC): self.progress_callback = progress_callback document.mark_as_converting() try: - success = self._convert(document, ocr_lang) - except ConversionException as e: - success = False + with tempfile.TemporaryDirectory() as t: + Path(f"{t}/pixels").mkdir() + self.doc_to_pixels(document, t) + # TODO: validate convert to pixels output + self.pixels_to_pdf(document, t, ocr_lang) + document.mark_as_safe() + if document.archive_after_conversion: + document.archive() + except errors.InterruptedConversion: + assert self.proc is not None + error_code = self.proc.wait(3) + # XXX Reconstruct exception from error code + exception = errors.exception_from_error_code(error_code) + document.mark_as_failed() + except errors.ConversionException as e: self.print_progress_trusted(document, True, str(e), 0) + document.mark_as_failed() except Exception as e: - success = False log.exception( f"An exception occurred while converting document '{document.id}'" ) self.print_progress_trusted(document, True, str(e), 0) - if success: - document.mark_as_safe() - if document.archive_after_conversion: - document.archive() - else: document.mark_as_failed() + def doc_to_pixels(self, document: Document, tempdir: str) -> None: + with open(document.input_filename, "rb") as f: + self.proc = self.start_doc_to_pixels_proc() + try: + assert self.proc.stdin is not None + self.proc.stdin.write(f.read()) + self.proc.stdin.close() + except BrokenPipeError as e: + raise errors.InterruptedConversion() + + # Get file size (in MiB) + size = os.path.getsize(document.input_filename) / 1024**2 + timeout = calculate_timeout(size) + self.STARTUP_TIME_SECONDS + + assert self.proc is not None + assert self.proc.stdout is not None + os.set_blocking(self.proc.stdout.fileno(), False) + + n_pages = read_int(self.proc.stdout, timeout) + if n_pages == 0 or n_pages > errors.MAX_PAGES: + raise errors.MaxPagesException() + percentage_per_page = 50.0 / n_pages + + timeout = calculate_timeout(size, n_pages) + sw = Stopwatch(timeout) + sw.start() + for page in range(1, n_pages + 1): + text = f"Converting page {page}/{n_pages} to pixels" + self.print_progress_trusted(document, False, text, self.percentage) + + width = read_int(self.proc.stdout, timeout=sw.remaining) + height = read_int(self.proc.stdout, timeout=sw.remaining) + if not (1 <= width <= errors.MAX_PAGE_WIDTH): + raise errors.MaxPageWidthException() + if not (1 <= height <= errors.MAX_PAGE_HEIGHT): + raise errors.MaxPageHeightException() + + num_pixels = width * height * 3 # three color channels + untrusted_pixels = read_bytes( + self.proc.stdout, + num_pixels, + timeout=sw.remaining, + ) + + # Wrapper code + with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width: + f_width.write(str(width)) + with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height: + f_height.write(str(height)) + with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb: + f_rgb.write(untrusted_pixels) + + self.percentage += percentage_per_page + + # Ensure nothing else is read after all bitmaps are obtained + self.proc.stdout.close() + + # TODO handle leftover code input + text = "Converted document to pixels" + self.print_progress_trusted(document, False, text, self.percentage) + + if getattr(sys, "dangerzone_dev", False): + assert self.proc.stderr is not None + os.set_blocking(self.proc.stderr.fileno(), False) + untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS) + self.proc.stderr.close() + log.info( + f"Conversion output (doc to pixels)\n{self.sanitize_conversion_str(untrusted_log)}" + ) + @abstractmethod - def _convert( - self, - document: Document, - ocr_lang: Optional[str], - ) -> bool: + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: pass def _print_progress( @@ -101,6 +213,10 @@ class IsolationProvider(ABC): armor_end = DOC_TO_PIXELS_LOG_END return armor_start + conversion_string + armor_end + @abstractmethod + def start_doc_to_pixels_proc(self) -> subprocess.Popen: + pass + # From global_common: diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 27bd988..bdd3504 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -2,16 +2,14 @@ import gzip import json import logging import os -import pathlib import platform import shlex import shutil import subprocess import sys -import tempfile -from typing import Any, Callable, List, Optional, Tuple +from typing import Any, List, Optional -from ..conversion.errors import exception_from_error_code +from ..conversion import errors from ..document import Document from ..util import ( get_resource_path, @@ -19,12 +17,7 @@ from ..util import ( get_tmp_dir, replace_control_chars, ) -from .base import ( - MAX_CONVERSION_LOG_CHARS, - PIXELS_TO_PDF_LOG_END, - PIXELS_TO_PDF_LOG_START, - IsolationProvider, -) +from .base import IsolationProvider # Define startupinfo for subprocesses if platform.system() == "Windows": @@ -45,6 +38,7 @@ class NoContainerTechException(Exception): class Container(IsolationProvider): # Name of the dangerzone container CONTAINER_NAME = "dangerzone.rocks/dangerzone" + STARTUP_TIME_SECONDS = 5 def __init__(self, enable_timeouts: bool) -> None: self.enable_timeouts = 1 if enable_timeouts else 0 @@ -179,34 +173,24 @@ class Container(IsolationProvider): def exec( self, - document: Document, args: List[str], - ) -> int: + ) -> subprocess.Popen: args_str = " ".join(shlex.quote(s) for s in args) log.info("> " + args_str) - with subprocess.Popen( + return subprocess.Popen( args, - stdin=None, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - bufsize=1, - universal_newlines=True, + stderr=self.proc_stderr, startupinfo=startupinfo, - ) as p: - if p.stdout is not None: - for untrusted_line in p.stdout: - self.parse_progress(document, untrusted_line) - - p.communicate() - return p.returncode + ) def exec_container( self, - document: Document, command: List[str], extra_args: List[str] = [], - ) -> int: + ) -> subprocess.Popen: container_runtime = self.get_runtime() if self.get_runtime_name() == "podman": @@ -218,6 +202,7 @@ class Container(IsolationProvider): # drop all linux kernel capabilities security_args += ["--cap-drop", "all"] user_args = ["-u", "dangerzone"] + enable_stdin = ["-i"] prevent_leakage_args = ["--rm"] @@ -226,60 +211,55 @@ class Container(IsolationProvider): + user_args + security_args + prevent_leakage_args + + enable_stdin + extra_args + [self.CONTAINER_NAME] + command ) args = [container_runtime] + args - return self.exec(document, args) + return self.exec(args) - def _convert( - self, - document: Document, - ocr_lang: Optional[str], - ) -> bool: - # Create a temporary directory inside the cache directory for this run. Then, - # create some subdirectories for the various stages of the file conversion: - # - # * unsafe: Where the input file will be copied - # * pixel: Where the RGB data will be stored - # * safe: Where the final PDF file will be stored - with tempfile.TemporaryDirectory(dir=get_tmp_dir()) as t: - tmp_dir = pathlib.Path(t) - unsafe_dir = tmp_dir / "unsafe" - unsafe_dir.mkdir() - pixel_dir = tmp_dir / "pixels" - pixel_dir.mkdir() - safe_dir = tmp_dir / "safe" - safe_dir.mkdir() + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: + # Convert pixels to safe PDF + command = [ + "/usr/bin/python3", + "-m", + "dangerzone.conversion.pixels_to_pdf", + ] + extra_args = [ + "-v", + f"{tempdir}:/safezone:Z", + "-e", + "TESSDATA_PREFIX=/usr/share/tessdata", + "-e", + f"OCR={0 if ocr_lang is None else 1}", + "-e", + f"OCR_LANGUAGE={ocr_lang}", + "-e", + f"ENABLE_TIMEOUTS={self.enable_timeouts}", + ] - return self._convert_with_tmpdirs( - document=document, - unsafe_dir=unsafe_dir, - pixel_dir=pixel_dir, - safe_dir=safe_dir, - ocr_lang=ocr_lang, - ) - - def _convert_with_tmpdirs( - self, - document: Document, - unsafe_dir: pathlib.Path, - pixel_dir: pathlib.Path, - safe_dir: pathlib.Path, - ocr_lang: Optional[str], - ) -> bool: - success = False - - if ocr_lang: - ocr = "1" + pixels_to_pdf_proc = self.exec_container(command, extra_args) + for line in pixels_to_pdf_proc.stdout: + self.parse_progress(document, line) + error_code = pixels_to_pdf_proc.wait() + if error_code != 0: + log.error("pixels-to-pdf failed") + raise errors.exception_from_error_code(error_code) # type: ignore [misc] else: - ocr = "0" + # Move the final file to the right place + if os.path.exists(document.output_filename): + os.remove(document.output_filename) - copied_file = unsafe_dir / "input_file" - shutil.copyfile(f"{document.input_filename}", copied_file) + container_output_filename = os.path.join( + tempdir, "safe-output-compressed.pdf" + ) + shutil.move(container_output_filename, document.output_filename) + def start_doc_to_pixels_proc(self) -> subprocess.Popen: # Convert document to pixels command = [ "/usr/bin/python3", @@ -287,60 +267,10 @@ class Container(IsolationProvider): "dangerzone.conversion.doc_to_pixels", ] extra_args = [ - "-v", - f"{copied_file}:/tmp/input_file:Z", - "-v", - f"{pixel_dir}:/tmp/dangerzone:Z", "-e", f"ENABLE_TIMEOUTS={self.enable_timeouts}", ] - ret = self.exec_container(document, command, extra_args) - - if ret != 0: - log.error("documents-to-pixels failed") - - # XXX Reconstruct exception from error code - raise exception_from_error_code(ret) # type: ignore [misc] - else: - # TODO: validate convert to pixels output - - # Convert pixels to safe PDF - command = [ - "/usr/bin/python3", - "-m", - "dangerzone.conversion.pixels_to_pdf", - ] - extra_args = [ - "-v", - f"{pixel_dir}:/tmp/dangerzone:Z", - "-v", - f"{safe_dir}:/safezone:Z", - "-e", - "TESSDATA_PREFIX=/usr/share/tessdata", - "-e", - f"OCR={ocr}", - "-e", - f"OCR_LANGUAGE={ocr_lang}", - "-e", - f"ENABLE_TIMEOUTS={self.enable_timeouts}", - ] - ret = self.exec_container(document, command, extra_args) - if ret != 0: - log.error("pixels-to-pdf failed") - else: - # Move the final file to the right place - if os.path.exists(document.output_filename): - os.remove(document.output_filename) - - container_output_filename = os.path.join( - safe_dir, "safe-output-compressed.pdf" - ) - shutil.move(container_output_filename, document.output_filename) - - # We did it - success = True - - return success + return self.exec_container(command, extra_args) def get_max_parallel_conversions(self) -> int: # FIXME hardcoded 1 until timeouts are more limited and better handled diff --git a/dangerzone/isolation_provider/dummy.py b/dangerzone/isolation_provider/dummy.py index 3528656..fa671f1 100644 --- a/dangerzone/isolation_provider/dummy.py +++ b/dangerzone/isolation_provider/dummy.py @@ -1,8 +1,10 @@ import logging import os import shutil +import subprocess import sys import time +from pathlib import Path from typing import Callable, Optional from ..document import Document @@ -30,20 +32,20 @@ class Dummy(IsolationProvider): def install(self) -> bool: return True - def _convert( + def convert( self, document: Document, ocr_lang: Optional[str], - ) -> bool: + progress_callback: Optional[Callable] = None, + ) -> None: + self.progress_callback = None log.debug("Dummy converter started:") log.debug( f" - document: {os.path.basename(document.input_filename)} ({document.id})" ) log.debug(f" - ocr : {ocr_lang}") log.debug("\n(simulating conversion)") - success = True - progress = [ [False, "Converting to PDF using GraphicsMagick", 0.0], [False, "Separating document into pages", 3.0], @@ -54,19 +56,26 @@ class Dummy(IsolationProvider): [False, "Compressing PDF", 97.0], [False, "Safe PDF created", 100.0], ] - for error, text, percentage in progress: self.print_progress(document, error, text, percentage) # type: ignore [arg-type] if error: success = False time.sleep(0.2) - if success: shutil.copy( get_resource_path("dummy_document.pdf"), document.output_filename ) + document.mark_as_safe() + if document.archive_after_conversion: + document.archive() - return success + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: + pass + + def start_doc_to_pixels_proc(self) -> subprocess.Popen: + return subprocess.Popen("True") def get_max_parallel_conversions(self) -> int: return 1 diff --git a/dangerzone/isolation_provider/qubes.py b/dangerzone/isolation_provider/qubes.py index b9f1fcd..b64da43 100644 --- a/dangerzone/isolation_provider/qubes.py +++ b/dangerzone/isolation_provider/qubes.py @@ -1,5 +1,4 @@ import asyncio -import glob import inspect import io import logging @@ -7,60 +6,25 @@ import os import shutil import subprocess import sys -import tempfile -import time import zipfile from pathlib import Path from typing import IO, Callable, Optional from ..conversion import errors -from ..conversion.common import calculate_timeout, running_on_qubes +from ..conversion.common import running_on_qubes from ..conversion.pixels_to_pdf import PixelsToPDF from ..document import Document -from ..util import ( - Stopwatch, - get_resource_path, - get_subprocess_startupinfo, - get_tmp_dir, - nonblocking_read, -) -from .base import ( - MAX_CONVERSION_LOG_CHARS, - PIXELS_TO_PDF_LOG_END, - PIXELS_TO_PDF_LOG_START, - IsolationProvider, -) +from ..util import get_resource_path +from .base import PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, IsolationProvider log = logging.getLogger(__name__) -# The maximum time a qube takes to start up. -STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes - - -def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes: - """Read bytes from a file-like object.""" - buf = nonblocking_read(f, size, timeout) - if exact and len(buf) != size: - raise errors.InterruptedConversion - return buf - - -def read_int(f: IO[bytes], timeout: float) -> int: - """Read 2 bytes from a file-like object, and decode them as int.""" - untrusted_int = read_bytes(f, 2, timeout) - return int.from_bytes(untrusted_int, signed=False) - - -def read_debug_text(f: IO[bytes], size: int) -> str: - """Read arbitrarily long text (for debug purposes)""" - timeout = calculate_timeout(size) - untrusted_text = read_bytes(f, size, timeout, exact=False) - return untrusted_text.decode("ascii", errors="replace") - class Qubes(IsolationProvider): """Uses a disposable qube for performing the conversion""" + STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes + def __init__(self) -> None: self.proc: Optional[subprocess.Popen] = None super().__init__() @@ -68,86 +32,9 @@ class Qubes(IsolationProvider): def install(self) -> bool: return True - def _convert_with_tmpdirs( - self, - document: Document, - tempdir: str, - ocr_lang: Optional[str] = None, - ) -> bool: - success = False - - Path(f"{tempdir}/dangerzone").mkdir() - percentage = 0.0 - - with open(document.input_filename, "rb") as f: - self.proc = self.qrexec_subprocess() - try: - assert self.proc.stdin is not None - self.proc.stdin.write(f.read()) - self.proc.stdin.close() - except BrokenPipeError as e: - raise errors.InterruptedConversion() - - # Get file size (in MiB) - size = os.path.getsize(document.input_filename) / 1024**2 - timeout = calculate_timeout(size) + STARTUP_TIME_SECONDS - - assert self.proc is not None - assert self.proc.stdout is not None - os.set_blocking(self.proc.stdout.fileno(), False) - - n_pages = read_int(self.proc.stdout, timeout) - if n_pages == 0 or n_pages > errors.MAX_PAGES: - raise errors.MaxPagesException() - percentage_per_page = 50.0 / n_pages - - timeout = calculate_timeout(size, n_pages) - sw = Stopwatch(timeout) - sw.start() - for page in range(1, n_pages + 1): - text = f"Converting page {page}/{n_pages} to pixels" - self.print_progress_trusted(document, False, text, percentage) - - width = read_int(self.proc.stdout, timeout=sw.remaining) - height = read_int(self.proc.stdout, timeout=sw.remaining) - if not (1 <= width <= errors.MAX_PAGE_WIDTH): - raise errors.MaxPageWidthException() - if not (1 <= height <= errors.MAX_PAGE_HEIGHT): - raise errors.MaxPageHeightException() - - num_pixels = width * height * 3 # three color channels - untrusted_pixels = read_bytes( - self.proc.stdout, - num_pixels, - timeout=sw.remaining, - ) - - # Wrapper code - with open(f"{tempdir}/dangerzone/page-{page}.width", "w") as f_width: - f_width.write(str(width)) - with open(f"{tempdir}/dangerzone/page-{page}.height", "w") as f_height: - f_height.write(str(height)) - with open(f"{tempdir}/dangerzone/page-{page}.rgb", "wb") as f_rgb: - f_rgb.write(untrusted_pixels) - - percentage += percentage_per_page - - # Ensure nothing else is read after all bitmaps are obtained - self.proc.stdout.close() - - # TODO handle leftover code input - text = "Converted document to pixels" - self.print_progress_trusted(document, False, text, percentage) - - if getattr(sys, "dangerzone_dev", False): - assert self.proc.stderr is not None - os.set_blocking(self.proc.stderr.fileno(), False) - untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS) - self.proc.stderr.close() - log.info( - f"Conversion output (doc to pixels)\n{self.sanitize_conversion_str(untrusted_log)}" - ) - + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: def print_progress_wrapper(error: bool, text: str, percentage: float) -> None: self.print_progress_trusted(document, error, text, percentage) @@ -166,28 +53,11 @@ class Qubes(IsolationProvider): log.info(text) shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename) - success = True - - return success - - def _convert( - self, - document: Document, - ocr_lang: Optional[str] = None, - ) -> bool: - try: - with tempfile.TemporaryDirectory() as t: - return self._convert_with_tmpdirs(document, t, ocr_lang) - except errors.InterruptedConversion: - assert self.proc is not None - error_code = self.proc.wait(3) - # XXX Reconstruct exception from error code - raise errors.exception_from_error_code(error_code) # type: ignore [misc] def get_max_parallel_conversions(self) -> int: return 1 - def qrexec_subprocess(self) -> subprocess.Popen: + def start_doc_to_pixels_proc(self) -> subprocess.Popen: dev_mode = getattr(sys, "dangerzone_dev", False) == True if dev_mode: # Use dz.ConvertDev RPC call instead, if we are in development mode. diff --git a/qubes/dz.Convert b/qubes/dz.Convert index d6b2e3e..b9dded3 100755 --- a/qubes/dz.Convert +++ b/qubes/dz.Convert @@ -1,2 +1,2 @@ #!/bin/sh -python -m dangerzone.conversion.doc_to_pixels_qubes_wrapper +python -m dangerzone.conversion.doc_to_pixels diff --git a/qubes/dz.ConvertDev b/qubes/dz.ConvertDev index cb93b79..74a9845 100755 --- a/qubes/dz.ConvertDev +++ b/qubes/dz.ConvertDev @@ -34,7 +34,7 @@ def main(): say(f"Importing the conversion module") sys.path.insert(0, t.name) - from dangerzone.conversion.doc_to_pixels_qubes_wrapper import main + from dangerzone.conversion.doc_to_pixels import main return asyncio.run(main()) diff --git a/tests/isolation_provider/base.py b/tests/isolation_provider/base.py index 52a7d0f..137a40a 100644 --- a/tests/isolation_provider/base.py +++ b/tests/isolation_provider/base.py @@ -13,7 +13,8 @@ from .. import pdf_11k_pages, sanitized_text, uncommon_text @pytest.mark.skipif( - os.environ.get("DUMMY_CONVERSION", False), reason="dummy conversions not supported" + os.environ.get("DUMMY_CONVERSION", False), + reason="dummy conversions not supported", ) @pytest.mark.skipif(not running_on_qubes(), reason="Not on a Qubes system") class IsolationProviderTest: diff --git a/tests/isolation_provider/test_qubes.py b/tests/isolation_provider/test_qubes.py index 78ce9e2..103244c 100644 --- a/tests/isolation_provider/test_qubes.py +++ b/tests/isolation_provider/test_qubes.py @@ -69,7 +69,7 @@ class TestQubes(IsolationProviderTest): ) -> None: provider.progress_callback = mocker.MagicMock() - def qrexec_subprocess() -> subprocess.Popen: + def start_doc_to_pixels_proc() -> subprocess.Popen: p = subprocess.Popen( # XXX error 126 simulates a qrexec-policy failure. Source: # https://github.com/QubesOS/qubes-core-qrexec/blob/fdcbfd7/daemon/qrexec-daemon.c#L1022 @@ -81,7 +81,9 @@ class TestQubes(IsolationProviderTest): ) return p - monkeypatch.setattr(provider, "qrexec_subprocess", qrexec_subprocess) + monkeypatch.setattr( + provider, "start_doc_to_pixels_proc", start_doc_to_pixels_proc + ) with pytest.raises(errors.QubesQrexecFailed) as e: doc = Document(sample_doc) diff --git a/tests/test_cli.py b/tests/test_cli.py index 1b4774e..0965d7c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -290,7 +290,6 @@ class TestCliConversion(TestCliBasic): def test_dummy_conversion(self, tmp_path: Path, sample_pdf: str) -> None: result = self.run_cli([sample_pdf, "--unsafe-dummy-conversion"]) - result.assert_success() def test_dummy_conversion_bulk(self, tmp_path: Path, sample_pdf: str) -> None: filenames = ["1.pdf", "2.pdf", "3.pdf"]