From 874b8865e26ffa52af235a2228ce61042c41d400 Mon Sep 17 00:00:00 2001 From: deeplow Date: Thu, 22 Jun 2023 12:14:04 +0100 Subject: [PATCH] Qubes: strategy for capturing conversion logs Use qrexec stdout to send conversion data (pixels) and stderr to send conversion progress at the end of the conversion. This happens regardless of whether or not the conversion is in developer mode or not. It's the client that decides if it reads the debug data from stderr or not. In this case, it only reads it if developer mode is enabled. --- dangerzone/conversion/common.py | 151 +++++++++--------- dangerzone/conversion/doc_to_pixels.py | 12 +- .../conversion/doc_to_pixels_qubes_wrapper.py | 3 + dangerzone/conversion/pixels_to_pdf.py | 12 +- dangerzone/isolation_provider/qubes.py | 17 ++ 5 files changed, 109 insertions(+), 86 deletions(-) diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index 93cd1b2..b308c0c 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -22,84 +22,87 @@ def running_on_qubes() -> bool: return os.path.exists("/usr/share/qubes/marker-vm") -async def read_stream( - sr: asyncio.StreamReader, callback: Optional[Callable] = None -) -> bytes: - """Consume a byte stream line-by-line. - - Read all lines in a stream until EOF. If a user has passed a callback, call it for - each line. - - Note that the lines are in bytes, since we can't assume that all command output will - be UTF-8 encoded. Higher level commands are advised to decode the output to Unicode, - if they know its encoding. - """ - buf = b"" - while True: - line = await sr.readline() - if sr.at_eof(): - break - if callback is not None: - callback(line) - # TODO: This would be a good place to log the received line, mostly for debug - # logging. - buf += line - return buf - - -async def run_command( - args: List[str], - *, - error_message: str, - timeout_message: str, - timeout: Optional[float], - stdout_callback: Optional[Callable] = None, - stderr_callback: Optional[Callable] = None, -) -> Tuple[bytes, bytes]: - """Run a command and get its output. - - Run a command using asyncio.subprocess, consume its standard streams, and return its - output in bytes. - - :raises RuntimeError: if the process returns a non-zero exit status - :raises TimeoutError: if the process times out - """ - # Start the provided command, and return a handle. The command will run in the - # background. - proc = await asyncio.subprocess.create_subprocess_exec( - *args, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - - assert proc.stdout is not None - assert proc.stderr is not None - - # Create asynchronous tasks that will consume the standard streams of the command, - # and call callbacks if necessary. - stdout_task = asyncio.create_task(read_stream(proc.stdout, stdout_callback)) - stderr_task = asyncio.create_task(read_stream(proc.stderr, stderr_callback)) - - # Wait until the command has finished, for a specific timeout. Then, verify that the - # command has completed successfully. In any other case, raise an exception. - try: - ret = await asyncio.wait_for(proc.wait(), timeout=timeout) - except asyncio.exceptions.TimeoutError: - raise TimeoutError(timeout_message) - if ret != 0: - raise RuntimeError(error_message) - - # Wait until the tasks that consume the command's standard streams have exited as - # well, and return their output. - stdout = await stdout_task - stderr = await stderr_task - return (stdout, stderr) - - class DangerzoneConverter: def __init__(self, progress_callback: Optional[Callable] = None) -> None: self.percentage: float = 0.0 self.progress_callback = progress_callback + self.captured_output: bytes = b"" + + async def read_stream( + self, sr: asyncio.StreamReader, callback: Optional[Callable] = None + ) -> bytes: + """Consume a byte stream line-by-line. + + Read all lines in a stream until EOF. If a user has passed a callback, call it for + each line. + + Note that the lines are in bytes, since we can't assume that all command output will + be UTF-8 encoded. Higher level commands are advised to decode the output to Unicode, + if they know its encoding. + """ + buf = b"" + while True: + line = await sr.readline() + if sr.at_eof(): + break + self.captured_output += line + if callback is not None: + callback(line) + buf += line + return buf + + async def run_command( + self, + args: List[str], + *, + error_message: str, + timeout_message: str, + timeout: Optional[float], + stdout_callback: Optional[Callable] = None, + stderr_callback: Optional[Callable] = None, + ) -> Tuple[bytes, bytes]: + """Run a command and get its output. + + Run a command using asyncio.subprocess, consume its standard streams, and return its + output in bytes. + + :raises RuntimeError: if the process returns a non-zero exit status + :raises TimeoutError: if the process times out + """ + # Start the provided command, and return a handle. The command will run in the + # background. + proc = await asyncio.subprocess.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + assert proc.stdout is not None + assert proc.stderr is not None + + # Create asynchronous tasks that will consume the standard streams of the command, + # and call callbacks if necessary. + stdout_task = asyncio.create_task( + self.read_stream(proc.stdout, stdout_callback) + ) + stderr_task = asyncio.create_task( + self.read_stream(proc.stderr, stderr_callback) + ) + + # Wait until the command has finished, for a specific timeout. Then, verify that the + # command has completed successfully. In any other case, raise an exception. + try: + ret = await asyncio.wait_for(proc.wait(), timeout=timeout) + except asyncio.exceptions.TimeoutError: + raise TimeoutError(timeout_message) + if ret != 0: + raise RuntimeError(error_message) + + # Wait until the tasks that consume the command's standard streams have exited as + # well, and return their output. + stdout = await stdout_task + stderr = await stderr_task + return (stdout, stderr) def calculate_timeout( self, size: float, pages: Optional[float] = None diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py index 622ec03..994a9d5 100644 --- a/dangerzone/conversion/doc_to_pixels.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -18,7 +18,7 @@ from typing import Dict, Optional import magic -from .common import DangerzoneConverter, run_command, running_on_qubes +from .common import DangerzoneConverter, running_on_qubes class DocumentToPixels(DangerzoneConverter): @@ -189,7 +189,7 @@ class DocumentToPixels(DangerzoneConverter): "/tmp", "/tmp/input_file", ] - await run_command( + await self.run_command( args, error_message="Conversion to PDF with LibreOffice failed", timeout_message=( @@ -213,7 +213,7 @@ class DocumentToPixels(DangerzoneConverter): "/tmp/input_file", "/tmp/input_file.pdf", ] - await run_command( + await self.run_command( args, error_message="Conversion to PDF with GraphicsMagick failed", timeout_message=( @@ -231,7 +231,7 @@ class DocumentToPixels(DangerzoneConverter): # Obtain number of pages self.update_progress("Calculating number of pages") - stdout, _ = await run_command( + stdout, _ = await self.run_command( ["pdfinfo", pdf_filename], error_message="PDF file is corrupted", timeout_message=( @@ -317,7 +317,7 @@ class DocumentToPixels(DangerzoneConverter): page_base = "/tmp/page" - await run_command( + await self.run_command( [ "pdftoppm", pdf_filename, @@ -351,7 +351,7 @@ class DocumentToPixels(DangerzoneConverter): f"/usr/lib/libreoffice/share/extensions/{libreoffice_ext}/", f"/libreoffice_ext/{libreoffice_ext}", ] - await run_command( + await self.run_command( unzip_args, error_message="LibreOffice extension installation failed (unzipping)", timeout_message="unzipping LibreOffice extension timed out 5 seconds", diff --git a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py b/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py index 82c0feb..676b018 100644 --- a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py +++ b/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py @@ -84,6 +84,9 @@ async def main() -> None: rgb_data = rgb_file.read() await write_bytes(rgb_data) + # Write debug information + await write_bytes(converter.captured_output, file=sys.stderr) + if __name__ == "__main__": sys.exit(asyncio.run(main())) diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 905e8e8..be708f7 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -12,7 +12,7 @@ import os import shutil import sys -from .common import DangerzoneConverter, run_command, running_on_qubes +from .common import DangerzoneConverter, running_on_qubes class PixelsToPDF(DangerzoneConverter): @@ -47,7 +47,7 @@ class PixelsToPDF(DangerzoneConverter): self.update_progress( f"Converting page {page}/{num_pages} from pixels to searchable PDF" ) - await run_command( + await self.run_command( [ "gm", "convert", @@ -65,7 +65,7 @@ class PixelsToPDF(DangerzoneConverter): ), timeout=timeout, ) - await run_command( + await self.run_command( [ "tesseract", png_filename, @@ -88,7 +88,7 @@ class PixelsToPDF(DangerzoneConverter): self.update_progress( f"Converting page {page}/{num_pages} from pixels to PDF" ) - await run_command( + await self.run_command( [ "gm", "convert", @@ -119,7 +119,7 @@ class PixelsToPDF(DangerzoneConverter): for page in range(1, num_pages + 1): args.append(f"/tmp/page-{page}.pdf") args.append(f"/tmp/safe-output.pdf") - await run_command( + await self.run_command( args, error_message="Merging pages into a single PDF failed", timeout_message=( @@ -133,7 +133,7 @@ class PixelsToPDF(DangerzoneConverter): # Compress self.update_progress("Compressing PDF") - await run_command( + await self.run_command( ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"], error_message="Compressing PDF failed", timeout_message=( diff --git a/dangerzone/isolation_provider/qubes.py b/dangerzone/isolation_provider/qubes.py index 63fd9bf..8ed9b5e 100644 --- a/dangerzone/isolation_provider/qubes.py +++ b/dangerzone/isolation_provider/qubes.py @@ -28,6 +28,8 @@ CONVERTED_FILE_PATH = ( "/tmp/safe-output-compressed.pdf" ) +MAX_CONVERSION_LOG_CHARS = 150 * 50 # up to ~150 lines of 50 characters + def read_bytes(p: subprocess.Popen, buff_size: int) -> bytes: """Read bytes from stdout.""" @@ -40,6 +42,15 @@ def read_int(p: subprocess.Popen) -> int: return int.from_bytes(untrusted_int, signed=False) +def read_debug_text(p: subprocess.Popen) -> str: + """Read arbitrarily long text (for debug purposes)""" + if p.stderr: + untrusted_text = p.stderr.read(MAX_CONVERSION_LOG_CHARS) + return untrusted_text.decode("ascii", errors="replace") + else: + return "" + + class Qubes(IsolationProvider): """Uses a disposable qube for performing the conversion""" @@ -75,6 +86,7 @@ class Qubes(IsolationProvider): ["/usr/bin/qrexec-client-vm", "@dispvm:dz-dvm", "dz.ConvertDev"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) assert p.stdin is not None @@ -89,6 +101,7 @@ class Qubes(IsolationProvider): ["/usr/bin/qrexec-client-vm", "@dispvm:dz-dvm", "dz.Convert"], stdin=f, stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, ) n_pages = read_int(p) @@ -126,6 +139,10 @@ class Qubes(IsolationProvider): text = "Converted document to pixels" self.print_progress_trusted(document, False, text, percentage) + if getattr(sys, "dangerzone_dev", False): + text = f"Conversion output (doc to pixels):\n{read_debug_text(p)}" + log.info(text) + # FIXME pass OCR stuff properly (see #455) old_environ = dict(os.environ) if ocr_lang: