Stream pages in containers: merge isolation providers

Merge the core code of the Qubes and Containers isolation providers into
their parent abstract class, IsolationProvider.

This is done by streaming pages in containers as well, though only for the
first conversion process (document to pixels). The commit is rather large
due to the multiple interdependencies in the code, which make it difficult
to split into several commits.

The main conversion method (_convert) now in the superclass simply calls
two methods:
  - doc_to_pixels()
  - pixels_to_pdf()

Critically, doc_to_pixels is implemented in the superclass, diverging
only in a specialized method called "start_doc_to_pixels_proc()". This
method starts the process that communicates with the isolation provider
(container / disposable VM): via `podman`/`docker` on Containers and via
qrexec on Qubes.

Known regressions:
  - progress reports stopped working on containers

Fixes #443
This commit is contained in:
deeplow 2023-11-22 11:12:47 +00:00
parent 331b6514e8
commit 0a099540c8
No known key found for this signature in database
GPG key ID: 577982871529A52A
14 changed files with 306 additions and 462 deletions

View file

@ -77,10 +77,5 @@ COPY conversion /opt/dangerzone/dangerzone/conversion
RUN adduser -s /bin/sh -D dangerzone RUN adduser -s /bin/sh -D dangerzone
USER dangerzone USER dangerzone
# /tmp/input_file is where the first convert expects the input file to be, and # /safezone is a directory through which Pixels to PDF receives files
# /tmp where it will write the pixel files VOLUME /safezone
#
# /dangerzone is where the second script expects files to be put by the first one
#
# /safezone is where the wrapper eventually moves the sanitized files.
VOLUME /dangerzone /tmp/input_file /safezone

View file

@ -10,7 +10,7 @@ import subprocess
import sys import sys
import time import time
from abc import abstractmethod from abc import abstractmethod
from typing import Callable, Dict, List, Optional, Tuple, Union from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union
TIMEOUT_PER_PAGE: float = 30 # (seconds) TIMEOUT_PER_PAGE: float = 30 # (seconds)
TIMEOUT_PER_MB: float = 30 # (seconds) TIMEOUT_PER_MB: float = 30 # (seconds)
@ -58,6 +58,49 @@ class DangerzoneConverter:
self.progress_callback = progress_callback self.progress_callback = progress_callback
self.captured_output: bytes = b"" self.captured_output: bytes = b""
@classmethod
def _read_bytes(cls) -> bytes:
    """Consume the binary stdin stream and return everything that was read.

    Raises:
        EOFError: if the underlying read yields ``None`` instead of bytes.
    """
    payload = sys.stdin.buffer.read()
    if payload is not None:
        return payload
    raise EOFError
@classmethod
def _write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None:
    """Write raw bytes to the binary buffer underneath a text stream.

    Args:
        data: the bytes to emit.
        file: a text stream exposing a ``.buffer`` attribute (stdout by default).
    """
    binary_sink = file.buffer
    binary_sink.write(data)
@classmethod
def _write_text(cls, text: str, file: TextIO = sys.stdout) -> None:
    """Encode a string with the default codec and write it as bytes.

    Args:
        text: the string to emit.
        file: destination text stream, forwarded to ``_write_bytes``.
    """
    encoded = text.encode()
    cls._write_bytes(encoded, file=file)
@classmethod
def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None:
    """Write an integer as a 2-byte, unsigned, big-endian value.

    Args:
        num: the integer to emit; must fit in 16 unsigned bits.
        file: destination text stream, forwarded to ``_write_bytes``.

    Raises:
        OverflowError: if ``num`` is negative or does not fit in two bytes.
    """
    # The byte order is spelled out because ``byteorder`` only gained a default
    # ("big") in Python 3.11; on older interpreters omitting it is a TypeError.
    cls._write_bytes(num.to_bytes(2, "big", signed=False), file=file)
# ==== ASYNC METHODS ====
# We run sync methods in async wrappers, because pure async methods are more difficult:
# https://stackoverflow.com/a/52702646
#
# In practice, because they are I/O bound and we don't have many running concurrently,
# they shouldn't cause a problem.
@classmethod
async def read_bytes(cls) -> bytes:
    """Await ``_read_bytes`` on a worker thread and return what it read."""
    payload = await asyncio.to_thread(cls._read_bytes)
    return payload
@classmethod
async def write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None:
    """Await ``_write_bytes`` on a worker thread.

    Args:
        data: the bytes to emit.
        file: destination text stream (stdout by default).
    """
    outcome = await asyncio.to_thread(cls._write_bytes, data, file=file)
    return outcome
@classmethod
async def write_text(cls, text: str, file: TextIO = sys.stdout) -> None:
    """Await ``_write_text`` on a worker thread.

    Args:
        text: the string to emit.
        file: destination text stream (stdout by default).
    """
    outcome = await asyncio.to_thread(cls._write_text, text, file=file)
    return outcome
@classmethod
async def write_int(cls, num: int, file: TextIO = sys.stdout) -> None:
    """Await ``_write_int`` on a worker thread.

    Args:
        num: the integer to emit.
        file: destination text stream (stdout by default).
    """
    outcome = await asyncio.to_thread(cls._write_int, num, file=file)
    return outcome
async def read_stream( async def read_stream(
self, sr: asyncio.StreamReader, callback: Optional[Callable] = None self, sr: asyncio.StreamReader, callback: Optional[Callable] = None
) -> bytes: ) -> bytes:
@ -150,13 +193,4 @@ class DangerzoneConverter:
pass pass
def update_progress(self, text: str, *, error: bool = False) -> None: def update_progress(self, text: str, *, error: bool = False) -> None:
if running_on_qubes(): pass
if self.progress_callback:
self.progress_callback(error, text, int(self.percentage))
else:
print(
json.dumps(
{"error": error, "text": text, "percentage": int(self.percentage)}
)
)
sys.stdout.flush()

View file

@ -13,7 +13,7 @@ import os
import re import re
import shutil import shutil
import sys import sys
from typing import Dict, List, Optional from typing import Dict, List, Optional, TextIO
import fitz import fitz
import magic import magic
@ -23,26 +23,17 @@ from .common import DEFAULT_DPI, DangerzoneConverter, running_on_qubes
class DocumentToPixels(DangerzoneConverter): class DocumentToPixels(DangerzoneConverter):
# XXX: These functions write page data and metadata to a separate file. For now,
# they act as an anchor point for Qubes to stream back page data/metadata in
# real time. In the future, they will be completely replaced by their streaming
# counterparts. See:
#
# https://github.com/freedomofpress/dangerzone/issues/443
async def write_page_count(self, count: int) -> None: async def write_page_count(self, count: int) -> None:
pass return await self.write_int(count)
async def write_page_width(self, width: int, filename: str) -> None: async def write_page_width(self, width: int) -> None:
with open(filename, "w") as f: return await self.write_int(width)
f.write(str(width))
async def write_page_height(self, height: int, filename: str) -> None: async def write_page_height(self, height: int) -> None:
with open(filename, "w") as f: return await self.write_int(height)
f.write(str(height))
async def write_page_data(self, data: bytes, filename: str) -> None: async def write_page_data(self, data: bytes) -> None:
with open(filename, "wb") as f: return await self.write_bytes(data)
f.write(data)
async def convert(self) -> None: async def convert(self) -> None:
conversions: Dict[str, Dict[str, Optional[str]]] = { conversions: Dict[str, Dict[str, Optional[str]]] = {
@ -241,9 +232,6 @@ class DocumentToPixels(DangerzoneConverter):
for page in doc.pages(): for page in doc.pages():
# TODO check if page.number is doc-controlled # TODO check if page.number is doc-controlled
page_num = page.number + 1 # pages start in 1 page_num = page.number + 1 # pages start in 1
rgb_filename = f"{page_base}-{page_num}.rgb"
width_filename = f"{page_base}-{page_num}.width"
height_filename = f"{page_base}-{page_num}.height"
self.percentage += percentage_per_page self.percentage += percentage_per_page
self.update_progress( self.update_progress(
@ -251,23 +239,9 @@ class DocumentToPixels(DangerzoneConverter):
) )
pix = page.get_pixmap(dpi=DEFAULT_DPI) pix = page.get_pixmap(dpi=DEFAULT_DPI)
rgb_buf = pix.samples_mv rgb_buf = pix.samples_mv
await self.write_page_width(pix.width, width_filename) await self.write_page_width(pix.width)
await self.write_page_height(pix.height, height_filename) await self.write_page_height(pix.height)
await self.write_page_data(rgb_buf, rgb_filename) await self.write_page_data(rgb_buf)
final_files = (
glob.glob("/tmp/page-*.rgb")
+ glob.glob("/tmp/page-*.width")
+ glob.glob("/tmp/page-*.height")
)
# XXX: Sanity check to avoid situations like #560.
if not running_on_qubes() and len(final_files) != 3 * doc.page_count:
raise errors.PageCountMismatch()
# Move converted files into /tmp/dangerzone
for filename in final_files:
shutil.move(filename, "/tmp/dangerzone")
self.update_progress("Converted document to pixels") self.update_progress("Converted document to pixels")
@ -298,18 +272,28 @@ class DocumentToPixels(DangerzoneConverter):
return mime_type return mime_type
async def main() -> int: async def main() -> None:
converter = DocumentToPixels() try:
data = await DocumentToPixels.read_bytes()
except EOFError:
sys.exit(1)
with open("/tmp/input_file", "wb") as f:
f.write(data)
try: try:
converter = DocumentToPixels()
await converter.convert() await converter.convert()
error_code = 0 # Success! except errors.ConversionException as e:
except errors.ConversionException as e: # Expected Errors await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr)
error_code = e.error_code sys.exit(e.error_code)
except Exception as e: except Exception as e:
converter.update_progress(str(e), error=True) await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr)
error_code = errors.UnexpectedConversionError.error_code error_code = errors.UnexpectedConversionError.error_code
return error_code sys.exit(error_code)
# Write debug information
await DocumentToPixels.write_bytes(converter.captured_output, file=sys.stderr)
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -1,108 +0,0 @@
import asyncio
import os
import shutil
import sys
import tempfile
from pathlib import Path
from typing import Optional, TextIO
from . import errors
from .doc_to_pixels import DocumentToPixels
def _read_bytes() -> bytes:
"""Read bytes from the stdin."""
data = sys.stdin.buffer.read()
if data is None:
raise EOFError
return data
def _write_bytes(data: bytes, file: TextIO = sys.stdout) -> None:
file.buffer.write(data)
def _write_text(text: str, file: TextIO = sys.stdout) -> None:
_write_bytes(text.encode(), file=file)
def _write_int(num: int, file: TextIO = sys.stdout) -> None:
_write_bytes(num.to_bytes(2, signed=False), file=file)
# ==== ASYNC METHODS ====
# We run sync methods in async wrappers, because pure async methods are more difficult:
# https://stackoverflow.com/a/52702646
#
# In practice, because they are I/O bound and we don't have many running concurrently,
# they shouldn't cause a problem.
async def read_bytes() -> bytes:
return await asyncio.to_thread(_read_bytes)
async def write_bytes(data: bytes, file: TextIO = sys.stdout) -> None:
return await asyncio.to_thread(_write_bytes, data, file=file)
async def write_text(text: str, file: TextIO = sys.stdout) -> None:
return await asyncio.to_thread(_write_text, text, file=file)
async def write_int(num: int, file: TextIO = sys.stdout) -> None:
return await asyncio.to_thread(_write_int, num, file=file)
class QubesDocumentToPixels(DocumentToPixels):
# Override the write_page_* functions to stream data back to the caller, instead of
# writing it to separate files. This way, we have more accurate progress reports and
# client-side timeouts. See also:
#
# https://github.com/freedomofpress/dangerzone/issues/443
# https://github.com/freedomofpress/dangerzone/issues/557
async def write_page_count(self, count: int) -> None:
return await write_int(count)
async def write_page_width(self, width: int, filename: str) -> None:
return await write_int(width)
async def write_page_height(self, height: int, filename: str) -> None:
return await write_int(height)
async def write_page_data(self, data: bytes, filename: str) -> None:
return await write_bytes(data)
async def main() -> None:
out_dir = Path("/tmp/dangerzone")
if out_dir.exists():
shutil.rmtree(out_dir)
out_dir.mkdir()
try:
data = await read_bytes()
except EOFError:
sys.exit(1)
with open("/tmp/input_file", "wb") as f:
f.write(data)
try:
converter = QubesDocumentToPixels()
await converter.convert()
except errors.ConversionException as e:
await write_bytes(str(e).encode(), file=sys.stderr)
sys.exit(e.error_code)
except Exception as e:
await write_bytes(str(e).encode(), file=sys.stderr)
error_code = errors.UnexpectedConversionError.error_code
sys.exit(error_code)
# Write debug information
await write_bytes(converter.captured_output, file=sys.stderr)
if __name__ == "__main__":
sys.exit(asyncio.run(main()))

View file

@ -22,12 +22,12 @@ class PixelsToPDF(DangerzoneConverter):
) -> None: ) -> None:
self.percentage = 50.0 self.percentage = 50.0
if tempdir is None: if tempdir is None:
tempdir = "/tmp" tempdir = "/safezone"
# XXX lazy loading of fitz module to avoid import issues on non-Qubes systems # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems
import fitz import fitz
num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb")) num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb"))
total_size = 0.0 total_size = 0.0
safe_doc = fitz.Document() safe_doc = fitz.Document()
@ -35,7 +35,7 @@ class PixelsToPDF(DangerzoneConverter):
# Convert RGB files to PDF files # Convert RGB files to PDF files
percentage_per_page = 45.0 / num_pages percentage_per_page = 45.0 / num_pages
for page_num in range(1, num_pages + 1): for page_num in range(1, num_pages + 1):
filename_base = f"{tempdir}/dangerzone/page-{page_num}" filename_base = f"{tempdir}/pixels/page-{page_num}"
rgb_filename = f"{filename_base}.rgb" rgb_filename = f"{filename_base}.rgb"
width_filename = f"{filename_base}.width" width_filename = f"{filename_base}.width"
height_filename = f"{filename_base}.height" height_filename = f"{filename_base}.height"
@ -90,6 +90,18 @@ class PixelsToPDF(DangerzoneConverter):
safe_doc.save(safe_pdf_path, deflate_images=True) safe_doc.save(safe_pdf_path, deflate_images=True)
def update_progress(self, text: str, *, error: bool = False) -> None:
    """Report conversion progress to whoever is listening.

    Outside Qubes, a JSON object with the keys ``error``, ``text`` and
    ``percentage`` is printed to stdout and flushed. On Qubes, the registered
    progress callback (if any) is invoked instead.

    Args:
        text: human-readable progress message.
        error: whether the message describes a failure.
    """
    if not running_on_qubes():
        report = {"error": error, "text": text, "percentage": int(self.percentage)}
        print(json.dumps(report))
        sys.stdout.flush()
        return

    if self.progress_callback:
        self.progress_callback(error, text, int(self.percentage))
async def main() -> int: async def main() -> int:
ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None

View file

@ -1,13 +1,18 @@
import logging import logging
import os
import subprocess import subprocess
import sys
import tempfile
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Callable, Optional from pathlib import Path
from typing import IO, Callable, Optional
from colorama import Fore, Style from colorama import Fore, Style
from ..conversion.errors import ConversionException from ..conversion import errors
from ..conversion.common import calculate_timeout
from ..document import Document from ..document import Document
from ..util import replace_control_chars from ..util import Stopwatch, nonblocking_read, replace_control_chars
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -18,11 +23,43 @@ PIXELS_TO_PDF_LOG_START = "----- PIXELS TO PDF LOG START -----"
PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----" PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----"
def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes:
    """Read up to ``size`` bytes from a file-like object within ``timeout``.

    Raises:
        errors.InterruptedConversion: if ``exact`` is set and the number of
            bytes obtained differs from ``size``.
    """
    chunk = nonblocking_read(f, size, timeout)
    if not exact:
        return chunk
    if len(chunk) != size:
        raise errors.InterruptedConversion
    return chunk
def read_int(f: IO[bytes], timeout: float) -> int:
    """Read 2 bytes from a file-like object and decode them as an unsigned int.

    Args:
        f: binary stream to read from.
        timeout: how long the read may take, forwarded to ``read_bytes``.

    Returns:
        The decoded value (big-endian, unsigned).
    """
    untrusted_int = read_bytes(f, 2, timeout)
    # Spell out the byte order: ``byteorder`` only gained a default ("big") in
    # Python 3.11, so omitting it raises TypeError on older interpreters.
    return int.from_bytes(untrusted_int, "big", signed=False)
def read_debug_text(f: IO[bytes], size: int) -> str:
    """Read at most ``size`` bytes of debug output and return it as text.

    Fewer bytes than requested are accepted. Bytes that are not valid ASCII
    are substituted with the replacement character rather than raising.
    """
    untrusted_text = read_bytes(f, size, calculate_timeout(size), exact=False)
    return str(untrusted_text, "ascii", errors="replace")
class IsolationProvider(ABC): class IsolationProvider(ABC):
""" """
Abstracts an isolation provider Abstracts an isolation provider
""" """
STARTUP_TIME_SECONDS = 0 # The maximum time it takes the provider to start up.
def __init__(self) -> None:
    """Initialise progress tracking and decide where the child's stderr goes.

    The conversion process' stderr is captured through a pipe only when
    ``sys.dangerzone_dev`` is set to a truthy value; otherwise it is discarded.
    """
    self.percentage = 0.0
    self.proc: Optional[subprocess.Popen] = None
    # Use plain truthiness rather than ``== True``: doc_to_pixels() decides
    # whether to read stderr with the same truthiness test, so the two checks
    # must agree or stderr would be read while redirected to DEVNULL.
    if getattr(sys, "dangerzone_dev", False):
        self.proc_stderr = subprocess.PIPE
    else:
        self.proc_stderr = subprocess.DEVNULL
@abstractmethod @abstractmethod
def install(self) -> bool: def install(self) -> bool:
pass pass
@ -36,29 +73,104 @@ class IsolationProvider(ABC):
self.progress_callback = progress_callback self.progress_callback = progress_callback
document.mark_as_converting() document.mark_as_converting()
try: try:
success = self._convert(document, ocr_lang) with tempfile.TemporaryDirectory() as t:
except ConversionException as e: Path(f"{t}/pixels").mkdir()
success = False self.doc_to_pixels(document, t)
# TODO: validate convert to pixels output
self.pixels_to_pdf(document, t, ocr_lang)
document.mark_as_safe()
if document.archive_after_conversion:
document.archive()
except errors.InterruptedConversion:
assert self.proc is not None
error_code = self.proc.wait(3)
# XXX Reconstruct exception from error code
exception = errors.exception_from_error_code(error_code)
document.mark_as_failed()
except errors.ConversionException as e:
self.print_progress_trusted(document, True, str(e), 0) self.print_progress_trusted(document, True, str(e), 0)
document.mark_as_failed()
except Exception as e: except Exception as e:
success = False
log.exception( log.exception(
f"An exception occurred while converting document '{document.id}'" f"An exception occurred while converting document '{document.id}'"
) )
self.print_progress_trusted(document, True, str(e), 0) self.print_progress_trusted(document, True, str(e), 0)
if success:
document.mark_as_safe()
if document.archive_after_conversion:
document.archive()
else:
document.mark_as_failed() document.mark_as_failed()
def doc_to_pixels(self, document: Document, tempdir: str) -> None:
    """Stream a document to the conversion process and store the returned pages.

    The input file is written to the stdin of the process returned by
    ``start_doc_to_pixels_proc()``. Its stdout is then read as: a 2-byte page
    count, followed for every page by a 2-byte width, a 2-byte height and
    ``width * height * 3`` bytes of pixel data. Each page is saved under
    ``{tempdir}/pixels`` as ``page-N.width``, ``page-N.height`` and
    ``page-N.rgb``.

    Args:
        document: the document to convert; ``input_filename`` is read.
        tempdir: directory that already contains a ``pixels`` subdirectory.

    Raises:
        errors.InterruptedConversion: if the process closes its stdin early
            (or, via the read helpers, when a read comes back short).
        errors.MaxPagesException: if the page count is 0 or above the limit.
        errors.MaxPageWidthException: if a page width is out of bounds.
        errors.MaxPageHeightException: if a page height is out of bounds.
    """
    # Start each document from zero. The provider instance may be reused for
    # several documents, and the counter would otherwise keep accumulating.
    self.percentage = 0.0

    # The input file only needs to stay open while it is fed to the process.
    with open(document.input_filename, "rb") as f:
        self.proc = self.start_doc_to_pixels_proc()
        try:
            assert self.proc.stdin is not None
            self.proc.stdin.write(f.read())
            self.proc.stdin.close()
        except BrokenPipeError as e:
            # Keep the original error attached as the cause for debugging.
            raise errors.InterruptedConversion() from e

    # Get file size (in MiB)
    size = os.path.getsize(document.input_filename) / 1024**2
    # The first read also has to cover the time the provider takes to start.
    timeout = calculate_timeout(size) + self.STARTUP_TIME_SECONDS

    assert self.proc is not None
    assert self.proc.stdout is not None
    os.set_blocking(self.proc.stdout.fileno(), False)

    n_pages = read_int(self.proc.stdout, timeout)
    if n_pages == 0 or n_pages > errors.MAX_PAGES:
        raise errors.MaxPagesException()
    percentage_per_page = 50.0 / n_pages

    # One shared time budget for all pages, tracked by the stopwatch.
    timeout = calculate_timeout(size, n_pages)
    sw = Stopwatch(timeout)
    sw.start()
    for page in range(1, n_pages + 1):
        text = f"Converting page {page}/{n_pages} to pixels"
        self.print_progress_trusted(document, False, text, self.percentage)

        width = read_int(self.proc.stdout, timeout=sw.remaining)
        height = read_int(self.proc.stdout, timeout=sw.remaining)
        # Dimensions come from the untrusted side; bound them before using
        # them to size the pixel read.
        if not (1 <= width <= errors.MAX_PAGE_WIDTH):
            raise errors.MaxPageWidthException()
        if not (1 <= height <= errors.MAX_PAGE_HEIGHT):
            raise errors.MaxPageHeightException()

        num_pixels = width * height * 3  # three color channels
        untrusted_pixels = read_bytes(
            self.proc.stdout,
            num_pixels,
            timeout=sw.remaining,
        )

        # Wrapper code
        with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width:
            f_width.write(str(width))
        with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height:
            f_height.write(str(height))
        with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb:
            f_rgb.write(untrusted_pixels)

        self.percentage += percentage_per_page

    # Ensure nothing else is read after all bitmaps are obtained
    self.proc.stdout.close()

    # TODO handle leftover code input
    text = "Converted document to pixels"
    self.print_progress_trusted(document, False, text, self.percentage)

    # stderr is only captured when running in development mode.
    if getattr(sys, "dangerzone_dev", False):
        assert self.proc.stderr is not None
        os.set_blocking(self.proc.stderr.fileno(), False)
        untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS)
        self.proc.stderr.close()
        log.info(
            f"Conversion output (doc to pixels)\n{self.sanitize_conversion_str(untrusted_log)}"
        )
@abstractmethod @abstractmethod
def _convert( def pixels_to_pdf(
self, self, document: Document, tempdir: str, ocr_lang: Optional[str]
document: Document, ) -> None:
ocr_lang: Optional[str],
) -> bool:
pass pass
def _print_progress( def _print_progress(
@ -101,6 +213,10 @@ class IsolationProvider(ABC):
armor_end = DOC_TO_PIXELS_LOG_END armor_end = DOC_TO_PIXELS_LOG_END
return armor_start + conversion_string + armor_end return armor_start + conversion_string + armor_end
@abstractmethod
def start_doc_to_pixels_proc(self) -> subprocess.Popen:
pass
# From global_common: # From global_common:

View file

@ -2,16 +2,14 @@ import gzip
import json import json
import logging import logging
import os import os
import pathlib
import platform import platform
import shlex import shlex
import shutil import shutil
import subprocess import subprocess
import sys import sys
import tempfile from typing import Any, List, Optional
from typing import Any, Callable, List, Optional, Tuple
from ..conversion.errors import exception_from_error_code from ..conversion import errors
from ..document import Document from ..document import Document
from ..util import ( from ..util import (
get_resource_path, get_resource_path,
@ -19,12 +17,7 @@ from ..util import (
get_tmp_dir, get_tmp_dir,
replace_control_chars, replace_control_chars,
) )
from .base import ( from .base import IsolationProvider
MAX_CONVERSION_LOG_CHARS,
PIXELS_TO_PDF_LOG_END,
PIXELS_TO_PDF_LOG_START,
IsolationProvider,
)
# Define startupinfo for subprocesses # Define startupinfo for subprocesses
if platform.system() == "Windows": if platform.system() == "Windows":
@ -45,6 +38,7 @@ class NoContainerTechException(Exception):
class Container(IsolationProvider): class Container(IsolationProvider):
# Name of the dangerzone container # Name of the dangerzone container
CONTAINER_NAME = "dangerzone.rocks/dangerzone" CONTAINER_NAME = "dangerzone.rocks/dangerzone"
STARTUP_TIME_SECONDS = 5
def __init__(self, enable_timeouts: bool) -> None: def __init__(self, enable_timeouts: bool) -> None:
self.enable_timeouts = 1 if enable_timeouts else 0 self.enable_timeouts = 1 if enable_timeouts else 0
@ -179,34 +173,24 @@ class Container(IsolationProvider):
def exec( def exec(
self, self,
document: Document,
args: List[str], args: List[str],
) -> int: ) -> subprocess.Popen:
args_str = " ".join(shlex.quote(s) for s in args) args_str = " ".join(shlex.quote(s) for s in args)
log.info("> " + args_str) log.info("> " + args_str)
with subprocess.Popen( return subprocess.Popen(
args, args,
stdin=None, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, stderr=self.proc_stderr,
bufsize=1,
universal_newlines=True,
startupinfo=startupinfo, startupinfo=startupinfo,
) as p: )
if p.stdout is not None:
for untrusted_line in p.stdout:
self.parse_progress(document, untrusted_line)
p.communicate()
return p.returncode
def exec_container( def exec_container(
self, self,
document: Document,
command: List[str], command: List[str],
extra_args: List[str] = [], extra_args: List[str] = [],
) -> int: ) -> subprocess.Popen:
container_runtime = self.get_runtime() container_runtime = self.get_runtime()
if self.get_runtime_name() == "podman": if self.get_runtime_name() == "podman":
@ -218,6 +202,7 @@ class Container(IsolationProvider):
# drop all linux kernel capabilities # drop all linux kernel capabilities
security_args += ["--cap-drop", "all"] security_args += ["--cap-drop", "all"]
user_args = ["-u", "dangerzone"] user_args = ["-u", "dangerzone"]
enable_stdin = ["-i"]
prevent_leakage_args = ["--rm"] prevent_leakage_args = ["--rm"]
@ -226,60 +211,55 @@ class Container(IsolationProvider):
+ user_args + user_args
+ security_args + security_args
+ prevent_leakage_args + prevent_leakage_args
+ enable_stdin
+ extra_args + extra_args
+ [self.CONTAINER_NAME] + [self.CONTAINER_NAME]
+ command + command
) )
args = [container_runtime] + args args = [container_runtime] + args
return self.exec(document, args) return self.exec(args)
def _convert( def pixels_to_pdf(
self, self, document: Document, tempdir: str, ocr_lang: Optional[str]
document: Document, ) -> None:
ocr_lang: Optional[str], # Convert pixels to safe PDF
) -> bool: command = [
# Create a temporary directory inside the cache directory for this run. Then, "/usr/bin/python3",
# create some subdirectories for the various stages of the file conversion: "-m",
# "dangerzone.conversion.pixels_to_pdf",
# * unsafe: Where the input file will be copied ]
# * pixel: Where the RGB data will be stored extra_args = [
# * safe: Where the final PDF file will be stored "-v",
with tempfile.TemporaryDirectory(dir=get_tmp_dir()) as t: f"{tempdir}:/safezone:Z",
tmp_dir = pathlib.Path(t) "-e",
unsafe_dir = tmp_dir / "unsafe" "TESSDATA_PREFIX=/usr/share/tessdata",
unsafe_dir.mkdir() "-e",
pixel_dir = tmp_dir / "pixels" f"OCR={0 if ocr_lang is None else 1}",
pixel_dir.mkdir() "-e",
safe_dir = tmp_dir / "safe" f"OCR_LANGUAGE={ocr_lang}",
safe_dir.mkdir() "-e",
f"ENABLE_TIMEOUTS={self.enable_timeouts}",
]
return self._convert_with_tmpdirs( pixels_to_pdf_proc = self.exec_container(command, extra_args)
document=document, for line in pixels_to_pdf_proc.stdout:
unsafe_dir=unsafe_dir, self.parse_progress(document, line)
pixel_dir=pixel_dir, error_code = pixels_to_pdf_proc.wait()
safe_dir=safe_dir, if error_code != 0:
ocr_lang=ocr_lang, log.error("pixels-to-pdf failed")
) raise errors.exception_from_error_code(error_code) # type: ignore [misc]
def _convert_with_tmpdirs(
self,
document: Document,
unsafe_dir: pathlib.Path,
pixel_dir: pathlib.Path,
safe_dir: pathlib.Path,
ocr_lang: Optional[str],
) -> bool:
success = False
if ocr_lang:
ocr = "1"
else: else:
ocr = "0" # Move the final file to the right place
if os.path.exists(document.output_filename):
os.remove(document.output_filename)
copied_file = unsafe_dir / "input_file" container_output_filename = os.path.join(
shutil.copyfile(f"{document.input_filename}", copied_file) tempdir, "safe-output-compressed.pdf"
)
shutil.move(container_output_filename, document.output_filename)
def start_doc_to_pixels_proc(self) -> subprocess.Popen:
# Convert document to pixels # Convert document to pixels
command = [ command = [
"/usr/bin/python3", "/usr/bin/python3",
@ -287,60 +267,10 @@ class Container(IsolationProvider):
"dangerzone.conversion.doc_to_pixels", "dangerzone.conversion.doc_to_pixels",
] ]
extra_args = [ extra_args = [
"-v",
f"{copied_file}:/tmp/input_file:Z",
"-v",
f"{pixel_dir}:/tmp/dangerzone:Z",
"-e", "-e",
f"ENABLE_TIMEOUTS={self.enable_timeouts}", f"ENABLE_TIMEOUTS={self.enable_timeouts}",
] ]
ret = self.exec_container(document, command, extra_args) return self.exec_container(command, extra_args)
if ret != 0:
log.error("documents-to-pixels failed")
# XXX Reconstruct exception from error code
raise exception_from_error_code(ret) # type: ignore [misc]
else:
# TODO: validate convert to pixels output
# Convert pixels to safe PDF
command = [
"/usr/bin/python3",
"-m",
"dangerzone.conversion.pixels_to_pdf",
]
extra_args = [
"-v",
f"{pixel_dir}:/tmp/dangerzone:Z",
"-v",
f"{safe_dir}:/safezone:Z",
"-e",
"TESSDATA_PREFIX=/usr/share/tessdata",
"-e",
f"OCR={ocr}",
"-e",
f"OCR_LANGUAGE={ocr_lang}",
"-e",
f"ENABLE_TIMEOUTS={self.enable_timeouts}",
]
ret = self.exec_container(document, command, extra_args)
if ret != 0:
log.error("pixels-to-pdf failed")
else:
# Move the final file to the right place
if os.path.exists(document.output_filename):
os.remove(document.output_filename)
container_output_filename = os.path.join(
safe_dir, "safe-output-compressed.pdf"
)
shutil.move(container_output_filename, document.output_filename)
# We did it
success = True
return success
def get_max_parallel_conversions(self) -> int: def get_max_parallel_conversions(self) -> int:
# FIXME hardcoded 1 until timeouts are more limited and better handled # FIXME hardcoded 1 until timeouts are more limited and better handled

View file

@ -1,8 +1,10 @@
import logging import logging
import os import os
import shutil import shutil
import subprocess
import sys import sys
import time import time
from pathlib import Path
from typing import Callable, Optional from typing import Callable, Optional
from ..document import Document from ..document import Document
@ -30,20 +32,20 @@ class Dummy(IsolationProvider):
def install(self) -> bool: def install(self) -> bool:
return True return True
def _convert( def convert(
self, self,
document: Document, document: Document,
ocr_lang: Optional[str], ocr_lang: Optional[str],
) -> bool: progress_callback: Optional[Callable] = None,
) -> None:
self.progress_callback = None
log.debug("Dummy converter started:") log.debug("Dummy converter started:")
log.debug( log.debug(
f" - document: {os.path.basename(document.input_filename)} ({document.id})" f" - document: {os.path.basename(document.input_filename)} ({document.id})"
) )
log.debug(f" - ocr : {ocr_lang}") log.debug(f" - ocr : {ocr_lang}")
log.debug("\n(simulating conversion)") log.debug("\n(simulating conversion)")
success = True success = True
progress = [ progress = [
[False, "Converting to PDF using GraphicsMagick", 0.0], [False, "Converting to PDF using GraphicsMagick", 0.0],
[False, "Separating document into pages", 3.0], [False, "Separating document into pages", 3.0],
@ -54,19 +56,26 @@ class Dummy(IsolationProvider):
[False, "Compressing PDF", 97.0], [False, "Compressing PDF", 97.0],
[False, "Safe PDF created", 100.0], [False, "Safe PDF created", 100.0],
] ]
for error, text, percentage in progress: for error, text, percentage in progress:
self.print_progress(document, error, text, percentage) # type: ignore [arg-type] self.print_progress(document, error, text, percentage) # type: ignore [arg-type]
if error: if error:
success = False success = False
time.sleep(0.2) time.sleep(0.2)
if success: if success:
shutil.copy( shutil.copy(
get_resource_path("dummy_document.pdf"), document.output_filename get_resource_path("dummy_document.pdf"), document.output_filename
) )
document.mark_as_safe()
if document.archive_after_conversion:
document.archive()
return success def pixels_to_pdf(
self, document: Document, tempdir: str, ocr_lang: Optional[str]
) -> None:
pass
def start_doc_to_pixels_proc(self) -> subprocess.Popen:
return subprocess.Popen("True")
def get_max_parallel_conversions(self) -> int: def get_max_parallel_conversions(self) -> int:
return 1 return 1

View file

@ -1,5 +1,4 @@
import asyncio import asyncio
import glob
import inspect import inspect
import io import io
import logging import logging
@ -7,60 +6,25 @@ import os
import shutil import shutil
import subprocess import subprocess
import sys import sys
import tempfile
import time
import zipfile import zipfile
from pathlib import Path from pathlib import Path
from typing import IO, Callable, Optional from typing import IO, Callable, Optional
from ..conversion import errors from ..conversion import errors
from ..conversion.common import calculate_timeout, running_on_qubes from ..conversion.common import running_on_qubes
from ..conversion.pixels_to_pdf import PixelsToPDF from ..conversion.pixels_to_pdf import PixelsToPDF
from ..document import Document from ..document import Document
from ..util import ( from ..util import get_resource_path
Stopwatch, from .base import PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, IsolationProvider
get_resource_path,
get_subprocess_startupinfo,
get_tmp_dir,
nonblocking_read,
)
from .base import (
MAX_CONVERSION_LOG_CHARS,
PIXELS_TO_PDF_LOG_END,
PIXELS_TO_PDF_LOG_START,
IsolationProvider,
)
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# The maximum time a qube takes to start up.
STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes
def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes:
"""Read bytes from a file-like object."""
buf = nonblocking_read(f, size, timeout)
if exact and len(buf) != size:
raise errors.InterruptedConversion
return buf
def read_int(f: IO[bytes], timeout: float) -> int:
"""Read 2 bytes from a file-like object, and decode them as int."""
untrusted_int = read_bytes(f, 2, timeout)
return int.from_bytes(untrusted_int, signed=False)
def read_debug_text(f: IO[bytes], size: int) -> str:
"""Read arbitrarily long text (for debug purposes)"""
timeout = calculate_timeout(size)
untrusted_text = read_bytes(f, size, timeout, exact=False)
return untrusted_text.decode("ascii", errors="replace")
class Qubes(IsolationProvider): class Qubes(IsolationProvider):
"""Uses a disposable qube for performing the conversion""" """Uses a disposable qube for performing the conversion"""
STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes
def __init__(self) -> None: def __init__(self) -> None:
self.proc: Optional[subprocess.Popen] = None self.proc: Optional[subprocess.Popen] = None
super().__init__() super().__init__()
@ -68,86 +32,9 @@ class Qubes(IsolationProvider):
def install(self) -> bool: def install(self) -> bool:
return True return True
def _convert_with_tmpdirs( def pixels_to_pdf(
self, self, document: Document, tempdir: str, ocr_lang: Optional[str]
document: Document, ) -> None:
tempdir: str,
ocr_lang: Optional[str] = None,
) -> bool:
success = False
Path(f"{tempdir}/dangerzone").mkdir()
percentage = 0.0
with open(document.input_filename, "rb") as f:
self.proc = self.qrexec_subprocess()
try:
assert self.proc.stdin is not None
self.proc.stdin.write(f.read())
self.proc.stdin.close()
except BrokenPipeError as e:
raise errors.InterruptedConversion()
# Get file size (in MiB)
size = os.path.getsize(document.input_filename) / 1024**2
timeout = calculate_timeout(size) + STARTUP_TIME_SECONDS
assert self.proc is not None
assert self.proc.stdout is not None
os.set_blocking(self.proc.stdout.fileno(), False)
n_pages = read_int(self.proc.stdout, timeout)
if n_pages == 0 or n_pages > errors.MAX_PAGES:
raise errors.MaxPagesException()
percentage_per_page = 50.0 / n_pages
timeout = calculate_timeout(size, n_pages)
sw = Stopwatch(timeout)
sw.start()
for page in range(1, n_pages + 1):
text = f"Converting page {page}/{n_pages} to pixels"
self.print_progress_trusted(document, False, text, percentage)
width = read_int(self.proc.stdout, timeout=sw.remaining)
height = read_int(self.proc.stdout, timeout=sw.remaining)
if not (1 <= width <= errors.MAX_PAGE_WIDTH):
raise errors.MaxPageWidthException()
if not (1 <= height <= errors.MAX_PAGE_HEIGHT):
raise errors.MaxPageHeightException()
num_pixels = width * height * 3 # three color channels
untrusted_pixels = read_bytes(
self.proc.stdout,
num_pixels,
timeout=sw.remaining,
)
# Wrapper code
with open(f"{tempdir}/dangerzone/page-{page}.width", "w") as f_width:
f_width.write(str(width))
with open(f"{tempdir}/dangerzone/page-{page}.height", "w") as f_height:
f_height.write(str(height))
with open(f"{tempdir}/dangerzone/page-{page}.rgb", "wb") as f_rgb:
f_rgb.write(untrusted_pixels)
percentage += percentage_per_page
# Ensure nothing else is read after all bitmaps are obtained
self.proc.stdout.close()
# TODO handle leftover code input
text = "Converted document to pixels"
self.print_progress_trusted(document, False, text, percentage)
if getattr(sys, "dangerzone_dev", False):
assert self.proc.stderr is not None
os.set_blocking(self.proc.stderr.fileno(), False)
untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS)
self.proc.stderr.close()
log.info(
f"Conversion output (doc to pixels)\n{self.sanitize_conversion_str(untrusted_log)}"
)
def print_progress_wrapper(error: bool, text: str, percentage: float) -> None: def print_progress_wrapper(error: bool, text: str, percentage: float) -> None:
self.print_progress_trusted(document, error, text, percentage) self.print_progress_trusted(document, error, text, percentage)
@ -166,28 +53,11 @@ class Qubes(IsolationProvider):
log.info(text) log.info(text)
shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename) shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename)
success = True
return success
def _convert(
    self,
    document: Document,
    ocr_lang: Optional[str] = None,
) -> bool:
    """Convert `document` using a throwaway temporary directory.

    The actual work is delegated to `_convert_with_tmpdirs`; the temporary
    directory is removed when the conversion returns or raises.

    Args:
        document: The document to convert.
        ocr_lang: Optional OCR language, passed through unchanged.

    Returns:
        Whatever `_convert_with_tmpdirs` returns.

    Raises:
        The exception built by `errors.exception_from_error_code` from the
        process's exit code, when the conversion was interrupted.
    """
    try:
        with tempfile.TemporaryDirectory() as workdir:
            return self._convert_with_tmpdirs(document, workdir, ocr_lang)
    except errors.InterruptedConversion:
        assert self.proc is not None
        # Give the process up to 3 seconds to exit, then turn its exit code
        # into the matching conversion exception.
        # XXX Reconstruct exception from error code
        raise errors.exception_from_error_code(self.proc.wait(3))  # type: ignore [misc]
def get_max_parallel_conversions(self) -> int: def get_max_parallel_conversions(self) -> int:
return 1 return 1
def qrexec_subprocess(self) -> subprocess.Popen: def start_doc_to_pixels_proc(self) -> subprocess.Popen:
dev_mode = getattr(sys, "dangerzone_dev", False) == True dev_mode = getattr(sys, "dangerzone_dev", False) == True
if dev_mode: if dev_mode:
# Use dz.ConvertDev RPC call instead, if we are in development mode. # Use dz.ConvertDev RPC call instead, if we are in development mode.

View file

@ -1,2 +1,2 @@
#!/bin/sh #!/bin/sh
python -m dangerzone.conversion.doc_to_pixels_qubes_wrapper python -m dangerzone.conversion.doc_to_pixels

View file

@ -34,7 +34,7 @@ def main():
say(f"Importing the conversion module") say(f"Importing the conversion module")
sys.path.insert(0, t.name) sys.path.insert(0, t.name)
from dangerzone.conversion.doc_to_pixels_qubes_wrapper import main from dangerzone.conversion.doc_to_pixels import main
return asyncio.run(main()) return asyncio.run(main())

View file

@ -13,7 +13,8 @@ from .. import pdf_11k_pages, sanitized_text, uncommon_text
@pytest.mark.skipif( @pytest.mark.skipif(
os.environ.get("DUMMY_CONVERSION", False), reason="dummy conversions not supported" os.environ.get("DUMMY_CONVERSION", False),
reason="dummy conversions not supported",
) )
@pytest.mark.skipif(not running_on_qubes(), reason="Not on a Qubes system") @pytest.mark.skipif(not running_on_qubes(), reason="Not on a Qubes system")
class IsolationProviderTest: class IsolationProviderTest:

View file

@ -69,7 +69,7 @@ class TestQubes(IsolationProviderTest):
) -> None: ) -> None:
provider.progress_callback = mocker.MagicMock() provider.progress_callback = mocker.MagicMock()
def qrexec_subprocess() -> subprocess.Popen: def start_doc_to_pixels_proc() -> subprocess.Popen:
p = subprocess.Popen( p = subprocess.Popen(
# XXX error 126 simulates a qrexec-policy failure. Source: # XXX error 126 simulates a qrexec-policy failure. Source:
# https://github.com/QubesOS/qubes-core-qrexec/blob/fdcbfd7/daemon/qrexec-daemon.c#L1022 # https://github.com/QubesOS/qubes-core-qrexec/blob/fdcbfd7/daemon/qrexec-daemon.c#L1022
@ -81,7 +81,9 @@ class TestQubes(IsolationProviderTest):
) )
return p return p
monkeypatch.setattr(provider, "qrexec_subprocess", qrexec_subprocess) monkeypatch.setattr(
provider, "start_doc_to_pixels_proc", start_doc_to_pixels_proc
)
with pytest.raises(errors.QubesQrexecFailed) as e: with pytest.raises(errors.QubesQrexecFailed) as e:
doc = Document(sample_doc) doc = Document(sample_doc)

View file

@ -290,7 +290,6 @@ class TestCliConversion(TestCliBasic):
def test_dummy_conversion(self, tmp_path: Path, sample_pdf: str) -> None: def test_dummy_conversion(self, tmp_path: Path, sample_pdf: str) -> None:
result = self.run_cli([sample_pdf, "--unsafe-dummy-conversion"]) result = self.run_cli([sample_pdf, "--unsafe-dummy-conversion"])
result.assert_success()
def test_dummy_conversion_bulk(self, tmp_path: Path, sample_pdf: str) -> None: def test_dummy_conversion_bulk(self, tmp_path: Path, sample_pdf: str) -> None:
filenames = ["1.pdf", "2.pdf", "3.pdf"] filenames = ["1.pdf", "2.pdf", "3.pdf"]