dangerzone/dangerzone/isolation_provider/container.py

import gzip
import json
import logging
import os
import pathlib
import pipes
import platform
import shlex
import shutil
import subprocess
import tempfile
from typing import Callable, List, Optional, Tuple

from ..document import Document
from ..util import get_resource_path, get_subprocess_startupinfo, get_tmp_dir
from .base import IsolationProvider

# Define the startupinfo object handed to subprocess.Popen by Container.exec().
# On Windows a STARTUPINFO with STARTF_USESHOWWINDOW set is used (presumably to keep
# a console window from popping up for the child process -- TODO confirm); on every
# other platform no startup info is passed at all.
if platform.system() == "Windows":
    startupinfo = subprocess.STARTUPINFO()  # type: ignore [attr-defined]
    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW  # type: ignore [attr-defined]
else:
    startupinfo = None


# Module-level logger, named after this module
log = logging.getLogger(__name__)


class NoContainerTechException(Exception):
    """
    Raised when the executable of the required container runtime cannot be found.
    """

    def __init__(self, container_tech: str) -> None:
        message = f"{container_tech} is not installed"
        super().__init__(message)


class Container(IsolationProvider):

    # Name of the dangerzone container image
    CONTAINER_NAME = "dangerzone.rocks/dangerzone"

    def __init__(self, enable_timeouts: bool) -> None:
        self.enable_timeouts = 1 if enable_timeouts else 0
        super().__init__()

    @staticmethod
    def get_runtime_name() -> str:
        if platform.system() == "Linux":
            runtime_name = "podman"
        else:
            # Windows, Darwin, and unknown use docker for now, dangerzone-vm eventually
            runtime_name = "docker"
        return runtime_name

    @staticmethod
    def get_runtime() -> str:
        container_tech = Container.get_runtime_name()
        runtime = shutil.which(container_tech)
        if runtime is None:
            raise NoContainerTechException(container_tech)
        return runtime

    @staticmethod
    def install() -> bool:
        """
        Make sure the Dangerzone container image is installed in the container runtime.

        If the image is not there yet, the bundled ``container.tar.gz`` resource is
        decompressed on the fly and streamed into the stdin of ``<runtime> load``.

        Returns True if the image is installed when this method finishes, and
        False otherwise.
        """
        if Container.is_container_installed():
            return True

        # Load the container image into the runtime
        log.info("Installing Dangerzone container image...")

        p = subprocess.Popen(
            [Container.get_runtime(), "load"],
            stdin=subprocess.PIPE,
            startupinfo=get_subprocess_startupinfo(),
        )

        chunk_size = 10240
        compressed_container_path = get_resource_path("container.tar.gz")
        try:
            with gzip.open(compressed_container_path) as f:
                if p.stdin:
                    shutil.copyfileobj(f, p.stdin, chunk_size)
        except BrokenPipeError:
            # The runtime exited before it read the whole image. Do not raise
            # here: the installation check below reports the failure.
            log.error("Container runtime stopped reading the image before it was sent")
        finally:
            # Always close stdin and wait for the runtime, even when streaming
            # failed, so that the child process is not left behind.
            p.communicate()

        if not Container.is_container_installed():
            log.error("Failed to install the container image")
            return False

        log.info("Container image installed")
        return True

    @staticmethod
    def is_container_installed() -> bool:
        """
        Check whether the expected Dangerzone container image is installed.

        The ID stored in the bundled ``image-id.txt`` resource is compared with the
        IDs that the container runtime lists for CONTAINER_NAME. Images listed
        under that name with a different ID are considered outdated and are
        removed on a best-effort basis.

        Returns True only if an image with the expected ID is present.
        """
        # Get the expected image id
        with open(get_resource_path("image-id.txt")) as f:
            expected_image_id = f.read().strip()

        runtime = Container.get_runtime()

        # Ask the runtime for the IDs of the images known under our image name
        output = subprocess.check_output(
            [
                runtime,
                "image",
                "list",
                "--format",
                "{{.ID}}",
                Container.CONTAINER_NAME,
            ],
            text=True,
            startupinfo=get_subprocess_startupinfo(),
        )

        # The output holds one ID per line and may therefore span several lines.
        # Look at each line on its own instead of comparing the whole output
        # against a single ID.
        installed = False
        stale_image_ids: List[str] = []
        for raw_line in output.splitlines():
            image_id = raw_line.strip()
            if not image_id:
                continue
            if image_id == expected_image_id:
                installed = True
            elif image_id not in stale_image_ids:
                stale_image_ids.append(image_id)

        if stale_image_ids:
            log.info("Deleting old dangerzone container image")

            try:
                subprocess.check_output(
                    [runtime, "rmi", "--force"] + stale_image_ids,
                    startupinfo=get_subprocess_startupinfo(),
                )
            except (subprocess.SubprocessError, OSError):
                # Removing old images is best-effort only, so carry on
                log.warning("Couldn't delete old container image, so leaving it there")

        return installed

    def parse_progress(self, document: Document, line: str) -> Tuple[bool, str, int]:
        """
        Parses a line returned by the container.
        """
        try:
            status = json.loads(line)
        except:
            error_message = f"Invalid JSON returned from container:\n\n\t {line}"
            log.error(error_message)
            return (True, error_message, -1)

        self.print_progress(
            document, status["error"], status["text"], status["percentage"]
        )
        return (status["error"], status["text"], status["percentage"])

    def exec(
        self,
        document: Document,
        args: List[str],
        stdout_callback: Optional[Callable] = None,
    ) -> int:
        args_str = " ".join(pipes.quote(s) for s in args)
        log.info("> " + args_str)

        with subprocess.Popen(
            args,
            stdin=None,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True,
            startupinfo=startupinfo,
        ) as p:
            if p.stdout is not None:
                for line in p.stdout:
                    (error, text, percentage) = self.parse_progress(document, line)
                    if stdout_callback:
                        stdout_callback(error, text, percentage)

            p.communicate()
            return p.returncode

    def exec_container(
        self,
        document: Document,
        command: List[str],
        extra_args: List[str] = [],
        stdout_callback: Optional[Callable] = None,
    ) -> int:
        container_runtime = self.get_runtime()

        if self.get_runtime_name() == "podman":
            security_args = ["--security-opt", "no-new-privileges"]
            security_args += ["--userns", "keep-id"]
        else:
            security_args = ["--security-opt=no-new-privileges:true"]

        # drop all linux kernel capabilities
        security_args += ["--cap-drop", "all"]
        user_args = ["-u", "dangerzone"]

        prevent_leakage_args = ["--rm"]

        args = (
            ["run", "--network", "none"]
            + user_args
            + security_args
            + prevent_leakage_args
            + extra_args
            + [self.CONTAINER_NAME]
            + command
        )

        args = [container_runtime] + args
        return self.exec(document, args, stdout_callback)

    def _convert(
        self,
        document: Document,
        ocr_lang: Optional[str],
        stdout_callback: Optional[Callable] = None,
    ) -> bool:
        """
        Convert a document, using a throwaway working directory for this run.

        Returns whatever _convert_with_tmpdirs() returns.
        """
        # Every run gets its own temporary directory below get_tmp_dir(), with
        # one subdirectory per stage of the file conversion:
        #
        # * unsafe: Where the input file will be copied
        # * pixels: Where the RGB data will be stored
        # * safe: Where the final PDF file will be stored
        with tempfile.TemporaryDirectory(dir=get_tmp_dir()) as workdir:
            stage_dirs = {}
            for stage in ("unsafe", "pixels", "safe"):
                stage_dir = pathlib.Path(workdir) / stage
                stage_dir.mkdir()
                stage_dirs[stage] = stage_dir

            return self._convert_with_tmpdirs(
                document=document,
                unsafe_dir=stage_dirs["unsafe"],
                pixel_dir=stage_dirs["pixels"],
                safe_dir=stage_dirs["safe"],
                ocr_lang=ocr_lang,
                stdout_callback=stdout_callback,
            )

    def _convert_with_tmpdirs(
        self,
        document: Document,
        unsafe_dir: pathlib.Path,
        pixel_dir: pathlib.Path,
        safe_dir: pathlib.Path,
        ocr_lang: Optional[str],
        stdout_callback: Optional[Callable] = None,
    ) -> bool:
        success = False

        if ocr_lang:
            ocr = "1"
        else:
            ocr = "0"

        copied_file = unsafe_dir / "input_file"
        shutil.copyfile(f"{document.input_filename}", copied_file)

        # Convert document to pixels
        command = [
            "/usr/bin/python3",
            "/usr/local/bin/dangerzone.py",
            "document-to-pixels",
        ]
        extra_args = [
            "-v",
            f"{copied_file}:/tmp/input_file:Z",
            "-v",
            f"{pixel_dir}:/dangerzone:Z",
            "-e",
            f"ENABLE_TIMEOUTS={self.enable_timeouts}",
        ]
        ret = self.exec_container(document, command, extra_args, stdout_callback)
        if ret != 0:
            log.error("documents-to-pixels failed")
        else:
            # TODO: validate convert to pixels output

            # Convert pixels to safe PDF
            command = [
                "/usr/bin/python3",
                "/usr/local/bin/dangerzone.py",
                "pixels-to-pdf",
            ]
            extra_args = [
                "-v",
                f"{pixel_dir}:/dangerzone:Z",
                "-v",
                f"{safe_dir}:/safezone:Z",
                "-e",
                f"OCR={ocr}",
                "-e",
                f"OCR_LANGUAGE={ocr_lang}",
                "-e",
                f"ENABLE_TIMEOUTS={self.enable_timeouts}",
            ]
            ret = self.exec_container(document, command, extra_args, stdout_callback)
            if ret != 0:
                log.error("pixels-to-pdf failed")
            else:
                # Move the final file to the right place
                if os.path.exists(document.output_filename):
                    os.remove(document.output_filename)

                container_output_filename = os.path.join(
                    safe_dir, "safe-output-compressed.pdf"
                )
                shutil.move(container_output_filename, document.output_filename)

                # We did it
                success = True

        return success

    def get_max_parallel_conversions(self) -> int:

        # FIXME hardcoded 1 until timeouts are more limited and better handled
        # https://github.com/freedomofpress/dangerzone/issues/257
        return 1

        n_cpu = 1  # type: ignore [unreachable]
        if platform.system() == "Linux":
            # if on linux containers run natively
            cpu_count = os.cpu_count()
            if cpu_count is not None:
                n_cpu = cpu_count

        elif self.get_runtime_name() == "docker":
            # For Windows and MacOS containers run in VM
            # So we obtain the CPU count for the VM
            n_cpu_str = subprocess.check_output(
                [self.get_runtime(), "info", "--format", "{{.NCPU}}"],
                text=True,
                startupinfo=get_subprocess_startupinfo(),
            )
            n_cpu = int(n_cpu_str.strip())

        return 2 * n_cpu + 1