dangerzone/dangerzone/container.py
deeplow 1f18f77b64
Disable parallel conversions
Temporarily limit conversions to one at a time until timeout limitations
are resolved: https://github.com/freedomofpress/dangerzone/issues/257
2022-11-23 15:20:28 +00:00

389 lines
11 KiB
Python

import gzip
import json
import logging
import os
import pipes
import platform
import shutil
import subprocess
import tempfile
from typing import Callable, List, Optional, Tuple
import appdirs
from colorama import Fore, Style
from .document import Document
from .util import get_resource_path, get_subprocess_startupinfo
container_name = "dangerzone.rocks/dangerzone"
# Define startupinfo for subprocesses
if platform.system() == "Windows":
startupinfo = subprocess.STARTUPINFO() # type: ignore [attr-defined]
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW # type: ignore [attr-defined]
else:
startupinfo = None
log = logging.getLogger(__name__)
# Name of the dangerzone container
container_name = "dangerzone.rocks/dangerzone"
class NoContainerTechException(Exception):
def __init__(self, container_tech: str) -> None:
super().__init__(f"{container_tech} is not installed")
def get_runtime_name() -> str:
if platform.system() == "Linux":
runtime_name = "podman"
else:
# Windows, Darwin, and unknown use docker for now, dangerzone-vm eventually
runtime_name = "docker"
return runtime_name
def get_runtime() -> str:
container_tech = get_runtime_name()
runtime = shutil.which(container_tech)
if runtime is None:
raise NoContainerTechException(container_tech)
return runtime
def install() -> bool:
"""
Make sure the podman container is installed. Linux only.
"""
if is_container_installed():
return True
# Load the container into podman
log.info("Installing Dangerzone container image...")
p = subprocess.Popen(
[get_runtime(), "load"],
stdin=subprocess.PIPE,
startupinfo=get_subprocess_startupinfo(),
)
chunk_size = 10240
compressed_container_path = get_resource_path("container.tar.gz")
with gzip.open(compressed_container_path) as f:
while True:
chunk = f.read(chunk_size)
if len(chunk) > 0:
if p.stdin:
p.stdin.write(chunk)
else:
break
p.communicate()
if not is_container_installed():
log.error("Failed to install the container image")
return False
log.info("Container image installed")
return True
def is_container_installed() -> bool:
"""
See if the podman container is installed. Linux only.
"""
# Get the image id
with open(get_resource_path("image-id.txt")) as f:
expected_image_id = f.read().strip()
# See if this image is already installed
installed = False
found_image_id = subprocess.check_output(
[
get_runtime(),
"image",
"list",
"--format",
"{{.ID}}",
container_name,
],
text=True,
startupinfo=get_subprocess_startupinfo(),
)
found_image_id = found_image_id.strip()
if found_image_id == expected_image_id:
installed = True
elif found_image_id == "":
pass
else:
log.info("Deleting old dangerzone container image")
try:
subprocess.check_output(
[get_runtime(), "rmi", "--force", found_image_id],
startupinfo=get_subprocess_startupinfo(),
)
except:
log.warning("Couldn't delete old container image, so leaving it there")
return installed
def parse_progress(document: Document, line: str) -> Tuple[bool, str, int]:
"""
Parses a line returned by the container.
"""
try:
status = json.loads(line)
except:
error_message = f"Invalid JSON returned from container:\n\n\t {line}"
log.error(error_message)
return (True, error_message, -1)
s = Style.BRIGHT + Fore.YELLOW + f"[doc {document.id}] "
s += Fore.CYAN + f"{status['percentage']}% "
if status["error"]:
s += Style.RESET_ALL + Fore.RED + status["text"]
log.error(s)
else:
s += Style.RESET_ALL + status["text"]
log.info(s)
return (status["error"], status["text"], status["percentage"])
def exec(
document: Document,
args: List[str],
stdout_callback: Optional[Callable] = None,
) -> int:
args_str = " ".join(pipes.quote(s) for s in args)
log.info("> " + args_str)
with subprocess.Popen(
args,
stdin=None,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
bufsize=1,
universal_newlines=True,
startupinfo=startupinfo,
) as p:
if p.stdout is not None:
for line in p.stdout:
(error, text, percentage) = parse_progress(document, line)
if error:
document.mark_as_failed()
if percentage == 100.0:
document.mark_as_safe()
if stdout_callback:
stdout_callback(error, text, percentage)
p.communicate()
return p.returncode
def exec_container(
document: Document,
command: List[str],
extra_args: List[str] = [],
stdout_callback: Optional[Callable] = None,
) -> int:
container_runtime = get_runtime()
if get_runtime_name() == "podman":
platform_args = []
security_args = ["--security-opt", "no-new-privileges"]
security_args += ["--userns", "keep-id"]
else:
platform_args = ["--platform", "linux/amd64"]
security_args = ["--security-opt=no-new-privileges:true"]
# drop all linux kernel capabilities
security_args += ["--cap-drop", "all"]
user_args = ["-u", "dangerzone"]
prevent_leakage_args = ["--rm"]
args = (
["run", "--network", "none"]
+ platform_args
+ user_args
+ security_args
+ prevent_leakage_args
+ extra_args
+ [container_name]
+ command
)
args = [container_runtime] + args
return exec(document, args, stdout_callback)
def convert(
document: Document,
ocr_lang: Optional[str],
stdout_callback: Optional[Callable] = None,
) -> bool:
success = False
document.mark_as_converting()
if ocr_lang:
ocr = "1"
else:
ocr = "0"
dz_tmp = os.path.join(appdirs.user_config_dir("dangerzone"), "tmp")
os.makedirs(dz_tmp, exist_ok=True)
tmpdir = tempfile.TemporaryDirectory(dir=dz_tmp)
pixel_dir = os.path.join(tmpdir.name, "pixels")
safe_dir = os.path.join(tmpdir.name, "safe")
os.makedirs(pixel_dir, exist_ok=True)
os.makedirs(safe_dir, exist_ok=True)
# Convert document to pixels
command = ["/usr/bin/python3", "/usr/local/bin/dangerzone.py", "document-to-pixels"]
extra_args = [
"-v",
f"{document.input_filename}:/tmp/input_file",
"-v",
f"{pixel_dir}:/dangerzone",
]
ret = exec_container(document, command, extra_args, stdout_callback)
if ret != 0:
log.error("documents-to-pixels failed")
else:
# TODO: validate convert to pixels output
# Convert pixels to safe PDF
command = ["/usr/bin/python3", "/usr/local/bin/dangerzone.py", "pixels-to-pdf"]
extra_args = [
"-v",
f"{pixel_dir}:/dangerzone",
"-v",
f"{safe_dir}:/safezone",
"-e",
f"OCR={ocr}",
"-e",
f"OCR_LANGUAGE={ocr_lang}",
]
ret = exec_container(document, command, extra_args, stdout_callback)
if ret != 0:
log.error("pixels-to-pdf failed")
else:
# Move the final file to the right place
if os.path.exists(document.output_filename):
os.remove(document.output_filename)
container_output_filename = os.path.join(
safe_dir, "safe-output-compressed.pdf"
)
shutil.move(container_output_filename, document.output_filename)
# We did it
success = True
# Clean up
tmpdir.cleanup()
return success
def get_max_parallel_conversions() -> int:
# FIXME hardcoded 1 until timeouts are more limited and better handled
# https://github.com/freedomofpress/dangerzone/issues/257
return 1
n_cpu = 1 # type: ignore [unreachable]
if platform.system() == "Linux":
# if on linux containers run natively
cpu_count = os.cpu_count()
if cpu_count is not None:
n_cpu = cpu_count
elif get_runtime_name() == "docker":
# For Windows and MacOS containers run in VM
# So we obtain the CPU count for the VM
n_cpu_str = subprocess.check_output(
[get_runtime(), "info", "--format", "{{.NCPU}}"],
text=True,
startupinfo=get_subprocess_startupinfo(),
)
n_cpu = int(n_cpu_str.strip())
return 2 * n_cpu + 1
# From global_common:
# def validate_convert_to_pixel_output(self, common, output):
# """
# Take the output from the convert to pixels tasks and validate it. Returns
# a tuple like: (success (boolean), error_message (str))
# """
# max_image_width = 10000
# max_image_height = 10000
# # Did we hit an error?
# for line in output.split("\n"):
# if (
# "failed:" in line
# or "The document format is not supported" in line
# or "Error" in line
# ):
# return False, output
# # How many pages was that?
# num_pages = None
# for line in output.split("\n"):
# if line.startswith("Document has "):
# num_pages = line.split(" ")[2]
# break
# if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0:
# return False, "Invalid number of pages returned"
# num_pages = int(num_pages)
# # Make sure we have the files we expect
# expected_filenames = []
# for i in range(1, num_pages + 1):
# expected_filenames += [
# f"page-{i}.rgb",
# f"page-{i}.width",
# f"page-{i}.height",
# ]
# expected_filenames.sort()
# actual_filenames = os.listdir(common.pixel_dir.name)
# actual_filenames.sort()
# if expected_filenames != actual_filenames:
# return (
# False,
# f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}",
# )
# # Make sure the files are the correct sizes
# for i in range(1, num_pages + 1):
# with open(f"{common.pixel_dir.name}/page-{i}.width") as f:
# w_str = f.read().strip()
# with open(f"{common.pixel_dir.name}/page-{i}.height") as f:
# h_str = f.read().strip()
# w = int(w_str)
# h = int(h_str)
# if (
# not w_str.isdigit()
# or not h_str.isdigit()
# or w <= 0
# or w > max_image_width
# or h <= 0
# or h > max_image_height
# ):
# return False, f"Page {i} has invalid geometry"
# # Make sure the RGB file is the correct size
# if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3:
# return False, f"Page {i} has an invalid RGB file size"
# return True, True