mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-29 02:12:36 +02:00

Temporarily limit conversions to one at a time until timeout limitations are resolved: https://github.com/freedomofpress/dangerzone/issues/257
389 lines
11 KiB
Python
389 lines
11 KiB
Python
import gzip
|
|
import json
|
|
import logging
|
|
import os
|
|
import pipes
|
|
import platform
|
|
import shutil
|
|
import subprocess
|
|
import tempfile
|
|
from typing import Callable, List, Optional, Tuple
|
|
|
|
import appdirs
|
|
from colorama import Fore, Style
|
|
|
|
from .document import Document
|
|
from .util import get_resource_path, get_subprocess_startupinfo
|
|
|
|
# Name of the dangerzone container
container_name = "dangerzone.rocks/dangerzone"

# Define startupinfo for subprocesses. On Windows the STARTF_USESHOWWINDOW flag
# is set on a fresh STARTUPINFO (presumably so that no console window pops up
# for each child process -- confirm); on every other platform no startupinfo
# is needed.
if platform.system() == "Windows":
    startupinfo = subprocess.STARTUPINFO()  # type: ignore [attr-defined]
    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW  # type: ignore [attr-defined]
else:
    startupinfo = None

# Module-level logger
log = logging.getLogger(__name__)
|
|
|
|
|
|
class NoContainerTechException(Exception):
    """Raised when the required container runtime is not installed."""

    def __init__(self, container_tech: str) -> None:
        """
        Build the exception for the missing runtime.

        container_tech is the name of the runtime (e.g. "podman" or "docker")
        and is embedded in the exception message.
        """
        super().__init__(f"{container_tech} is not installed")
|
|
|
|
|
|
def get_runtime_name() -> str:
    """
    Return the name of the container runtime used on this platform:
    "podman" on Linux, "docker" everywhere else.
    """
    if platform.system() == "Linux":
        runtime_name = "podman"
    else:
        # Windows, Darwin, and unknown use docker for now, dangerzone-vm eventually
        runtime_name = "docker"
    return runtime_name
|
|
|
|
|
|
def get_runtime() -> str:
    """
    Return the path of the container runtime executable, as resolved by
    shutil.which() for the name given by get_runtime_name().

    Raises NoContainerTechException if the executable cannot be found.
    """
    container_tech = get_runtime_name()
    runtime = shutil.which(container_tech)
    if runtime is None:
        raise NoContainerTechException(container_tech)
    return runtime
|
|
|
|
|
|
def install() -> bool:
    """
    Make sure the dangerzone container image is installed in the container
    runtime returned by get_runtime().

    If the image is not installed yet, the bundled "container.tar.gz" resource
    is decompressed and streamed into the stdin of `<runtime> load`.

    Returns True if the image is installed when the function returns, False
    otherwise.
    """
    if is_container_installed():
        return True

    # Load the container into the runtime
    log.info("Installing Dangerzone container image...")

    p = subprocess.Popen(
        [get_runtime(), "load"],
        stdin=subprocess.PIPE,
        startupinfo=get_subprocess_startupinfo(),
    )

    chunk_size = 10240
    compressed_container_path = get_resource_path("container.tar.gz")
    with gzip.open(compressed_container_path) as f:
        if p.stdin:
            try:
                # Stream the decompressed image in fixed-size chunks so that it
                # never has to be held in memory as a whole.
                shutil.copyfileobj(f, p.stdin, chunk_size)
            except BrokenPipeError:
                # The runtime exited before reading the whole image. Do not
                # crash here: the check below reports the failed installation.
                log.warning(
                    "Container runtime closed its input before the image was fully loaded"
                )
    # Closes stdin (signalling end of input) and waits for the runtime to exit
    p.communicate()

    if not is_container_installed():
        log.error("Failed to install the container image")
        return False

    log.info("Container image installed")
    return True
|
|
|
|
|
|
def is_container_installed() -> bool:
    """
    See if the expected dangerzone container image is installed.

    The expected image id is read from the bundled "image-id.txt" resource and
    compared with the id that `<runtime> image list` reports for
    container_name. If a different image is found under that name, a
    best-effort attempt is made to delete it.

    Returns True only if the installed image id matches the expected one.
    """
    # Get the image id
    with open(get_resource_path("image-id.txt")) as f:
        expected_image_id = f.read().strip()

    # See if this image is already installed
    found_image_id = subprocess.check_output(
        [
            get_runtime(),
            "image",
            "list",
            "--format",
            "{{.ID}}",
            container_name,
        ],
        text=True,
        startupinfo=get_subprocess_startupinfo(),
    ).strip()

    if found_image_id == expected_image_id:
        return True

    if found_image_id == "":
        # Nothing is installed under this name
        return False

    # A different (old) image is installed under this name
    log.info("Deleting old dangerzone container image")
    try:
        subprocess.check_output(
            [get_runtime(), "rmi", "--force", found_image_id],
            startupinfo=get_subprocess_startupinfo(),
        )
    except Exception:
        # Deleting is best-effort, but KeyboardInterrupt / SystemExit must not
        # be swallowed, hence no bare "except:".
        log.warning("Couldn't delete old container image, so leaving it there")

    return False
|
|
|
|
|
|
def parse_progress(document: Document, line: str) -> Tuple[bool, str, int]:
    """
    Parses a line returned by the container.

    The line is expected to be a JSON object with the keys "error", "text" and
    "percentage". The parsed status is logged (as an error if "error" is
    truthy) prefixed with the document id, and returned as the tuple
    (error, text, percentage).

    If the line is not valid JSON, or does not have the expected shape, the
    problem is logged and (True, <error message>, -1) is returned.
    """
    try:
        status = json.loads(line)
        # The container output is untrusted: valid JSON that is not an object
        # with the expected keys (e.g. `42`, `null`, `{}`) must be rejected
        # here instead of crashing further down.
        error = status["error"]
        text = status["text"]
        percentage = status["percentage"]
        if not isinstance(text, str):
            raise TypeError("progress text is not a string")
    except (ValueError, TypeError, KeyError, RecursionError):
        # ValueError covers json.JSONDecodeError; RecursionError covers
        # pathologically nested input.
        error_message = f"Invalid JSON returned from container:\n\n\t {line}"
        log.error(error_message)
        return (True, error_message, -1)

    s = Style.BRIGHT + Fore.YELLOW + f"[doc {document.id}] "
    s += Fore.CYAN + f"{percentage}% "
    if error:
        s += Style.RESET_ALL + Fore.RED + text
        log.error(s)
    else:
        s += Style.RESET_ALL + text
        log.info(s)

    return (error, text, percentage)
|
|
|
|
|
|
def exec(
    document: Document,
    args: List[str],
    stdout_callback: Optional[Callable] = None,
) -> int:
    """
    Run `args` as a subprocess and process its output line by line.

    Every output line (stderr is merged into stdout) is passed to
    parse_progress(). The document is marked as failed when a line reports an
    error and as safe when a line reports 100 percent. If given,
    stdout_callback is called as stdout_callback(error, text, percentage) for
    every line.

    Returns the exit code of the subprocess.
    """
    # The "pipes" module was removed in Python 3.13 (PEP 594); shlex.quote is
    # the documented replacement for pipes.quote. Imported locally so that this
    # function does not depend on the module-level import list.
    import shlex

    args_str = " ".join(shlex.quote(s) for s in args)
    log.info("> " + args_str)

    with subprocess.Popen(
        args,
        stdin=None,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        bufsize=1,  # line buffered, so that progress is reported as it happens
        universal_newlines=True,
        startupinfo=startupinfo,
    ) as p:
        if p.stdout is not None:
            for line in p.stdout:
                (error, text, percentage) = parse_progress(document, line)
                if error:
                    document.mark_as_failed()
                if percentage == 100.0:
                    document.mark_as_safe()
                if stdout_callback:
                    stdout_callback(error, text, percentage)

        # Wait for the process to exit so that returncode is set
        p.communicate()
        return p.returncode
|
|
|
|
|
|
def exec_container(
    document: Document,
    command: List[str],
    extra_args: Optional[List[str]] = None,
    stdout_callback: Optional[Callable] = None,
) -> int:
    """
    Run `command` inside the dangerzone container and return its exit code.

    The container is started with `<runtime> run` without network access, as
    the "dangerzone" user, with all capabilities dropped, no-new-privileges
    set and `--rm`. `extra_args` are additional arguments for `run` (e.g.
    volumes and environment variables) placed before the image name.
    `stdout_callback` is forwarded to exec().
    """
    # Avoid a mutable default argument; None means "no extra arguments"
    if extra_args is None:
        extra_args = []

    container_runtime = get_runtime()

    if get_runtime_name() == "podman":
        platform_args = []
        security_args = ["--security-opt", "no-new-privileges"]
        security_args += ["--userns", "keep-id"]
    else:
        platform_args = ["--platform", "linux/amd64"]
        security_args = ["--security-opt=no-new-privileges:true"]

    # drop all linux kernel capabilities
    security_args += ["--cap-drop", "all"]
    user_args = ["-u", "dangerzone"]

    # remove the container once it exits
    prevent_leakage_args = ["--rm"]

    args = (
        ["run", "--network", "none"]
        + platform_args
        + user_args
        + security_args
        + prevent_leakage_args
        + extra_args
        + [container_name]
        + command
    )

    args = [container_runtime] + args
    return exec(document, args, stdout_callback)
|
|
|
|
|
|
def convert(
    document: Document,
    ocr_lang: Optional[str],
    stdout_callback: Optional[Callable] = None,
) -> bool:
    """
    Convert `document` to a safe PDF in two container runs.

    The first run ("document-to-pixels") converts the input file to pixel data,
    the second one ("pixels-to-pdf") converts the pixel data to a PDF, which is
    then moved to document.output_filename (replacing an existing file). OCR is
    enabled in the second run when `ocr_lang` is truthy. `stdout_callback` is
    forwarded to exec_container().

    Returns True on success, False if either container run exits with a
    non-zero code.
    """
    document.mark_as_converting()

    ocr = "1" if ocr_lang else "0"

    dz_tmp = os.path.join(appdirs.user_config_dir("dangerzone"), "tmp")
    os.makedirs(dz_tmp, exist_ok=True)

    # The context manager guarantees that the temporary directory is cleaned
    # up, even if one of the steps below raises.
    with tempfile.TemporaryDirectory(dir=dz_tmp) as tmpdir:
        pixel_dir = os.path.join(tmpdir, "pixels")
        safe_dir = os.path.join(tmpdir, "safe")
        os.makedirs(pixel_dir, exist_ok=True)
        os.makedirs(safe_dir, exist_ok=True)

        # Convert document to pixels
        command = [
            "/usr/bin/python3",
            "/usr/local/bin/dangerzone.py",
            "document-to-pixels",
        ]
        extra_args = [
            "-v",
            f"{document.input_filename}:/tmp/input_file",
            "-v",
            f"{pixel_dir}:/dangerzone",
        ]
        ret = exec_container(document, command, extra_args, stdout_callback)
        if ret != 0:
            log.error("documents-to-pixels failed")
            return False

        # TODO: validate convert to pixels output

        # Convert pixels to safe PDF
        command = ["/usr/bin/python3", "/usr/local/bin/dangerzone.py", "pixels-to-pdf"]
        extra_args = [
            "-v",
            f"{pixel_dir}:/dangerzone",
            "-v",
            f"{safe_dir}:/safezone",
            "-e",
            f"OCR={ocr}",
            "-e",
            f"OCR_LANGUAGE={ocr_lang}",
        ]
        ret = exec_container(document, command, extra_args, stdout_callback)
        if ret != 0:
            log.error("pixels-to-pdf failed")
            return False

        # Move the final file to the right place
        if os.path.exists(document.output_filename):
            os.remove(document.output_filename)

        container_output_filename = os.path.join(
            safe_dir, "safe-output-compressed.pdf"
        )
        shutil.move(container_output_filename, document.output_filename)

    # We did it
    return True
|
|
|
|
|
|
def get_max_parallel_conversions() -> int:
    """
    Return the maximum number of conversions that may run in parallel.

    This is currently hardcoded to 1 (see the FIXME below). The CPU-based
    computation after the return statement is intentionally kept, unreachable,
    for when the limit is lifted.
    """
    # FIXME hardcoded 1 until timeouts are more limited and better handled
    # https://github.com/freedomofpress/dangerzone/issues/257
    return 1

    n_cpu = 1  # type: ignore [unreachable]
    if platform.system() == "Linux":
        # if on linux containers run natively
        cpu_count = os.cpu_count()
        if cpu_count is not None:
            n_cpu = cpu_count

    elif get_runtime_name() == "docker":
        # For Windows and MacOS containers run in VM
        # So we obtain the CPU count for the VM
        n_cpu_str = subprocess.check_output(
            [get_runtime(), "info", "--format", "{{.NCPU}}"],
            text=True,
            startupinfo=get_subprocess_startupinfo(),
        )
        n_cpu = int(n_cpu_str.strip())

    return 2 * n_cpu + 1
|
|
|
|
|
|
# From global_common:
|
|
|
|
# def validate_convert_to_pixel_output(self, common, output):
|
|
# """
|
|
# Take the output from the convert to pixels tasks and validate it. Returns
|
|
# a tuple like: (success (boolean), error_message (str))
|
|
# """
|
|
# max_image_width = 10000
|
|
# max_image_height = 10000
|
|
|
|
# # Did we hit an error?
|
|
# for line in output.split("\n"):
|
|
# if (
|
|
# "failed:" in line
|
|
# or "The document format is not supported" in line
|
|
# or "Error" in line
|
|
# ):
|
|
# return False, output
|
|
|
|
# # How many pages was that?
|
|
# num_pages = None
|
|
# for line in output.split("\n"):
|
|
# if line.startswith("Document has "):
|
|
# num_pages = line.split(" ")[2]
|
|
# break
|
|
# if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0:
|
|
# return False, "Invalid number of pages returned"
|
|
# num_pages = int(num_pages)
|
|
|
|
# # Make sure we have the files we expect
|
|
# expected_filenames = []
|
|
# for i in range(1, num_pages + 1):
|
|
# expected_filenames += [
|
|
# f"page-{i}.rgb",
|
|
# f"page-{i}.width",
|
|
# f"page-{i}.height",
|
|
# ]
|
|
# expected_filenames.sort()
|
|
# actual_filenames = os.listdir(common.pixel_dir.name)
|
|
# actual_filenames.sort()
|
|
|
|
# if expected_filenames != actual_filenames:
|
|
# return (
|
|
# False,
|
|
# f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}",
|
|
# )
|
|
|
|
# # Make sure the files are the correct sizes
|
|
# for i in range(1, num_pages + 1):
|
|
# with open(f"{common.pixel_dir.name}/page-{i}.width") as f:
|
|
# w_str = f.read().strip()
|
|
# with open(f"{common.pixel_dir.name}/page-{i}.height") as f:
|
|
# h_str = f.read().strip()
|
|
# w = int(w_str)
|
|
# h = int(h_str)
|
|
# if (
|
|
# not w_str.isdigit()
|
|
# or not h_str.isdigit()
|
|
# or w <= 0
|
|
# or w > max_image_width
|
|
# or h <= 0
|
|
# or h > max_image_height
|
|
# ):
|
|
# return False, f"Page {i} has invalid geometry"
|
|
|
|
# # Make sure the RGB file is the correct size
|
|
# if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3:
|
|
# return False, f"Page {i} has an invalid RGB file size"
|
|
|
|
# return True, True
|