dangerzone/tests/test_ocr.py
Etienne Perot f03bc71855
Sandbox all Dangerzone document processing within gVisor.
This wraps the existing container image inside a gVisor-based sandbox.

gVisor is an open-source OCI-compliant container runtime.
It is a userspace reimplementation of the Linux kernel in a
memory-safe language.

It works by creating a sandboxed environment in which regular Linux
applications run, but their system calls are intercepted by gVisor.
gVisor then redirects these system calls and reinterprets them in
its own kernel. This means the host Linux kernel is isolated
from the sandboxed application, thereby providing protection against
Linux container escape attacks.

It also uses `seccomp-bpf` to provide a secondary layer of defense
against container escapes. Even if its userspace kernel gets
compromised, attackers would have to additionally have a Linux
container escape vector, and that exploit would have to fit within
the restricted `seccomp-bpf` rules that gVisor adds on itself.

Fixes #126
Fixes #224
Fixes #225
Fixes #228
2024-06-12 13:40:04 +03:00

55 lines
2 KiB
Python

import platform
import subprocess
from pathlib import Path
import pytest
from dangerzone.isolation_provider.container import Container
from dangerzone.logic import DangerzoneCore
# TODO: Perform an equivalent test on Qubes.
# NOTE: We skip running this test on Windows/macOS, because our current CI cannot run
# Docker on these platforms. It's not a problem anyway, because the result should be
# the same on all container-based platforms.
@pytest.mark.skipif(platform.system() != "Linux", reason="Container-specific")
def test_ocr_ommisions() -> None:
    """Ensure the image's OCR languages match the ones Dangerzone offers.

    Lists the ``*.traineddata`` files under ``/usr/share/tessdata/`` inside the
    container image, derives a language code from each filename, and asserts
    that the resulting set equals the set of values in
    ``DangerzoneCore.ocr_languages``.

    Raises:
        subprocess.CalledProcessError: If the container command exits with a
            non-zero status (``check=True``).
    """
    # Create the command that will list all the installed languages in the container
    # image.
    command = [Container.get_runtime(), "run"]
    command += Container.get_runtime_security_args()
    command += [
        Container.CONTAINER_NAME,
        "find",
        "/usr/share/tessdata/",
        "-name",
        "*.traineddata",
    ]

    # Run the command and collect one file path per output line. `find` prints no
    # header line, so nothing needs to be dropped from the result. We use
    # `splitlines()` so that empty output yields an empty list, instead of a list
    # that contains a single empty string.
    result = subprocess.run(command, text=True, check=True, stdout=subprocess.PIPE)
    installed_langs_filenames = result.stdout.strip().splitlines()

    # Derive the language code from each filename, e.g.:
    #
    #     /usr/share/tessdata/eng.traineddata -> eng
    installed_langs = {
        Path(filename).name.split(".traineddata")[0]
        for filename in installed_langs_filenames
    }

    # Remove the "osd" and "equ" languages from the list of installed languages, since
    # they are not an actual language. Read more in:
    # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
    installed_langs -= {"osd", "equ"}

    # Grab the languages that Dangerzone offers to the user through the GUI/CLI.
    offered_langs = set(DangerzoneCore(Container()).ocr_languages.values())

    # Ensure that both the installed languages and the ones we offer to the user are the
    # same.
    assert installed_langs == offered_langs