dangerzone/tests/test_large_set.py

import os
import re
import subprocess
from pathlib import Path
from typing import List

import pytest

from dangerzone.document import SAFE_EXTENSION
from dangerzone.isolation_provider.container import CONTAINER_LOG_EXT

from .test_cli import TestCli

test_docs_repo_dir = Path(__file__).parent / "test_docs_large"
test_docs_dir = test_docs_repo_dir / "all_documents"
TEST_DOCS_REPO = "git@github.com:freedomofpress/dangerzone-test-set.git"
FORMATS_REGEX = (
    r".*\.(pdf|docx|doc|xlsx|xls|pptx|ppt|odt|ods|odp|odg|jpg|jpeg|gif|png)$"
)


def clone_large_test_dir():
    if not os.path.exists(test_docs_dir):
        print("initializing 'test_docs_large' submodule")
        p = subprocess.run(["git", "submodule", "init", test_docs_repo_dir])
        assert p.returncode == 0

        print("updating 'test_docs_large' submodule")
        p = subprocess.run(["git", "submodule", "update", test_docs_repo_dir])
        assert p.returncode == 0

        print("obtaining 'test_docs_large' documents")
        p = subprocess.run(["git", "lfs", "pull", test_docs_repo_dir])
        assert p.returncode == 0


def get_test_docs(min_size: int, max_size: int) -> List[Path]:
    #clone_large_test_dir()
    return sorted([
        doc
        for doc in test_docs_dir.rglob("*")
        if doc.is_file()
        and min_size < doc.stat().st_size < max_size
        and not (doc.name.endswith(SAFE_EXTENSION))
        and re.match(FORMATS_REGEX, doc.name)
    ])


def get_trained_test_docs(min_size: int, max_size: int) -> List[Path]:
    all_docs = get_test_docs(min_size, max_size)
    trained_docs = [
        doc for doc in all_docs if Path(f"{doc}.{CONTAINER_LOG_EXT}").is_file()
    ]
    return trained_docs


def get_untrained_test_docs(min_size: int, max_size: int) -> List[Path]:
    all_docs = set(get_test_docs(min_size, max_size))
    trained_docs = set(get_trained_test_docs(min_size, max_size))
    untrained_docs = all_docs - trained_docs
    return list(untrained_docs)


docs_10K = get_test_docs(min_size=0, max_size=10 * 2**10)
docs_100K = get_test_docs(min_size=10 * 2**10, max_size=100 * 2**10)
docs_10M = get_test_docs(min_size=100 * 2**10, max_size=10 * 2**20)
docs_100M = get_test_docs(min_size=10 * 2**20, max_size=100 * 2**20)

# Pytest parameter decorators
up_to_100K_docs_list = docs_10K[:10] +  docs_100K[:10]
for_each_up_to_100K_short = pytest.mark.parametrize(
    "doc", up_to_100K_docs_list, ids=[str(doc.name) for doc in up_to_100K_docs_list]
)
for_each_10K_doc = pytest.mark.parametrize(
    "doc", docs_10K, ids=[str(doc.name) for doc in docs_10K]
)
for_each_100K_doc = pytest.mark.parametrize(
    "doc", docs_100K, ids=[str(doc.name) for doc in docs_100K]
)
for_each_10M_doc = pytest.mark.parametrize(
    "doc", docs_10M, ids=[str(doc.name) for doc in docs_10M]
)
for_each_100M_doc = pytest.mark.parametrize(
    "doc", docs_100M, ids=[str(doc.name) for doc in docs_100M]
)


@pytest.fixture
def training(request) -> bool:
    if request.config.getoption("--train"):
        return True
    else:
        return False


class TestLargeSet(TestCli):
    def expected_container_output(self, input_file: Path) -> str:
        # obtains the expected .log file
        output_log_path = f"{input_file}.log"
        with open(output_log_path, "r") as f:
            return f.read()

    def expected_success(self, input_file: Path) -> str:
        # obtains the expected result
        expected_result_path = f"{input_file}.{CONTAINER_LOG_EXT}"
        with open(expected_result_path, "r") as f:
            last_line = f.readlines()[-1]  # result is in the last line
            if "FAILURE" in last_line:
                return False
            elif "SUCCESS" in last_line:
                return True
            else:
                raise ValueError(
                    f"Container log file ({expected_result_path}) does not contain the result"
                )

    def run_doc_test(self, doc: Path, tmp_path: Path) -> None:
        output_file_path = str(tmp_path / "output.pdf")
        result = self.run_cli(
            ["--output-filename", output_file_path, "--ocr-lang", "eng", str(doc)]
        )
        success = self.expected_success(doc)
        if os.path.exists(output_file_path):
            f"{result.stdout, result.stderr}"
            assert success, "Document was expected to fail but it didn't!"
            result.assert_success()
        else:
            f"{result.stdout, result.stderr}"
            assert not success, "Document was expected to succeed but it didn't!"
            result.assert_failure()

    def train_doc_test(self, doc: Path, tmp_path: Path) -> None:
        if Path(f"{doc}.{CONTAINER_LOG_EXT}").exists():
            # skip already trained
            return
        output_file_path = str(tmp_path / "output.pdf")
        result = self.run_cli(
            ["--output-filename", output_file_path, "--ocr-lang", "eng", str(doc)],
            env={"DZ_LOG_CONTAINER": "yes"},
        )

    @for_each_up_to_100K_short
    def test_short_up_to_100K(self, doc: Path, tmp_path: Path, training: bool) -> None:
        if not training:
            self.run_doc_test(doc, tmp_path)
        else:
            self.train_doc_test(doc, tmp_path)

    @for_each_10K_doc
    def test_10K_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
        if not training:
            self.run_doc_test(doc, tmp_path)
        else:
            self.train_doc_test(doc, tmp_path)

    @for_each_100K_doc
    def test_100K_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
        if not training:
            self.run_doc_test(doc, tmp_path)
        else:
            self.train_doc_test(doc, tmp_path)

    @for_each_10M_doc
    def test_10M_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
        if not training:
            self.run_doc_test(doc, tmp_path)
        else:
            self.train_doc_test(doc, tmp_path)

    @for_each_100M_doc
    def test_100M_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
        if not training:
            self.run_doc_test(doc, tmp_path)
        else:
            self.train_doc_test(doc, tmp_path)