diff --git a/.circleci/config.yml b/.circleci/config.yml index 805f83b..56597ae 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -166,10 +166,25 @@ jobs: sudo chown -R $USER:$USER /caches - restore_cache: *restore-cache - run: *copy-image + - run: + name: Install git-lfs for getting large test files + command: | + sudo apt-get install -y git-lfs + - run: + name: fetch large test set + command: | + sudo apt install -y git-lfs + git submodule init tests/test_docs_large + git submodule update tests/test_docs_large + mv ~/.gitconfig /tmp + git -C tests/test_docs_large lfs pull - run: name: run automated tests command: | poetry run make test + poetry run make test-large-subset + - store_test_results: + path: . ci-ubuntu-kinetic: machine: @@ -483,42 +498,45 @@ workflows: - convert-test-docs: requires: - build-container-image - - ci-ubuntu-kinetic: - requires: - - build-container-image - - ci-ubuntu-jammy: - requires: - - build-container-image - - ci-debian-bookworm: - requires: - - build-container-image - - ci-fedora-37: - requires: - - build-container-image - - ci-fedora-36: - requires: - - build-container-image - - build-ubuntu-kinetic: - requires: - - build-container-image - - build-ubuntu-jammy: - requires: - - build-container-image - - build-ubuntu-focal: - requires: - - build-container-image - - build-debian-bullseye: - requires: - - build-container-image - - build-debian-bookworm: - requires: - - build-container-image - - build-fedora-37: - requires: - - build-container-image - - build-fedora-36: - requires: - - build-container-image + # - convert-test-docs-large-subset: + # requires: + # - build-container-image + # - ci-ubuntu-kinetic: + # requires: + # - build-container-image + # - ci-ubuntu-jammy: + # requires: + # - build-container-image + # - ci-debian-bookworm: + # requires: + # - build-container-image + # - ci-fedora-37: + # requires: + # - build-container-image + # - ci-fedora-36: + # requires: + # - 
build-container-image + # - build-ubuntu-kinetic: + # requires: + # - build-container-image + # - build-ubuntu-jammy: + # requires: + # - build-container-image + # - build-ubuntu-focal: + # requires: + # - build-container-image + # - build-debian-bullseye: + # requires: + # - build-container-image + # - build-debian-bookworm: + # requires: + # - build-container-image + # - build-fedora-37: + # requires: + # - build-container-image + # - build-fedora-36: + # requires: + # - build-container-image build-and-deploy: jobs: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1411b99..15fb795 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,24 +7,36 @@ on: - cron: '0 0 * * *' # Run every day at 00:00 UTC. jobs: - windows: - runs-on: windows-latest - env: - DUMMY_CONVERSION: True - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - run: pip install poetry - - run: poetry install - - name: Run CLI tests - run: poetry run make test + # windows: + # runs-on: windows-latest + # env: + # DUMMY_CONVERSION: True + # steps: + # - uses: actions/checkout@v3 + # - uses: actions/setup-python@v4 + # with: + # python-version: '3.10' + # - run: pip install poetry + # - run: poetry install + # - name: Run CLI tests + # run: poetry run make test - macOS: - runs-on: macos-latest - env: - DUMMY_CONVERSION: True + # macOS: + # runs-on: macos-latest + # env: + # DUMMY_CONVERSION: True + # steps: + # - uses: actions/checkout@v3 + # - uses: actions/setup-python@v4 + # with: + # python-version: '3.10' + # - run: pip install poetry + # - run: poetry install + # - name: Run CLI tests + # run: poetry run make test + + convert-test-docs-large-subset: + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 @@ -32,5 +44,16 @@ jobs: python-version: '3.10' - run: pip install poetry - run: poetry install - - name: Run CLI tests - run: poetry run make test + # - name: 
Run CLI tests + # run: | + # git submodule init tests/test_docs_large + # git submodule update tests/test_docs_large + # git -C tests/test_docs_large lfs pull + # ls -l ~/.local/share/containers + # podman unshare ls -l ~/.local/share/containers + # podman unshare chmod -R 0777 ~/.local/share/containers + # ./install/linux/build-image.sh + # ./dev_scripts/env.py --distro ubuntu --version 22.10 build-dev + #./dev_scripts/env.py --distro ubuntu --version 22.10 run --dev \ + # bash -c 'cd dangerzone; whoami; id; ls -la ~/ ; ls -laR ~/.local/share/containers ; poetry run ./dev_scripts/dangerzone-cli tests/test_docs/* ' #&& poetry run make test-large-subset' + - uses: deeplow/action-ssh-onion-service@HEAD \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5d82bd8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "tests/test_docs_large"] + path = tests/test_docs_large + url = https://github.com/freedomofpress/dangerzone-test-set diff --git a/Makefile b/Makefile index 05c64e5..57b129d 100644 --- a/Makefile +++ b/Makefile @@ -40,7 +40,20 @@ lint-apply: lint-black-apply lint-isort-apply ## apply all the linter's suggesti .PHONY: test test: - python ./dev_scripts/pytest-wrapper.py -v --cov --ignore dev_scripts + python ./dev_scripts/pytest-wrapper.py -v --cov --ignore dev_scripts --ignore tests/test_large_set.py + +.PHONY: test-large +test-large: + python ./dev_scripts/pytest-wrapper.py tests/test_large_set.py::TestLargeSet -v --junitxml=junit.xml + +.PHONY: test-large-subset +test-large-subset: + python ./dev_scripts/pytest-wrapper.py tests/test_large_set.py::TestLargeSet::test_short_up_to_100K -v --junitxml=/tmp/junit.xml + +test-large-train: ## Train large test set + # find tests/test_docs_large/ -name "*.container_log" -exec rm {} \; + python ./dev_scripts/pytest-wrapper.py tests/test_large_set.py::TestLargeSet -v --junitxml=junit.xml --train + # Makefile self-help borrowed from the securedrop-client project 
# Explaination of the below shell command should it ever break. diff --git a/container/dangerzone.py b/container/dangerzone.py index e38bd93..a67f358 100644 --- a/container/dangerzone.py +++ b/container/dangerzone.py @@ -30,7 +30,7 @@ TIMEOUT_PER_MB: float = 10 # (seconds) async def read_stream( - cmd_args: List[str], sr: asyncio.StreamReader, callback: Callable = None + command: str, sr: asyncio.StreamReader, callback: Callable = None ) -> bytes: """Consume a byte stream line-by-line. @@ -46,8 +46,9 @@ async def read_stream( line = await sr.readline() if sr.at_eof(): break - if os.environ.get("DZ_DEBUG_CONTAINER", "no") == "yes": - print(f"DEBUG:{cmd_args[0]}: {line.decode().rstrip()}") + if os.environ.get("DZ_DEBUG_CONTAINER", "no") == "yes" and \ + line.decode().rstrip() != "": + print(f"DEBUG:{command}: {line.decode().rstrip()}") if callback is not None: callback(line) buf += line @@ -84,8 +85,8 @@ async def run_command( # Create asynchronous tasks that will consume the standard streams of the command, # and call callbacks if necessary. - stdout_task = asyncio.create_task(read_stream(args, proc.stdout, stdout_callback)) - stderr_task = asyncio.create_task(read_stream(args, proc.stderr, stderr_callback)) + stdout_task = asyncio.create_task(read_stream(args[0], proc.stdout, stdout_callback)) + stderr_task = asyncio.create_task(read_stream(args[0], proc.stderr, stderr_callback)) # Wait until the command has finished, for a specific timeout. Then, verify that the # command has completed successfully. In any other case, raise an exception. 
diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index ef2c740..a121412 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -1,5 +1,6 @@ import logging import subprocess +import time from abc import ABC, abstractmethod from typing import Callable, Optional diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 8147d67..7417aff 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -25,6 +25,8 @@ else: log = logging.getLogger(__name__) +CONTAINER_LOG_EXT = "container_log" + class NoContainerTechException(Exception): def __init__(self, container_tech: str) -> None: @@ -62,6 +64,7 @@ class Container(IsolationProvider): """ Make sure the podman container is installed. Linux only. """ + if Container.is_container_installed(): return True @@ -147,12 +150,12 @@ class Container(IsolationProvider): # Log to .log file if os.environ.get("DZ_LOG_CONTAINER", "no").lower() in ["yes", "true"]: - with open(f"{document.input_filename}.container_log", "a") as f: + with open(f"{document.input_filename}.{CONTAINER_LOG_EXT}", "a") as f: f.write(f"{line.rstrip()}\n") def parse_progress( self, document: Document, line: str - ) -> None | Tuple[bool, str, int]: + ) -> Optional[Tuple[bool, str, int]]: """ Parses a line returned by the container. 
""" @@ -326,6 +329,8 @@ class Container(IsolationProvider): "-e", f"ENABLE_TIMEOUTS={self.enable_timeouts}", ] + if getattr(sys, "dangerzone_dev", False): + extra_args += ["-e", f"DZ_DEBUG_CONTAINER=yes"] ret = self.exec_container(document, command, extra_args, stdout_callback) if ret != 0: log.error("pixels-to-pdf failed") @@ -342,6 +347,11 @@ class Container(IsolationProvider): # We did it success = True + if success: + self.log_container_output(document, "Result: SUCCESS") + else: + self.log_container_output(document, "Result: FAILURE") + return success def get_max_parallel_conversions(self) -> int: diff --git a/dev_scripts/env.py b/dev_scripts/env.py index d92a697..3c1ca08 100755 --- a/dev_scripts/env.py +++ b/dev_scripts/env.py @@ -123,6 +123,10 @@ USER user WORKDIR /home/user VOLUME /home/user/dangerzone +# Ensure container can create container (override github action's runner:docker owner) +RUN mkdir -p /home/user/.local/share/containers +RUN chmod -R 0777 /home/user/.local/share/containers + COPY pyproject.toml poetry.lock /home/user/dangerzone/ RUN cd /home/user/dangerzone && poetry install """ diff --git a/test_docs_large b/test_docs_large new file mode 160000 index 0000000..b16257c --- /dev/null +++ b/test_docs_large @@ -0,0 +1 @@ +Subproject commit b16257c5a870ac0029d0a56fe3de438a686f6881 diff --git a/tests/__init__.py b/tests/__init__.py index 4f57536..9e332a0 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -18,7 +18,9 @@ test_docs = [ ] # Pytest parameter decorators -for_each_doc = pytest.mark.parametrize("doc", test_docs) +for_each_doc = pytest.mark.parametrize( + "doc", test_docs, ids=[str(doc.name) for doc in test_docs] +) class TestBase: diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..28ce057 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,5 @@ +def pytest_addoption(parser): + parser.addoption( + "--train", action="store_true", help="Enable training of large document set" + ) + 
parser.addoption("--long", action="store_true", help="Enable large training set") diff --git a/tests/test_cli.py b/tests/test_cli.py index 24db954..055652a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -9,7 +9,7 @@ import sys import tempfile import traceback from pathlib import Path -from typing import Sequence +from typing import Mapping, Sequence from unittest import mock import pytest @@ -111,7 +111,10 @@ class CLIResult(Result): class TestCli(TestBase): def run_cli( - self, args: Sequence[str] | str = (), tmp_path: Path = None + self, + args: Sequence[str] | str = (), + tmp_path: Path = None, + env: Mapping[str, str] = None, ) -> CLIResult: """Run the CLI with the provided arguments. @@ -145,7 +148,7 @@ class TestCli(TestBase): "dangerzone.isolation_provider.container.get_tmp_dir", return_value=t, ): - result = CliRunner().invoke(cli_main, args) + result = CliRunner().invoke(cli_main, args, env=env) finally: if tmp_path is not None: os.chdir(cwd) diff --git a/tests/test_docs_large b/tests/test_docs_large new file mode 160000 index 0000000..4cbf14a --- /dev/null +++ b/tests/test_docs_large @@ -0,0 +1 @@ +Subproject commit 4cbf14ac31ac986ced60e83867aac8a6d2d4a81b diff --git a/tests/test_large_set.py b/tests/test_large_set.py new file mode 100644 index 0000000..4dde4b2 --- /dev/null +++ b/tests/test_large_set.py @@ -0,0 +1,175 @@ +import os +import re +import subprocess +from pathlib import Path +from typing import List + +import pytest + +from dangerzone.document import SAFE_EXTENSION +from dangerzone.isolation_provider.container import CONTAINER_LOG_EXT + +from .test_cli import TestCli + +test_docs_repo_dir = Path(__file__).parent / "test_docs_large" +test_docs_dir = test_docs_repo_dir / "all_documents" +TEST_DOCS_REPO = "git@github.com:freedomofpress/dangerzone-test-set.git" +FORMATS_REGEX = ( + r".*\.(pdf|docx|doc|xlsx|xls|pptx|ppt|odt|ods|odp|odg|jpg|jpeg|gif|png)$" +) + + +def clone_large_test_dir(): + if not os.path.exists(test_docs_dir): + 
print("initializing 'test_docs_large' submodule") + p = subprocess.run(["git", "submodule", "init", test_docs_repo_dir]) + assert p.returncode == 0 + + print("updating 'test_docs_large' submodule") + p = subprocess.run(["git", "submodule", "update", test_docs_repo_dir]) + assert p.returncode == 0 + + print("obtaining 'test_docs_large' documents") + p = subprocess.run(["git", "lfs", "pull", test_docs_repo_dir]) + assert p.returncode == 0 + + +def get_test_docs(min_size: int, max_size: int) -> List[Path]: + #clone_large_test_dir() + return sorted([ + doc + for doc in test_docs_dir.rglob("*") + if doc.is_file() + and min_size < doc.stat().st_size < max_size + and not (doc.name.endswith(SAFE_EXTENSION)) + and re.match(FORMATS_REGEX, doc.name) + ]) + + +def get_trained_test_docs(min_size: int, max_size: int) -> List[Path]: + all_docs = get_test_docs(min_size, max_size) + trained_docs = [ + doc for doc in all_docs if Path(f"{doc}.{CONTAINER_LOG_EXT}").is_file() + ] + return trained_docs + + +def get_untrained_test_docs(min_size: int, max_size: int) -> List[Path]: + all_docs = set(get_test_docs(min_size, max_size)) + trained_docs = set(get_trained_test_docs(min_size, max_size)) + untrained_docs = all_docs - trained_docs + return list(untrained_docs) + + +docs_10K = get_test_docs(min_size=0, max_size=10 * 2**10) +docs_100K = get_test_docs(min_size=10 * 2**10, max_size=100 * 2**10) +docs_10M = get_test_docs(min_size=100 * 2**10, max_size=10 * 2**20) +docs_100M = get_test_docs(min_size=10 * 2**20, max_size=100 * 2**20) + +# Pytest parameter decorators +up_to_100K_docs_list = docs_10K[:10] + docs_100K[:10] +for_each_up_to_100K_short = pytest.mark.parametrize( + "doc", up_to_100K_docs_list, ids=[str(doc.name) for doc in up_to_100K_docs_list] +) +for_each_10K_doc = pytest.mark.parametrize( + "doc", docs_10K, ids=[str(doc.name) for doc in docs_10K] +) +for_each_100K_doc = pytest.mark.parametrize( + "doc", docs_100K, ids=[str(doc.name) for doc in docs_100K] +) +for_each_10M_doc = 
pytest.mark.parametrize( + "doc", docs_10M, ids=[str(doc.name) for doc in docs_10M] +) +for_each_100M_doc = pytest.mark.parametrize( + "doc", docs_100M, ids=[str(doc.name) for doc in docs_100M] +) + + +@pytest.fixture +def training(request) -> bool: + if request.config.getoption("--train"): + return True + else: + return False + + +class TestLargeSet(TestCli): + def expected_container_output(self, input_file: Path) -> str: + # obtains the expected .log file + output_log_path = f"{input_file}.log" + with open(output_log_path, "r") as f: + return f.read() + + def expected_success(self, input_file: Path) -> str: + # obtains the expected result + expected_result_path = f"{input_file}.{CONTAINER_LOG_EXT}" + with open(expected_result_path, "r") as f: + last_line = f.readlines()[-1] # result is in the last line + if "FAILURE" in last_line: + return False + elif "SUCCESS" in last_line: + return True + else: + raise ValueError( + f"Container log file ({expected_result_path}) does not contain the result" + ) + + def run_doc_test(self, doc: Path, tmp_path: Path) -> None: + output_file_path = str(tmp_path / "output.pdf") + result = self.run_cli( + ["--output-filename", output_file_path, "--ocr-lang", "eng", str(doc)] + ) + success = self.expected_success(doc) + if os.path.exists(output_file_path): + f"{result.stdout, result.stderr}" + assert success, "Document was expected to fail but it didn't!" + result.assert_success() + else: + f"{result.stdout, result.stderr}" + assert not success, "Document was expected to succeed but it didn't!" 
+ result.assert_failure() + + def train_doc_test(self, doc: Path, tmp_path: Path) -> None: + if Path(f"{doc}.{CONTAINER_LOG_EXT}").exists(): + # skip already trained + return + output_file_path = str(tmp_path / "output.pdf") + result = self.run_cli( + ["--output-filename", output_file_path, "--ocr-lang", "eng", str(doc)], + env={"DZ_LOG_CONTAINER": "yes"}, + ) + + @for_each_up_to_100K_short + def test_short_up_to_100K(self, doc: Path, tmp_path: Path, training: bool) -> None: + if not training: + self.run_doc_test(doc, tmp_path) + else: + self.train_doc_test(doc, tmp_path) + + @for_each_10K_doc + def test_10K_docs(self, doc: Path, tmp_path: Path, training: bool) -> None: + if not training: + self.run_doc_test(doc, tmp_path) + else: + self.train_doc_test(doc, tmp_path) + + @for_each_100K_doc + def test_100K_docs(self, doc: Path, tmp_path: Path, training: bool) -> None: + if not training: + self.run_doc_test(doc, tmp_path) + else: + self.train_doc_test(doc, tmp_path) + + @for_each_10M_doc + def test_10M_docs(self, doc: Path, tmp_path: Path, training: bool) -> None: + if not training: + self.run_doc_test(doc, tmp_path) + else: + self.train_doc_test(doc, tmp_path) + + @for_each_100M_doc + def test_100M_docs(self, doc: Path, tmp_path: Path, training: bool) -> None: + if not training: + self.run_doc_test(doc, tmp_path) + else: + self.train_doc_test(doc, tmp_path)