deeplow 2023-02-16 10:56:25 +00:00
parent 134dc27364
commit 378cec9386
No known key found for this signature in database
GPG key ID: 577982871529A52A
14 changed files with 327 additions and 67 deletions

.circleci/config.yml

@@ -166,10 +166,25 @@ jobs:
           sudo chown -R $USER:$USER /caches
       - restore_cache: *restore-cache
       - run: *copy-image
+      - run:
+          name: Install git-lfs for getting large test files
+          command: |
+            sudo apt-get install -y git-lfs
+      - run:
+          name: fetch large test set
+          command: |
+            sudo apt install -y git-lfs
+            git submodule init tests/test_docs_large
+            git submodule update tests/test_docs_large
+            mv ~/.gitconfig /tmp
+            git lfs -c tests/test_docs_large pull tests/test_docs_large
       - run:
          name: run automated tests
          command: |
            poetry run make test
+            poetry run make test-large-subset
+      - store_test_results:
+          path: .

   ci-ubuntu-kinetic:
     machine:
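A hypothetical sanity check for the LFS step above (not part of this commit): after git lfs pull, no file in the submodule should still be a pointer stub, i.e. a small text file that begins with the fixed LFS header.

from pathlib import Path

LFS_POINTER_HEADER = b"version https://git-lfs.github.com/spec/v1"

def is_lfs_pointer(path: Path) -> bool:
    # Unfetched LFS files are tiny pointer stubs that begin with this header.
    with open(path, "rb") as f:
        return f.read(len(LFS_POINTER_HEADER)) == LFS_POINTER_HEADER

stubs = [
    p
    for p in Path("tests/test_docs_large").rglob("*")
    if p.is_file() and is_lfs_pointer(p)
]
assert not stubs, f"git lfs pull left pointer stubs behind: {stubs[:5]}"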
@@ -483,42 +498,45 @@ workflows:
       - convert-test-docs:
           requires:
             - build-container-image
-      - ci-ubuntu-kinetic:
-          requires:
-            - build-container-image
-      - ci-ubuntu-jammy:
-          requires:
-            - build-container-image
-      - ci-debian-bookworm:
-          requires:
-            - build-container-image
-      - ci-fedora-37:
-          requires:
-            - build-container-image
-      - ci-fedora-36:
-          requires:
-            - build-container-image
-      - build-ubuntu-kinetic:
-          requires:
-            - build-container-image
-      - build-ubuntu-jammy:
-          requires:
-            - build-container-image
-      - build-ubuntu-focal:
-          requires:
-            - build-container-image
-      - build-debian-bullseye:
-          requires:
-            - build-container-image
-      - build-debian-bookworm:
-          requires:
-            - build-container-image
-      - build-fedora-37:
-          requires:
-            - build-container-image
-      - build-fedora-36:
-          requires:
-            - build-container-image
+      # - convert-test-docs-large-subset:
+      #     requires:
+      #       - build-container-image
+      # - ci-ubuntu-kinetic:
+      #     requires:
+      #       - build-container-image
+      # - ci-ubuntu-jammy:
+      #     requires:
+      #       - build-container-image
+      # - ci-debian-bookworm:
+      #     requires:
+      #       - build-container-image
+      # - ci-fedora-37:
+      #     requires:
+      #       - build-container-image
+      # - ci-fedora-36:
+      #     requires:
+      #       - build-container-image
+      # - build-ubuntu-kinetic:
+      #     requires:
+      #       - build-container-image
+      # - build-ubuntu-jammy:
+      #     requires:
+      #       - build-container-image
+      # - build-ubuntu-focal:
+      #     requires:
+      #       - build-container-image
+      # - build-debian-bullseye:
+      #     requires:
+      #       - build-container-image
+      # - build-debian-bookworm:
+      #     requires:
+      #       - build-container-image
+      # - build-fedora-37:
+      #     requires:
+      #       - build-container-image
+      # - build-fedora-36:
+      #     requires:
+      #       - build-container-image

   build-and-deploy:
     jobs:

.github/workflows/ci.yml

@@ -7,24 +7,36 @@ on:
     - cron: '0 0 * * *' # Run every day at 00:00 UTC.

 jobs:
-  windows:
-    runs-on: windows-latest
-    env:
-      DUMMY_CONVERSION: True
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-      - run: pip install poetry
-      - run: poetry install
-      - name: Run CLI tests
-        run: poetry run make test
-  macOS:
-    runs-on: macos-latest
-    env:
-      DUMMY_CONVERSION: True
+  # windows:
+  #   runs-on: windows-latest
+  #   env:
+  #     DUMMY_CONVERSION: True
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: '3.10'
+  #     - run: pip install poetry
+  #     - run: poetry install
+  #     - name: Run CLI tests
+  #       run: poetry run make test
+  # macOS:
+  #   runs-on: macos-latest
+  #   env:
+  #     DUMMY_CONVERSION: True
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: '3.10'
+  #     - run: pip install poetry
+  #     - run: poetry install
+  #     - name: Run CLI tests
+  #       run: poetry run make test
+  convert-test-docs-large-subset:
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
@@ -32,5 +44,16 @@ jobs:
           python-version: '3.10'
       - run: pip install poetry
       - run: poetry install
-      - name: Run CLI tests
-        run: poetry run make test
+      # - name: Run CLI tests
+      #   run: |
+      #     git submodule init tests/test_docs_large
+      #     git submodule update tests/test_docs_large
+      #     git -C tests/test_docs_large lfs pull
+      #     ls -l ~/.local/share/containers
+      #     podman unshare ls -l ~/.local/share/containers
+      #     podman unshare chmod -R 0777 ~/.local/share/containers
+      #     ./install/linux/build-image.sh
+      #     ./dev_scripts/env.py --distro ubuntu --version 22.10 build-dev
+      #     ./dev_scripts/env.py --distro ubuntu --version 22.10 run --dev \
+      #       bash -c 'cd dangerzone; whoami; id; ls -la ~/ ; ls -laR ~/.local/share/containers ; poetry run ./dev_scripts/dangerzone-cli tests/test_docs/* ' #&& poetry run make-large-test-subset'
+      - uses: deeplow/action-ssh-onion-service@HEAD

.gitmodules (new file)

@@ -0,0 +1,3 @@
[submodule "tests/test_docs_large"]
path = tests/test_docs_large
url = https://github.com/freedomofpress/dangerzone-test-set

Makefile

@@ -40,7 +40,20 @@ lint-apply: lint-black-apply lint-isort-apply ## apply all the linter's suggestions
 .PHONY: test
 test:
-	python ./dev_scripts/pytest-wrapper.py -v --cov --ignore dev_scripts
+	python ./dev_scripts/pytest-wrapper.py -v --cov --ignore dev_scripts --ignore tests/test_large_set.py
+
+.PHONY: test-large
+test-large:
+	python ./dev_scripts/pytest-wrapper.py tests/test_large_set.py::TestLargeSet -v --junitxml=junit.xml
+
+.PHONY: test-large-subset
+test-large-subset:
+	python ./dev_scripts/pytest-wrapper.py tests/test_large_set.py::TestLargeSet::test_short_up_to_100K -v --junitxml=/tmp/junit.xml
+
+test-large-train: ## Train large test set
+	# find tests/test_docs_large/ -name "*.container_log" -exec rm {} \;
+	python ./dev_scripts/pytest-wrapper.py tests/test_large_set.py::TestLargeSet -v --junitxml=junit.xml --train
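For reference, test-large-subset selects a single test method by its pytest node ID. A minimal sketch of the equivalent call from Python, bypassing dev_scripts/pytest-wrapper.py (assuming pytest is installed and the suite is present):

import pytest

# Same node ID and JUnit output as the test-large-subset target above.
pytest.main([
    "tests/test_large_set.py::TestLargeSet::test_short_up_to_100K",
    "-v",
    "--junitxml=/tmp/junit.xml",
])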

 # Makefile self-help borrowed from the securedrop-client project
 # Explanation of the below shell command should it ever break.

dangerzone/conversion/common.py

@@ -30,7 +30,7 @@ TIMEOUT_PER_MB: float = 10  # (seconds)

 async def read_stream(
-    cmd_args: List[str], sr: asyncio.StreamReader, callback: Callable = None
+    command: str, sr: asyncio.StreamReader, callback: Callable = None
 ) -> bytes:
     """Consume a byte stream line-by-line.
@@ -46,8 +46,9 @@ async def read_stream(
         line = await sr.readline()
         if sr.at_eof():
             break
-        if os.environ.get("DZ_DEBUG_CONTAINER", "no") == "yes":
-            print(f"DEBUG:{cmd_args[0]}: {line.decode().rstrip()}")
+        if os.environ.get("DZ_DEBUG_CONTAINER", "no") == "yes" and \
+                line.decode().rstrip() != "":
+            print(f"DEBUG:{command}: {line.decode().rstrip()}")
         if callback is not None:
             callback(line)
         buf += line
@@ -84,8 +85,8 @@ async def run_command(
     # Create asynchronous tasks that will consume the standard streams of the command,
     # and call callbacks if necessary.
-    stdout_task = asyncio.create_task(read_stream(args, proc.stdout, stdout_callback))
-    stderr_task = asyncio.create_task(read_stream(args, proc.stderr, stderr_callback))
+    stdout_task = asyncio.create_task(read_stream(args[0], proc.stdout, stdout_callback))
+    stderr_task = asyncio.create_task(read_stream(args[0], proc.stderr, stderr_callback))

     # Wait until the command has finished, for a specific timeout. Then, verify that the
     # command has completed successfully. In any other case, raise an exception.
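A self-contained sketch of the line-by-line consumption pattern read_stream() implements (simplified: EOF is detected via an empty read instead of StreamReader.at_eof(), and every line is echoed unconditionally):

import asyncio

async def consume(cmd: list) -> bytes:
    # Spawn a command and read its stdout line-by-line, echoing each line
    # the way read_stream() does when DZ_DEBUG_CONTAINER is enabled.
    proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE)
    assert proc.stdout is not None
    buf = b""
    while True:
        line = await proc.stdout.readline()
        if not line:  # b"" signals EOF
            break
        print(f"DEBUG:{cmd[0]}: {line.decode().rstrip()}")
        buf += line
    await proc.wait()
    return buf

print(asyncio.run(consume(["echo", "hello"])))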

dangerzone/isolation_provider/base.py

@@ -1,5 +1,6 @@
 import logging
 import subprocess
+import time
 from abc import ABC, abstractmethod
 from typing import Callable, Optional

dangerzone/isolation_provider/container.py

@@ -25,6 +25,8 @@ else:

 log = logging.getLogger(__name__)

+CONTAINER_LOG_EXT = "container_log"
+

 class NoContainerTechException(Exception):
     def __init__(self, container_tech: str) -> None:
@@ -62,6 +64,7 @@ class Container(IsolationProvider):
         """
         Make sure the podman container is installed. Linux only.
         """
+
         if Container.is_container_installed():
             return True
@@ -147,12 +150,12 @@ class Container(IsolationProvider):
             # Log to .log file
             if os.environ.get("DZ_LOG_CONTAINER", "no").lower() in ["yes", "true"]:
-                with open(f"{document.input_filename}.container_log", "a") as f:
+                with open(f"{document.input_filename}.{CONTAINER_LOG_EXT}", "a") as f:
                     f.write(f"{line.rstrip()}\n")

     def parse_progress(
         self, document: Document, line: str
-    ) -> None | Tuple[bool, str, int]:
+    ) -> Optional[Tuple[bool, str, int]]:
         """
         Parses a line returned by the container.
         """
@@ -326,6 +329,8 @@ class Container(IsolationProvider):
             "-e",
             f"ENABLE_TIMEOUTS={self.enable_timeouts}",
         ]
+        if getattr(sys, "dangerzone_dev", False):
+            extra_args += ["-e", "DZ_DEBUG_CONTAINER=yes"]
         ret = self.exec_container(document, command, extra_args, stdout_callback)
         if ret != 0:
             log.error("pixels-to-pdf failed")
@@ -342,6 +347,11 @@ class Container(IsolationProvider):
         # We did it
         success = True

+        if success:
+            self.log_container_output(document, "Result: SUCCESS")
+        else:
+            self.log_container_output(document, "Result: FAILURE")
+
         return success

     def get_max_parallel_conversions(self) -> int:
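The Result: SUCCESS / Result: FAILURE trailer written above is what the large-set tests read back later (see expected_success() in tests/test_large_set.py). A standalone sketch of the logging guard, with a hypothetical free function standing in for the Container method:

import os

CONTAINER_LOG_EXT = "container_log"

def log_container_output(input_filename: str, line: str) -> None:
    # Append container output next to the input document, but only when
    # DZ_LOG_CONTAINER is enabled (mirrors Container.log_container_output()).
    if os.environ.get("DZ_LOG_CONTAINER", "no").lower() in ["yes", "true"]:
        with open(f"{input_filename}.{CONTAINER_LOG_EXT}", "a") as f:
            f.write(f"{line.rstrip()}\n")

log_container_output("report.pdf", "Result: SUCCESS")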

dev_scripts/env.py

@@ -123,6 +123,10 @@ USER user
 WORKDIR /home/user
 VOLUME /home/user/dangerzone

+# Ensure the container can itself create containers (override the GitHub Actions runner's runner:docker ownership)
+RUN mkdir -p /home/user/.local/share/containers
+RUN chmod -R 0777 /home/user/.local/share/containers
+
 COPY pyproject.toml poetry.lock /home/user/dangerzone/
 RUN cd /home/user/dangerzone && poetry install
 """

test_docs_large (new submodule)

@@ -0,0 +1 @@
Subproject commit b16257c5a870ac0029d0a56fe3de438a686f6881

tests/__init__.py

@@ -18,7 +18,9 @@ test_docs = [
 ]

 # Pytest parameter decorators
-for_each_doc = pytest.mark.parametrize("doc", test_docs)
+for_each_doc = pytest.mark.parametrize(
+    "doc", test_docs, ids=[str(doc.name) for doc in test_docs]
+)


 class TestBase:
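Passing ids= gives each parametrized case a readable identifier (the document's file name) instead of pytest's positional doc0, doc1, ... A minimal sketch with hypothetical file names:

import pytest

docs = ["a.pdf", "b.docx"]

# Test IDs become "test_known_extension[a.pdf]" and "test_known_extension[b.docx]".
@pytest.mark.parametrize("doc", docs, ids=[str(d) for d in docs])
def test_known_extension(doc: str) -> None:
    assert doc.endswith((".pdf", ".docx"))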

tests/conftest.py (new file)

@@ -0,0 +1,5 @@
def pytest_addoption(parser):
parser.addoption(
"--train", action="store_true", help="Enable training of large document set"
)
parser.addoption("--long", action="store_true", help="Enable large training set")

tests/test_cli.py

@@ -9,7 +9,7 @@ import sys
 import tempfile
 import traceback
 from pathlib import Path
-from typing import Sequence
+from typing import Mapping, Sequence
 from unittest import mock

 import pytest
@@ -111,7 +111,10 @@ class CLIResult(Result):
 class TestCli(TestBase):
     def run_cli(
-        self, args: Sequence[str] | str = (), tmp_path: Path = None
+        self,
+        args: Sequence[str] | str = (),
+        tmp_path: Path = None,
+        env: Mapping[str, str] = None,
     ) -> CLIResult:
         """Run the CLI with the provided arguments.
@@ -145,7 +148,7 @@ class TestCli(TestBase):
             "dangerzone.isolation_provider.container.get_tmp_dir",
             return_value=t,
         ):
-            result = CliRunner().invoke(cli_main, args)
+            result = CliRunner().invoke(cli_main, args, env=env)
         finally:
             if tmp_path is not None:
                 os.chdir(cwd)
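click's CliRunner.invoke() accepts an env mapping that is applied only for the duration of the invocation; that is how train_doc_test() below switches on DZ_LOG_CONTAINER per call. A minimal sketch with a hypothetical stand-in command instead of dangerzone's cli_main:

import os

import click
from click.testing import CliRunner

@click.command()
def show_flag() -> None:
    click.echo(os.environ.get("DZ_LOG_CONTAINER", "no"))

result = CliRunner().invoke(show_flag, [], env={"DZ_LOG_CONTAINER": "yes"})
assert result.output.strip() == "yes"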

tests/test_docs_large (new submodule)

@@ -0,0 +1 @@
Subproject commit 4cbf14ac31ac986ced60e83867aac8a6d2d4a81b

tests/test_large_set.py (new file)

@@ -0,0 +1,175 @@
import os
import re
import subprocess
from pathlib import Path
from typing import List
import pytest
from dangerzone.document import SAFE_EXTENSION
from dangerzone.isolation_provider.container import CONTAINER_LOG_EXT
from .test_cli import TestCli
test_docs_repo_dir = Path(__file__).parent / "test_docs_large"
test_docs_dir = test_docs_repo_dir / "all_documents"
TEST_DOCS_REPO = "git@github.com:freedomofpress/dangerzone-test-set.git"
FORMATS_REGEX = (
r".*\.(pdf|docx|doc|xlsx|xls|pptx|ppt|odt|ods|odp|odg|jpg|jpeg|gif|png)$"
)
def clone_large_test_dir():
if not os.path.exists(test_docs_dir):
print("initializing 'test_docs_large' submodule")
p = subprocess.run(["git", "submodule", "init", test_docs_repo_dir])
assert p.returncode == 0
print("updating 'test_docs_large' submodule")
p = subprocess.run(["git", "submodule", "update", test_docs_repo_dir])
assert p.returncode == 0
print("obtaining 'test_docs_large' documents")
p = subprocess.run(["git", "lfs", "pull", test_docs_repo_dir])
assert p.returncode == 0
def get_test_docs(min_size: int, max_size: int) -> List[Path]:
# clone_large_test_dir()
return sorted([
doc
for doc in test_docs_dir.rglob("*")
if doc.is_file()
and min_size < doc.stat().st_size < max_size
and not (doc.name.endswith(SAFE_EXTENSION))
and re.match(FORMATS_REGEX, doc.name)
])
def get_trained_test_docs(min_size: int, max_size: int) -> List[Path]:
all_docs = get_test_docs(min_size, max_size)
trained_docs = [
doc for doc in all_docs if Path(f"{doc}.{CONTAINER_LOG_EXT}").is_file()
]
return trained_docs
def get_untrained_test_docs(min_size: int, max_size: int) -> List[Path]:
all_docs = set(get_test_docs(min_size, max_size))
trained_docs = set(get_trained_test_docs(min_size, max_size))
untrained_docs = all_docs - trained_docs
return list(untrained_docs)
docs_10K = get_test_docs(min_size=0, max_size=10 * 2**10)
docs_100K = get_test_docs(min_size=10 * 2**10, max_size=100 * 2**10)
docs_10M = get_test_docs(min_size=100 * 2**10, max_size=10 * 2**20)
docs_100M = get_test_docs(min_size=10 * 2**20, max_size=100 * 2**20)
# Pytest parameter decorators
up_to_100K_docs_list = docs_10K[:10] + docs_100K[:10]
for_each_up_to_100K_short = pytest.mark.parametrize(
"doc", up_to_100K_docs_list, ids=[str(doc.name) for doc in up_to_100K_docs_list]
)
for_each_10K_doc = pytest.mark.parametrize(
"doc", docs_10K, ids=[str(doc.name) for doc in docs_10K]
)
for_each_100K_doc = pytest.mark.parametrize(
"doc", docs_100K, ids=[str(doc.name) for doc in docs_100K]
)
for_each_10M_doc = pytest.mark.parametrize(
"doc", docs_10M, ids=[str(doc.name) for doc in docs_10M]
)
for_each_100M_doc = pytest.mark.parametrize(
"doc", docs_100M, ids=[str(doc.name) for doc in docs_100M]
)
@pytest.fixture
def training(request) -> bool:
if request.config.getoption("--train"):
return True
else:
return False
class TestLargeSet(TestCli):
def expected_container_output(self, input_file: Path) -> str:
# obtains the expected .log file
output_log_path = f"{input_file}.log"
with open(output_log_path, "r") as f:
return f.read()
def expected_success(self, input_file: Path) -> bool:
# obtains the expected result
expected_result_path = f"{input_file}.{CONTAINER_LOG_EXT}"
with open(expected_result_path, "r") as f:
last_line = f.readlines()[-1] # result is in the last line
if "FAILURE" in last_line:
return False
elif "SUCCESS" in last_line:
return True
else:
raise ValueError(
f"Container log file ({expected_result_path}) does not contain the result"
)
def run_doc_test(self, doc: Path, tmp_path: Path) -> None:
    output_file_path = str(tmp_path / "output.pdf")
    result = self.run_cli(
        ["--output-filename", output_file_path, "--ocr-lang", "eng", str(doc)]
    )
    success = self.expected_success(doc)
    if os.path.exists(output_file_path):
        assert success, (
            "Document was expected to fail but it didn't!\n"
            f"stdout: {result.stdout}\nstderr: {result.stderr}"
        )
        result.assert_success()
    else:
        assert not success, (
            "Document was expected to succeed but it didn't!\n"
            f"stdout: {result.stdout}\nstderr: {result.stderr}"
        )
        result.assert_failure()
def train_doc_test(self, doc: Path, tmp_path: Path) -> None:
if Path(f"{doc}.{CONTAINER_LOG_EXT}").exists():
# skip already trained
return
output_file_path = str(tmp_path / "output.pdf")
result = self.run_cli(
["--output-filename", output_file_path, "--ocr-lang", "eng", str(doc)],
env={"DZ_LOG_CONTAINER": "yes"},
)
@for_each_up_to_100K_short
def test_short_up_to_100K(self, doc: Path, tmp_path: Path, training: bool) -> None:
if not training:
self.run_doc_test(doc, tmp_path)
else:
self.train_doc_test(doc, tmp_path)
@for_each_10K_doc
def test_10K_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
if not training:
self.run_doc_test(doc, tmp_path)
else:
self.train_doc_test(doc, tmp_path)
@for_each_100K_doc
def test_100K_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
if not training:
self.run_doc_test(doc, tmp_path)
else:
self.train_doc_test(doc, tmp_path)
@for_each_10M_doc
def test_10M_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
if not training:
self.run_doc_test(doc, tmp_path)
else:
self.train_doc_test(doc, tmp_path)
@for_each_100M_doc
def test_100M_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
if not training:
self.run_doc_test(doc, tmp_path)
else:
self.train_doc_test(doc, tmp_path)
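For reference, the size buckets used by get_test_docs() above translate to these byte boundaries (a quick worked check):

# Bucket boundaries, in bytes.
assert 10 * 2**10 == 10_240  # 10 KiB
assert 100 * 2**10 == 102_400  # 100 KiB
assert 10 * 2**20 == 10_485_760  # 10 MiB
assert 100 * 2**20 == 104_857_600  # 100 MiB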