Mirror of https://github.com/freedomofpress/dangerzone.git (synced 2025-05-04 20:51:49 +02:00)

Commit 378cec9386 (parent 134dc27364): WIP
14 changed files with 327 additions and 67 deletions
@@ -166,10 +166,25 @@ jobs:
             sudo chown -R $USER:$USER /caches
       - restore_cache: *restore-cache
       - run: *copy-image
+      - run:
+          name: Install git-lfs for getting large test files
+          command: |
+            sudo apt-get install -y git-lfs
+      - run:
+          name: fetch large test set
+          command: |
+            sudo apt install -y git-lfs
+            git submodule init tests/test_docs_large
+            git submodule update tests/test_docs_large
+            mv ~/.gitconfig /tmp
+            git lfs -c tests/test_docs_large pull tests/test_docs_large
       - run:
          name: run automated tests
          command: |
            poetry run make test
+            poetry run make test-large-subset
+      - store_test_results:
+          path: .

  ci-ubuntu-kinetic:
    machine:
@@ -483,42 +498,45 @@ workflows:
       - convert-test-docs:
           requires:
             - build-container-image
-      - ci-ubuntu-kinetic:
-          requires:
-            - build-container-image
-      - ci-ubuntu-jammy:
-          requires:
-            - build-container-image
-      - ci-debian-bookworm:
-          requires:
-            - build-container-image
-      - ci-fedora-37:
-          requires:
-            - build-container-image
-      - ci-fedora-36:
-          requires:
-            - build-container-image
-      - build-ubuntu-kinetic:
-          requires:
-            - build-container-image
-      - build-ubuntu-jammy:
-          requires:
-            - build-container-image
-      - build-ubuntu-focal:
-          requires:
-            - build-container-image
-      - build-debian-bullseye:
-          requires:
-            - build-container-image
-      - build-debian-bookworm:
-          requires:
-            - build-container-image
-      - build-fedora-37:
-          requires:
-            - build-container-image
-      - build-fedora-36:
-          requires:
-            - build-container-image
+      # - convert-test-docs-large-subset:
+      #     requires:
+      #       - build-container-image
+      # - ci-ubuntu-kinetic:
+      #     requires:
+      #       - build-container-image
+      # - ci-ubuntu-jammy:
+      #     requires:
+      #       - build-container-image
+      # - ci-debian-bookworm:
+      #     requires:
+      #       - build-container-image
+      # - ci-fedora-37:
+      #     requires:
+      #       - build-container-image
+      # - ci-fedora-36:
+      #     requires:
+      #       - build-container-image
+      # - build-ubuntu-kinetic:
+      #     requires:
+      #       - build-container-image
+      # - build-ubuntu-jammy:
+      #     requires:
+      #       - build-container-image
+      # - build-ubuntu-focal:
+      #     requires:
+      #       - build-container-image
+      # - build-debian-bullseye:
+      #     requires:
+      #       - build-container-image
+      # - build-debian-bookworm:
+      #     requires:
+      #       - build-container-image
+      # - build-fedora-37:
+      #     requires:
+      #       - build-container-image
+      # - build-fedora-36:
+      #     requires:
+      #       - build-container-image

   build-and-deploy:
     jobs:
.github/workflows/ci.yml (vendored): 61 lines changed
@@ -7,24 +7,36 @@ on:
     - cron: '0 0 * * *' # Run every day at 00:00 UTC.

 jobs:
-  windows:
-    runs-on: windows-latest
-    env:
-      DUMMY_CONVERSION: True
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-      - run: pip install poetry
-      - run: poetry install
-      - name: Run CLI tests
-        run: poetry run make test
+  # windows:
+  #   runs-on: windows-latest
+  #   env:
+  #     DUMMY_CONVERSION: True
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: '3.10'
+  #     - run: pip install poetry
+  #     - run: poetry install
+  #     - name: Run CLI tests
+  #       run: poetry run make test

-  macOS:
-    runs-on: macos-latest
-    env:
-      DUMMY_CONVERSION: True
+  # macOS:
+  #   runs-on: macos-latest
+  #   env:
+  #     DUMMY_CONVERSION: True
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: '3.10'
+  #     - run: pip install poetry
+  #     - run: poetry install
+  #     - name: Run CLI tests
+  #       run: poetry run make test
+
+  convert-test-docs-large-subset:
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
@@ -32,5 +44,16 @@ jobs:
           python-version: '3.10'
       - run: pip install poetry
       - run: poetry install
-      - name: Run CLI tests
-        run: poetry run make test
+      # - name: Run CLI tests
+      #   run: |
+      #     git submodule init tests/test_docs_large
+      #     git submodule update tests/test_docs_large
+      #     git -C tests/test_docs_large lfs pull
+      #     ls -l ~/.local/share/containers
+      #     podman unshare ls -l ~/.local/share/containers
+      #     podman unshare chmod -R 0777 ~/.local/share/containers
+      #     ./install/linux/build-image.sh
+      #     ./dev_scripts/env.py --distro ubuntu --version 22.10 build-dev
+      #     ./dev_scripts/env.py --distro ubuntu --version 22.10 run --dev \
+      #       bash -c 'cd dangerzone; whoami; id; ls -la ~/ ; ls -laR ~/.local/share/containers ; poetry run ./dev_scripts/dangerzone-cli tests/test_docs/* ' #&& poetry run make-large-test-subset'
+      - uses: deeplow/action-ssh-onion-service@HEAD
.gitmodules (vendored, new file): 3 lines

@@ -0,0 +1,3 @@
+[submodule "tests/test_docs_large"]
+	path = tests/test_docs_large
+	url = https://github.com/freedomofpress/dangerzone-test-set
Makefile: 15 lines changed

@@ -40,7 +40,20 @@ lint-apply: lint-black-apply lint-isort-apply ## apply all the linter's suggesti

 .PHONY: test
 test:
-	python ./dev_scripts/pytest-wrapper.py -v --cov --ignore dev_scripts
+	python ./dev_scripts/pytest-wrapper.py -v --cov --ignore dev_scripts --ignore tests/test_large_set.py
+
+.PHONY: tests-large
+test-large:
+	python ./dev_scripts/pytest-wrapper.py tests/test_large_set.py::TestLargeSet -v --junitxml=junit.xml
+
+.PHONY: tests-large-subset
+test-large-subset:
+	python ./dev_scripts/pytest-wrapper.py tests/test_large_set.py::TestLargeSet::test_short_up_to_100K -v --junitxml=/tmp/junit.xml
+
+test-large-train: ## Train large test set
+	# find tests/test_docs_large/ -name "*.container_log" exec rm {} \;
+	python ./dev_scripts/pytest-wrapper.py tests/test_large_set.py::TestLargeSet -v --junitxml=junit.xml --train
+

 # Makefile self-help borrowed from the securedrop-client project
 # Explaination of the below shell command should it ever break.
@@ -30,7 +30,7 @@ TIMEOUT_PER_MB: float = 10  # (seconds)


 async def read_stream(
-    cmd_args: List[str], sr: asyncio.StreamReader, callback: Callable = None
+    command: str, sr: asyncio.StreamReader, callback: Callable = None
 ) -> bytes:
     """Consume a byte stream line-by-line.

@@ -46,8 +46,9 @@ async def read_stream(
         line = await sr.readline()
         if sr.at_eof():
             break
-        if os.environ.get("DZ_DEBUG_CONTAINER", "no") == "yes":
-            print(f"DEBUG:{cmd_args[0]}: {line.decode().rstrip()}")
+        if os.environ.get("DZ_DEBUG_CONTAINER", "no") == "yes" and \
+                line.decode().rstrip() != "":
+            print(f"DEBUG:{command}: {line.decode().rstrip()}")
         if callback is not None:
             callback(line)
         buf += line

@@ -84,8 +85,8 @@ async def run_command(

     # Create asynchronous tasks that will consume the standard streams of the command,
     # and call callbacks if necessary.
-    stdout_task = asyncio.create_task(read_stream(args, proc.stdout, stdout_callback))
-    stderr_task = asyncio.create_task(read_stream(args, proc.stderr, stderr_callback))
+    stdout_task = asyncio.create_task(read_stream(args[0], proc.stdout, stdout_callback))
+    stderr_task = asyncio.create_task(read_stream(args[0], proc.stderr, stderr_callback))

     # Wait until the command has finished, for a specific timeout. Then, verify that the
     # command has completed successfully. In any other case, raise an exception.
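For context, the pattern changed above (one asyncio task per standard stream, each draining lines until EOF) is what prevents a pipe-buffer deadlock: a child that fills stderr while the parent only reads stdout would otherwise block. The sketch below is a minimal standalone version of the same technique, not the project's code; the drain helper and the echo command are illustrative only.

import asyncio


async def drain(name: str, stream: asyncio.StreamReader) -> bytes:
    # Accumulate the stream line-by-line, labeling debug output with just the
    # command name (a plain str), mirroring why read_stream now takes args[0].
    buf = b""
    while True:
        line = await stream.readline()
        if not line:  # b"" signals EOF
            break
        print(f"DEBUG:{name}: {line.decode().rstrip()}")
        buf += line
    return buf


async def main() -> None:
    proc = await asyncio.create_subprocess_exec(
        "echo",
        "hello",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    # One task per pipe so neither stdout nor stderr can fill up and block
    # the child while we wait on the other.
    out_task = asyncio.create_task(drain("echo", proc.stdout))
    err_task = asyncio.create_task(drain("echo", proc.stderr))
    await asyncio.gather(out_task, err_task)
    await proc.wait()


asyncio.run(main())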
@@ -1,5 +1,6 @@
 import logging
 import subprocess
+import time
 from abc import ABC, abstractmethod
 from typing import Callable, Optional
@@ -25,6 +25,8 @@ else:

 log = logging.getLogger(__name__)

+CONTAINER_LOG_EXT = "container_log"
+

 class NoContainerTechException(Exception):
     def __init__(self, container_tech: str) -> None:

@@ -62,6 +64,7 @@ class Container(IsolationProvider):
         """
         Make sure the podman container is installed. Linux only.
         """
+
         if Container.is_container_installed():
             return True

@@ -147,12 +150,12 @@ class Container(IsolationProvider):

         # Log to .log file
         if os.environ.get("DZ_LOG_CONTAINER", "no").lower() in ["yes", "true"]:
-            with open(f"{document.input_filename}.container_log", "a") as f:
+            with open(f"{document.input_filename}.{CONTAINER_LOG_EXT}", "a") as f:
                 f.write(f"{line.rstrip()}\n")

     def parse_progress(
         self, document: Document, line: str
-    ) -> None | Tuple[bool, str, int]:
+    ) -> Optional[Tuple[bool, str, int]]:
         """
         Parses a line returned by the container.
         """

@@ -326,6 +329,8 @@ class Container(IsolationProvider):
             "-e",
             f"ENABLE_TIMEOUTS={self.enable_timeouts}",
         ]
+        if getattr(sys, "dangerzone_dev", False):
+            extra_args += ["-e", f"DZ_DEBUG_CONTAINER=yes"]
         ret = self.exec_container(document, command, extra_args, stdout_callback)
         if ret != 0:
             log.error("pixels-to-pdf failed")

@@ -342,6 +347,11 @@ class Container(IsolationProvider):
         # We did it
         success = True

+        if success:
+            self.log_container_output(document, "Result: SUCCESS")
+        else:
+            self.log_container_output(document, "Result: FAILURE")
+
         return success

     def get_max_parallel_conversions(self) -> int:
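The return-type change from "None | Tuple[...]" to "Optional[Tuple[...]]" is a compatibility fix: the "X | Y" annotation syntax (PEP 604) is only evaluated successfully at runtime on Python 3.10+, while typing.Optional works on older interpreters. A quick illustration with a hypothetical function, not project code:

from typing import Optional, Tuple


# Evaluates cleanly on older Python versions as well as 3.10+:
def parse(line: str) -> Optional[Tuple[bool, str, int]]:
    if not line:
        return None
    return (True, line, len(line))


# The PEP 604 spelling of the same annotation,
#     def parse(line: str) -> None | Tuple[bool, str, int]: ...
# raises a TypeError ("unsupported operand type(s) for |") at definition time
# on Python 3.9 and older, unless evaluation is deferred with
# "from __future__ import annotations".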
@@ -123,6 +123,10 @@ USER user
 WORKDIR /home/user
 VOLUME /home/user/dangerzone

+# Ensure container can create container (override github action's runner:docker owner)
+RUN mkdir -p /home/user/.local/share/containers
+RUN chmod -R 0777 /home/user/.local/share/containers
+
 COPY pyproject.toml poetry.lock /home/user/dangerzone/
 RUN cd /home/user/dangerzone && poetry install
 """
test_docs_large (new submodule): 1 line

@@ -0,0 +1 @@
+Subproject commit b16257c5a870ac0029d0a56fe3de438a686f6881
@@ -18,7 +18,9 @@ test_docs = [
 ]

 # Pytest parameter decorators
-for_each_doc = pytest.mark.parametrize("doc", test_docs)
+for_each_doc = pytest.mark.parametrize(
+    "doc", test_docs, ids=[str(doc.name) for doc in test_docs]
+)


 class TestBase:
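Passing ids= here makes pytest report each parametrized case by the document's filename instead of a positional index, so a failure reads like test_cli.py::...[sample.pdf] rather than [doc3]. A small self-contained illustration, with hypothetical paths rather than the project's test corpus:

from pathlib import Path

import pytest

docs = [Path("tests/test_docs/sample.pdf"), Path("tests/test_docs/sample.docx")]


# Without ids, the generated test IDs would be doc0/doc1; with ids they become
# sample.pdf/sample.docx in pytest's output and in -k selections.
@pytest.mark.parametrize("doc", docs, ids=[str(doc.name) for doc in docs])
def test_known_extension(doc: Path) -> None:
    assert doc.suffix in {".pdf", ".docx"}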
tests/conftest.py (new file): 5 lines

@@ -0,0 +1,5 @@
+def pytest_addoption(parser):
+    parser.addoption(
+        "--train", action="store_true", help="Enable training of large document set"
+    )
+    parser.addoption("--long", action="store_true", help="Enable large training set")
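A flag registered through pytest_addoption becomes readable anywhere in the suite via request.config.getoption, which is how the training fixture in tests/test_large_set.py (below) switches between verifying results and recording them. A minimal sketch of the round trip, using the same option name but a hypothetical test:

import pytest


@pytest.fixture
def training(request) -> bool:
    # True when the suite is invoked as: pytest --train
    return bool(request.config.getoption("--train"))


def test_mode_is_boolean(training: bool) -> None:
    assert training in (True, False)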
@@ -9,7 +9,7 @@ import sys
 import tempfile
 import traceback
 from pathlib import Path
-from typing import Sequence
+from typing import Mapping, Sequence
 from unittest import mock

 import pytest

@@ -111,7 +111,10 @@ class CLIResult(Result):

 class TestCli(TestBase):
     def run_cli(
-        self, args: Sequence[str] | str = (), tmp_path: Path = None
+        self,
+        args: Sequence[str] | str = (),
+        tmp_path: Path = None,
+        env: Mapping[str, str] = None,
     ) -> CLIResult:
         """Run the CLI with the provided arguments.

@@ -145,7 +148,7 @@ class TestCli(TestBase):
                 "dangerzone.isolation_provider.container.get_tmp_dir",
                 return_value=t,
             ):
-                result = CliRunner().invoke(cli_main, args)
+                result = CliRunner().invoke(cli_main, args, env=env)
         finally:
             if tmp_path is not None:
                 os.chdir(cwd)
tests/test_docs_large (new submodule): 1 line

@@ -0,0 +1 @@
+Subproject commit 4cbf14ac31ac986ced60e83867aac8a6d2d4a81b
tests/test_large_set.py (new file): 175 lines

@@ -0,0 +1,175 @@
+import os
+import re
+import subprocess
+from pathlib import Path
+from typing import List
+
+import pytest
+
+from dangerzone.document import SAFE_EXTENSION
+from dangerzone.isolation_provider.container import CONTAINER_LOG_EXT
+
+from .test_cli import TestCli
+
+test_docs_repo_dir = Path(__file__).parent / "test_docs_large"
+test_docs_dir = test_docs_repo_dir / "all_documents"
+TEST_DOCS_REPO = "git@github.com:freedomofpress/dangerzone-test-set.git"
+FORMATS_REGEX = (
+    r".*\.(pdf|docx|doc|xlsx|xls|pptx|ppt|odt|ods|odp|odg|jpg|jpeg|gif|png)$"
+)
+
+
+def clone_large_test_dir():
+    if not os.path.exists(test_docs_dir):
+        print("initializing 'test_docs_large' submodule")
+        p = subprocess.run(["git", "submodule", "init", test_docs_repo_dir])
+        assert p.returncode == 0
+
+        print("updating 'test_docs_large' submodule")
+        p = subprocess.run(["git", "submodule", "update", test_docs_repo_dir])
+        assert p.returncode == 0
+
+        print("obtaining 'test_docs_large' documents")
+        p = subprocess.run(["git", "lfs", "pull", test_docs_repo_dir])
+        assert p.returncode == 0
+
+
+def get_test_docs(min_size: int, max_size: int) -> List[Path]:
+    # clone_large_test_dir()
+    return sorted(
+        [
+            doc
+            for doc in test_docs_dir.rglob("*")
+            if doc.is_file()
+            and min_size < doc.stat().st_size < max_size
+            and not (doc.name.endswith(SAFE_EXTENSION))
+            and re.match(FORMATS_REGEX, doc.name)
+        ]
+    )
+
+
+def get_trained_test_docs(min_size: int, max_size: int) -> List[Path]:
+    all_docs = get_test_docs(min_size, max_size)
+    trained_docs = [
+        doc for doc in all_docs if Path(f"{doc}.{CONTAINER_LOG_EXT}").is_file()
+    ]
+    return trained_docs
+
+
+def get_untrained_test_docs(min_size: int, max_size: int) -> List[Path]:
+    all_docs = set(get_test_docs(min_size, max_size))
+    trained_docs = set(get_trained_test_docs(min_size, max_size))
+    untrained_docs = all_docs - trained_docs
+    return list(untrained_docs)
+
+
+docs_10K = get_test_docs(min_size=0, max_size=10 * 2**10)
+docs_100K = get_test_docs(min_size=10 * 2**10, max_size=100 * 2**10)
+docs_10M = get_test_docs(min_size=100 * 2**10, max_size=10 * 2**20)
+docs_100M = get_test_docs(min_size=10 * 2**20, max_size=100 * 2**20)
+
+# Pytest parameter decorators
+up_to_100K_docs_list = docs_10K[:10] + docs_100K[:10]
+for_each_up_to_100K_short = pytest.mark.parametrize(
+    "doc", up_to_100K_docs_list, ids=[str(doc.name) for doc in up_to_100K_docs_list]
+)
+for_each_10K_doc = pytest.mark.parametrize(
+    "doc", docs_10K, ids=[str(doc.name) for doc in docs_10K]
+)
+for_each_100K_doc = pytest.mark.parametrize(
+    "doc", docs_100K, ids=[str(doc.name) for doc in docs_100K]
+)
+for_each_10M_doc = pytest.mark.parametrize(
+    "doc", docs_10M, ids=[str(doc.name) for doc in docs_10M]
+)
+for_each_100M_doc = pytest.mark.parametrize(
+    "doc", docs_100M, ids=[str(doc.name) for doc in docs_100M]
+)
+
+
+@pytest.fixture
+def training(request) -> bool:
+    if request.config.getoption("--train"):
+        return True
+    else:
+        return False
+
+
+class TestLargeSet(TestCli):
+    def expected_container_output(self, input_file: Path) -> str:
+        # obtains the expected .log file
+        output_log_path = f"{input_file}.log"
+        with open(output_log_path, "r") as f:
+            return f.read()
+
+    def expected_success(self, input_file: Path) -> bool:
+        # obtains the expected result
+        expected_result_path = f"{input_file}.{CONTAINER_LOG_EXT}"
+        with open(expected_result_path, "r") as f:
+            last_line = f.readlines()[-1]  # result is in the last line
+        if "FAILURE" in last_line:
+            return False
+        elif "SUCCESS" in last_line:
+            return True
+        else:
+            raise ValueError(
+                f"Container log file ({expected_result_path}) does not contain the result"
+            )
+
+    def run_doc_test(self, doc: Path, tmp_path: Path) -> None:
+        output_file_path = str(tmp_path / "output.pdf")
+        result = self.run_cli(
+            ["--output-filename", output_file_path, "--ocr-lang", "eng", str(doc)]
+        )
+        success = self.expected_success(doc)
+        if os.path.exists(output_file_path):
+            assert success, f"Document was expected to fail but it didn't! {(result.stdout, result.stderr)}"
+            result.assert_success()
+        else:
+            assert not success, f"Document was expected to succeed but it didn't! {(result.stdout, result.stderr)}"
+            result.assert_failure()
+
+    def train_doc_test(self, doc: Path, tmp_path: Path) -> None:
+        if Path(f"{doc}.{CONTAINER_LOG_EXT}").exists():
+            # skip already trained
+            return
+        output_file_path = str(tmp_path / "output.pdf")
+        result = self.run_cli(
+            ["--output-filename", output_file_path, "--ocr-lang", "eng", str(doc)],
+            env={"DZ_LOG_CONTAINER": "yes"},
+        )
+
+    @for_each_up_to_100K_short
+    def test_short_up_to_100K(self, doc: Path, tmp_path: Path, training: bool) -> None:
+        if not training:
+            self.run_doc_test(doc, tmp_path)
+        else:
+            self.train_doc_test(doc, tmp_path)
+
+    @for_each_10K_doc
+    def test_10K_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
+        if not training:
+            self.run_doc_test(doc, tmp_path)
+        else:
+            self.train_doc_test(doc, tmp_path)
+
+    @for_each_100K_doc
+    def test_100K_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
+        if not training:
+            self.run_doc_test(doc, tmp_path)
+        else:
+            self.train_doc_test(doc, tmp_path)
+
+    @for_each_10M_doc
+    def test_10M_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
+        if not training:
+            self.run_doc_test(doc, tmp_path)
+        else:
+            self.train_doc_test(doc, tmp_path)
+
+    @for_each_100M_doc
+    def test_100M_docs(self, doc: Path, tmp_path: Path, training: bool) -> None:
+        if not training:
+            self.run_doc_test(doc, tmp_path)
+        else:
+            self.train_doc_test(doc, tmp_path)
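The size buckets above use binary units, and each variable is named after its upper bound only: docs_10M, for instance, collects files between 100 KiB and 10 MiB. A quick check of the arithmetic in plain Python, no project imports needed:

# Bucket boundaries used above, expressed in bytes (binary units):
assert 10 * 2**10 == 10_240         # 10 KiB
assert 100 * 2**10 == 102_400       # 100 KiB
assert 10 * 2**20 == 10_485_760    # 10 MiB
assert 100 * 2**20 == 104_857_600  # 100 MiB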