mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 09:52:37 +02:00
Add large test logic and documentation
Adds a large pool of documents that can and should be used prior to a release to understand the effects of the new release in a real-world scenario. The documents are stored in an external git LFS repo, checked out as a submodule under `tests/test_docs_large`; currently it holds about 11K documents gathered from the test sets of multiple PDF readers and office suites. Documentation on how to run the tests is under `docs/developer/TESTING.md`.
This commit is contained in:
parent
f41cefde1d
commit
b73ce5bf6a
7 changed files with 130 additions and 5 deletions
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
[submodule "tests/test_docs_large"]
|
||||
path = tests/test_docs_large
|
||||
url = https://github.com/freedomofpress/dangerzone-test-set
|
28
Makefile
28
Makefile
|
@ -1,18 +1,22 @@
|
|||
LARGE_TEST_REPO_DIR:=tests/test_docs_large
|
||||
GIT_DESC=$$(git describe)
|
||||
JUNIT_FLAGS := --capture=sys -o junit_logging=all
|
||||
|
||||
.PHONY: lint-black
|
||||
lint-black: ## check python source code formatting issues, with black
|
||||
black --check --diff --exclude dev_scripts/envs ./
|
||||
black --check --diff --exclude dev_scripts/envs --exclude $(LARGE_TEST_REPO_DIR) ./
|
||||
|
||||
.PHONY: lint-black-apply
|
||||
lint-black-apply: ## apply black's source code formatting suggestions
|
||||
black --exclude dev_scripts/envs ./
|
||||
black --exclude dev_scripts/envs --exclude $(LARGE_TEST_REPO_DIR) ./
|
||||
|
||||
.PHONY: lint-isort
|
||||
lint-isort: ## check imports are organized, with isort
|
||||
isort --check-only --skip dev_scripts/envs ./
|
||||
isort --check-only --skip dev_scripts/envs --skip $(LARGE_TEST_REPO_DIR) ./
|
||||
|
||||
.PHONY: lint-isort-apply
|
||||
lint-isort-apply: ## apply isort's imports organization suggestions
|
||||
isort --skip dev_scripts/envs ./
|
||||
isort --skip dev_scripts/envs --skip $(LARGE_TEST_REPO_DIR) ./
|
||||
|
||||
MYPY_ARGS := --ignore-missing-imports \
|
||||
--disallow-incomplete-defs \
|
||||
|
@ -44,6 +48,22 @@ test:
|
|||
pytest -v --cov --ignore dev_scripts --ignore tests/gui --ignore tests/test_large_set.py
|
||||
|
||||
|
||||
.PHONY: test-large-requirements
|
||||
test-large-requirements:
|
||||
@git-lfs --version || (echo "ERROR: you need to install 'git-lfs'" && false)
|
||||
@xmllint --version || (echo "ERROR: you need to install 'xmllint'" && false)
|
||||
|
||||
test-large-init: test-large-requirements
|
||||
@echo "initializing 'test_docs_large' submodule"
|
||||
git submodule init $(LARGE_TEST_REPO_DIR)
|
||||
git submodule update $(LARGE_TEST_REPO_DIR)
|
||||
git lfs pull $(LARGE_TEST_REPO_DIR)
|
||||
|
||||
TEST_LARGE_RESULTS:=$(LARGE_TEST_REPO_DIR)/results/junit/commit_$(GIT_DESC).junit.xml
|
||||
# Run the large document test set and write a JUnit XML report named after
# the current `git describe` output. The .PHONY entry must name the target
# exactly ("test-large"), otherwise make does not treat the target as phony.
.PHONY: test-large
test-large: test-large-init ## Run large test set
	python -m pytest tests/test_large_set.py::TestLargeSet -v $(JUNIT_FLAGS) --junitxml=$(TEST_LARGE_RESULTS)
|
||||
|
||||
# Makefile self-help borrowed from the securedrop-client project
|
||||
# Explanation of the below shell command should it ever break.
|
||||
# 1. Set the field separator to ": ##" and any make targets that might appear between : and ##
|
||||
|
|
|
@ -2,6 +2,14 @@
|
|||
|
||||
This section documents the release process. Unless you're a dangerzone developer making a release, you'll probably never need to follow it.
|
||||
|
||||
## Large document testing
|
||||
|
||||
Parallel to the QA process, the release candidate should be put through the large document tests on a dedicated machine, running overnight.
|
||||
|
||||
Follow the instructions in `docs/developer/TESTING.md` to run the tests.
|
||||
|
||||
These tests will identify any regressions or improvements in terms of document coverage.
|
||||
|
||||
## QA
|
||||
|
||||
To ensure that new releases do not introduce regressions, and support existing
|
||||
|
|
|
@ -22,7 +22,9 @@ test_docs = [
|
|||
]
|
||||
|
||||
# Pytest parameter decorators
|
||||
for_each_doc = pytest.mark.parametrize("doc", test_docs)
|
||||
for_each_doc = pytest.mark.parametrize(
|
||||
"doc", test_docs, ids=[str(doc.name) for doc in test_docs]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
|
@ -159,6 +159,10 @@ class TestCli:
|
|||
stale_files = list(tmp_dir.iterdir())
|
||||
assert not stale_files
|
||||
|
||||
# XXX Print stdout so that junitXML exports with output capturing
|
||||
# actually include the stdout + stderr (they are combined into stdout)
|
||||
print(result.stdout)
|
||||
|
||||
return CLIResult.reclass_click_result(result, args)
|
||||
|
||||
|
||||
|
|
1
tests/test_docs_large
Submodule
1
tests/test_docs_large
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit 4cbf14ac31ac986ced60e83867aac8a6d2d4a81b
|
87
tests/test_large_set.py
Normal file
87
tests/test_large_set.py
Normal file
|
@ -0,0 +1,87 @@
|
|||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
from _pytest.fixtures import FixtureRequest
|
||||
|
||||
from dangerzone.document import SAFE_EXTENSION
|
||||
|
||||
|
||||
test_docs_repo_dir = Path(__file__).parent / "test_docs_large"
|
||||
test_docs_dir = test_docs_repo_dir / "all_documents"
|
||||
TEST_DOCS_REPO = "git@github.com:freedomofpress/dangerzone-test-set.git"
|
||||
FORMATS_REGEX = (
|
||||
r".*\.(pdf|docx|doc|xlsx|xls|pptx|ppt|odt|ods|odp|odg|jpg|jpeg|gif|png)$"
|
||||
)
|
||||
|
||||
|
||||
def ensure_test_data_exists() -> None:
    """Abort the run when the large test-set checkout is missing or empty.

    Checks ``test_docs_repo_dir`` and terminates the process if it is not a
    directory or contains no entries (presumably the state of a submodule
    that has not been initialized yet).

    Raises:
        SystemExit: with status 1 when the directory is absent or empty.
    """
    # ``exit`` is injected by the ``site`` module for interactive sessions and
    # is not guaranteed to exist (e.g. under ``python -S``); ``sys.exit`` is.
    import sys

    # ``is_dir()`` guards ``iterdir()`` so that a missing directory produces
    # the same clear message instead of a FileNotFoundError traceback.
    if not test_docs_repo_dir.is_dir() or not any(test_docs_repo_dir.iterdir()):
        print("Test data repository is empty. Skipping large tests.")
        sys.exit(1)
|
||||
|
||||
|
||||
def get_test_docs(min_size: int, max_size: int) -> List[Path]:
    """Return the sorted test documents whose size falls in one size bucket.

    A path under ``test_docs_dir`` (searched recursively) is selected when it
    is a regular file, its size lies in the half-open range
    ``(min_size, max_size]``, its name does not end with ``SAFE_EXTENSION``
    and its name matches ``FORMATS_REGEX``.

    Args:
        min_size: Exclusive lower bound on the file size, in bytes. With
            ``min_size=0`` empty files are therefore left out.
        max_size: Inclusive upper bound on the file size, in bytes.

    Returns:
        The matching paths, sorted.

    Raises:
        SystemExit: propagated from ``ensure_test_data_exists()`` when the
            test data is not available.
    """
    ensure_test_data_exists()
    return sorted(
        doc
        for doc in test_docs_dir.rglob("*")
        if doc.is_file()
        # The upper bound is inclusive so that adjacent buckets (where one
        # bucket's max_size is the next one's min_size) leave no gap: a file
        # whose size is exactly on a boundary belongs to the smaller bucket.
        and min_size < doc.stat().st_size <= max_size
        and not doc.name.endswith(SAFE_EXTENSION)
        and re.match(FORMATS_REGEX, doc.name)
    )
|
||||
|
||||
|
||||
# Size units, in bytes, used to spell out the bucket limits below.
_KIB = 1024
_MIB = 1024 * 1024

# Test documents grouped into consecutive size buckets; the upper limit of
# each bucket is the lower limit of the next one.
docs_10K = get_test_docs(min_size=0, max_size=10 * _KIB)
docs_100K = get_test_docs(min_size=10 * _KIB, max_size=100 * _KIB)
docs_10M = get_test_docs(min_size=100 * _KIB, max_size=10 * _MIB)
docs_100M = get_test_docs(min_size=10 * _MIB, max_size=100 * _MIB)

# Pytest parameter decorators: one test case per document in a bucket, with
# the document's file name as the test id.
for_each_10K_doc = pytest.mark.parametrize(
    "doc",
    docs_10K,
    ids=[doc.name for doc in docs_10K],
)
for_each_100K_doc = pytest.mark.parametrize(
    "doc",
    docs_100K,
    ids=[doc.name for doc in docs_100K],
)
for_each_10M_doc = pytest.mark.parametrize(
    "doc",
    docs_10M,
    ids=[doc.name for doc in docs_10M],
)
for_each_100M_doc = pytest.mark.parametrize(
    "doc",
    docs_100M,
    ids=[doc.name for doc in docs_100M],
)
|
||||
|
||||
|
||||
|
||||
class TestLargeSet:
    """Run the dangerzone CLI over every document of the large test set.

    There is one parametrized test method per size bucket; all of them
    delegate to ``run_doc_test``.
    """

    def run_doc_test(self, doc: Path, tmp_path: Path) -> None:
        """Convert ``doc`` through the CLI and fail if the conversion fails.

        Args:
            doc: Path of the document to convert.
            tmp_path: Per-test temporary directory; the converted file is
                written there as ``output.pdf``.

        Raises:
            AssertionError: when the CLI process exits with a non-zero status.
        """
        # Imported lazily so this third-party helper is only required when a
        # conversion is actually run.
        from strip_ansi import strip_ansi

        output_file_path = str(tmp_path / "output.pdf")
        result = subprocess.run(
            [
                "python",
                "dev_scripts/dangerzone-cli",
                "--output-filename",
                output_file_path,
                "--ocr-lang",
                "eng",
                str(doc),
            ],
            stdout=subprocess.PIPE,
            # Fold stderr into stdout so a single stream carries everything.
            stderr=subprocess.STDOUT,
        )

        # Print the (colour-stripped) output so that pytest's capturing can
        # attach it to the test report. Undecodable bytes are replaced rather
        # than letting a decoding error mask the real test outcome.
        print(strip_ansi(result.stdout.decode(errors="replace")))

        # Without this check the test would pass even when conversion failed.
        assert (
            result.returncode == 0
        ), f"dangerzone-cli exited with status {result.returncode} for {doc}"

    @for_each_10K_doc
    def test_10K_docs(self, doc: Path, tmp_path: Path) -> None:
        """Convert each document of the ``docs_10K`` bucket."""
        self.run_doc_test(doc, tmp_path)

    @for_each_100K_doc
    def test_100K_docs(self, doc: Path, tmp_path: Path) -> None:
        """Convert each document of the ``docs_100K`` bucket."""
        self.run_doc_test(doc, tmp_path)

    @for_each_10M_doc
    def test_10M_docs(self, doc: Path, tmp_path: Path) -> None:
        """Convert each document of the ``docs_10M`` bucket."""
        self.run_doc_test(doc, tmp_path)

    @for_each_100M_doc
    def test_100M_docs(self, doc: Path, tmp_path: Path) -> None:
        """Convert each document of the ``docs_100M`` bucket."""
        self.run_doc_test(doc, tmp_path)
|
Loading…
Reference in a new issue