From b73ce5bf6af4ca44caf6c1bb8a44944e89ee7b31 Mon Sep 17 00:00:00 2001 From: deeplow Date: Fri, 23 Jun 2023 11:27:23 +0100 Subject: [PATCH] Add large test logic and documentation Adds a large pool of documents that can and should be used prior to a release to understand the effects of the new release in a real-world scenario. Documents are stored in an external git LFS repo under `tests/test_docs_large`, which currently holds about 11K documents gathered from the test sets of multiple PDF readers and office suites. Documentation on how to run the tests is under `docs/developer/TESTING.md` --- .gitmodules | 3 ++ Makefile | 28 +++++++++++-- RELEASE.md | 8 ++++ tests/__init__.py | 4 +- tests/test_cli.py | 4 ++ tests/test_docs_large | 1 + tests/test_large_set.py | 87 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 130 insertions(+), 5 deletions(-) create mode 100644 .gitmodules create mode 160000 tests/test_docs_large create mode 100644 tests/test_large_set.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5d82bd8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "tests/test_docs_large"] + path = tests/test_docs_large + url = https://github.com/freedomofpress/dangerzone-test-set diff --git a/Makefile b/Makefile index 4828b17..4a9d8ac 100644 --- a/Makefile +++ b/Makefile @@ -1,18 +1,22 @@ +LARGE_TEST_REPO_DIR:=tests/test_docs_large +GIT_DESC=$$(git describe) +JUNIT_FLAGS := --capture=sys -o junit_logging=all + .PHONY: lint-black lint-black: ## check python source code formatting issues, with black - black --check --diff --exclude dev_scripts/envs ./ + black --check --diff --exclude dev_scripts/envs --exclude $(LARGE_TEST_REPO_DIR) ./ .PHONY: lint-black-apply lint-black-apply: ## apply black's source code formatting suggestions - black --exclude dev_scripts/envs ./ + black --exclude dev_scripts/envs --exclude $(LARGE_TEST_REPO_DIR) ./ .PHONY: lint-isort lint-isort: ## check imports are organized, with isort - isort --check-only --skip 
dev_scripts/envs ./ + isort --check-only --skip dev_scripts/envs --skip $(LARGE_TEST_REPO_DIR) ./ .PHONY: lint-isort-apply lint-isort-apply: ## apply isort's imports organization suggestions - isort --skip dev_scripts/envs ./ + isort --skip dev_scripts/envs --skip $(LARGE_TEST_REPO_DIR) ./ MYPY_ARGS := --ignore-missing-imports \ --disallow-incomplete-defs \ @@ -44,6 +48,22 @@ test: pytest -v --cov --ignore dev_scripts --ignore tests/gui --ignore tests/test_large_set.py +.PHONY: test-large-requirements +test-large-requirements: + @git-lfs --version || (echo "ERROR: you need to install 'git-lfs'" && false) + @xmllint --version || (echo "ERROR: you need to install 'xmllint'" && false) + +test-large-init: test-large-requirements + @echo "initializing 'test_docs_large' submodule" + git submodule init $(LARGE_TEST_REPO_DIR) + git submodule update $(LARGE_TEST_REPO_DIR) + git lfs pull $(LARGE_TEST_REPO_DIR) + +TEST_LARGE_RESULTS:=$(LARGE_TEST_REPO_DIR)/results/junit/commit_$(GIT_DESC).junit.xml +.PHONY: tests-large +test-large: test-large-init ## Run large test set + python -m pytest tests/test_large_set.py::TestLargeSet -v $(JUNIT_FLAGS) --junitxml=$(TEST_LARGE_RESULTS) + # Makefile self-help borrowed from the securedrop-client project # Explaination of the below shell command should it ever break. # 1. Set the field separator to ": ##" and any make targets that might appear between : and ## diff --git a/RELEASE.md b/RELEASE.md index c08cfa6..83bdf46 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -2,6 +2,14 @@ This section documents the release process. Unless you're a dangerzone developer making a release, you'll probably never need to follow it. +## Large document testing + +In parallel with the QA process, the release candidate should be put through the large document tests on a dedicated machine, which can be left to run overnight. + +Follow the instructions in `docs/developer/TESTING.md` to run the tests. 
+ +These tests will identify any regressions or improvements in terms of document coverage. + ## QA To ensure that new releases do not introduce regressions, and support existing diff --git a/tests/__init__.py b/tests/__init__.py index 06f55db..57e7d0b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -22,7 +22,9 @@ test_docs = [ ] # Pytest parameter decorators -for_each_doc = pytest.mark.parametrize("doc", test_docs) +for_each_doc = pytest.mark.parametrize( + "doc", test_docs, ids=[str(doc.name) for doc in test_docs] +) @pytest.fixture diff --git a/tests/test_cli.py b/tests/test_cli.py index 4a666b7..148918b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -159,6 +159,10 @@ class TestCli: stale_files = list(tmp_dir.iterdir()) assert not stale_files + # XXX Print stdout so that junitXML exports with output capturing + # actually include the stdout + stderr (they are combined into stdout) + print(result.stdout) + return CLIResult.reclass_click_result(result, args) diff --git a/tests/test_docs_large b/tests/test_docs_large new file mode 160000 index 0000000..4cbf14a --- /dev/null +++ b/tests/test_docs_large @@ -0,0 +1 @@ +Subproject commit 4cbf14ac31ac986ced60e83867aac8a6d2d4a81b diff --git a/tests/test_large_set.py b/tests/test_large_set.py new file mode 100644 index 0000000..9d7e650 --- /dev/null +++ b/tests/test_large_set.py @@ -0,0 +1,87 @@ +import os +import re +import subprocess +import time +from pathlib import Path +from typing import List + +import pytest +from _pytest.fixtures import FixtureRequest + +from dangerzone.document import SAFE_EXTENSION + + +test_docs_repo_dir = Path(__file__).parent / "test_docs_large" +test_docs_dir = test_docs_repo_dir / "all_documents" +TEST_DOCS_REPO = "git@github.com:freedomofpress/dangerzone-test-set.git" +FORMATS_REGEX = ( + r".*\.(pdf|docx|doc|xlsx|xls|pptx|ppt|odt|ods|odp|odg|jpg|jpeg|gif|png)$" +) + + +def ensure_test_data_exists() -> None: + if len(os.listdir(test_docs_repo_dir)) == 0: + print("Test 
data repository it empty. Skipping large tests.") + exit(1) + + +def get_test_docs(min_size: int, max_size: int) -> List[Path]: + ensure_test_data_exists() + return sorted( + [ + doc + for doc in test_docs_dir.rglob("*") + if doc.is_file() + and min_size < doc.stat().st_size < max_size + and not (doc.name.endswith(SAFE_EXTENSION)) + and re.match(FORMATS_REGEX, doc.name) + ] + ) + + +docs_10K = get_test_docs(min_size=0, max_size=10 * 2**10) +docs_100K = get_test_docs(min_size=10 * 2**10, max_size=100 * 2**10) +docs_10M = get_test_docs(min_size=100 * 2**10, max_size=10 * 2**20) +docs_100M = get_test_docs(min_size=10 * 2**20, max_size=100 * 2**20) + +# Pytest parameter decorators +for_each_10K_doc = pytest.mark.parametrize( + "doc", docs_10K, ids=[str(doc.name) for doc in docs_10K] +) +for_each_100K_doc = pytest.mark.parametrize( + "doc", docs_100K, ids=[str(doc.name) for doc in docs_100K] +) +for_each_10M_doc = pytest.mark.parametrize( + "doc", docs_10M, ids=[str(doc.name) for doc in docs_10M] +) +for_each_100M_doc = pytest.mark.parametrize( + "doc", docs_100M, ids=[str(doc.name) for doc in docs_100M] +) + + + +class TestLargeSet(): + def run_doc_test(self, doc: Path, tmp_path: Path) -> None: + output_file_path = str(tmp_path / "output.pdf") + p = subprocess.Popen([ + "python", "dev_scripts/dangerzone-cli", "--output-filename", output_file_path, "--ocr-lang", "eng", str(doc) + ], stdout=subprocess.PIPE,stderr=subprocess.STDOUT) + out, _ = p.communicate() + from strip_ansi import strip_ansi + print(strip_ansi(out.decode())) + + @for_each_10K_doc + def test_10K_docs(self, doc: Path, tmp_path: Path) -> None: + self.run_doc_test(doc, tmp_path) + + @for_each_100K_doc + def test_100K_docs(self, doc: Path, tmp_path: Path) -> None: + self.run_doc_test(doc, tmp_path) + + @for_each_10M_doc + def test_10M_docs(self, doc: Path, tmp_path: Path) -> None: + self.run_doc_test(doc, tmp_path) + + @for_each_100M_doc + def test_100M_docs(self, doc: Path, tmp_path: Path) -> None: + 
self.run_doc_test(doc, tmp_path)