From b73ce5bf6af4ca44caf6c1bb8a44944e89ee7b31 Mon Sep 17 00:00:00 2001
From: deeplow <deeplower@protonmail.com>
Date: Fri, 23 Jun 2023 11:27:23 +0100
Subject: [PATCH] Add large test logic and documentation

Adds a large pool of documents that can and should be used prior to a
release to understand the effects of the new release in a real-world
scenario.

Documents are stored in an external git LFS repo under
`tests/test_docs_large`; currently it holds about 11K documents gathered
from the test sets of multiple PDF readers and office suites.

Documentation on how to run the tests is under
`docs/developer/TESTING.md`
---
 .gitmodules             |  3 ++
 Makefile                | 28 +++++++++++--
 RELEASE.md              |  8 ++++
 tests/__init__.py       |  4 +-
 tests/test_cli.py       |  4 ++
 tests/test_docs_large   |  1 +
 tests/test_large_set.py | 87 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 130 insertions(+), 5 deletions(-)
 create mode 100644 .gitmodules
 create mode 160000 tests/test_docs_large
 create mode 100644 tests/test_large_set.py

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..5d82bd8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "tests/test_docs_large"]
+	path = tests/test_docs_large
+	url = https://github.com/freedomofpress/dangerzone-test-set
diff --git a/Makefile b/Makefile
index 4828b17..4a9d8ac 100644
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,22 @@
+LARGE_TEST_REPO_DIR:=tests/test_docs_large
+GIT_DESC=$$(git describe)
+JUNIT_FLAGS := --capture=sys -o junit_logging=all
+
 .PHONY: lint-black
 lint-black: ## check python source code formatting issues, with black
-	black --check --diff --exclude dev_scripts/envs ./
+	black --check --diff --exclude "dev_scripts/envs|$(LARGE_TEST_REPO_DIR)" ./
 
 .PHONY: lint-black-apply
 lint-black-apply: ## apply black's source code formatting suggestions
-	black --exclude dev_scripts/envs ./
+	black --exclude "dev_scripts/envs|$(LARGE_TEST_REPO_DIR)" ./
 
 .PHONY: lint-isort
 lint-isort: ## check imports are organized, with isort
-	isort --check-only --skip dev_scripts/envs ./
+	isort --check-only --skip dev_scripts/envs --skip $(LARGE_TEST_REPO_DIR) ./
 
 .PHONY: lint-isort-apply
 lint-isort-apply: ## apply isort's imports organization suggestions
-	isort --skip dev_scripts/envs ./
+	isort --skip dev_scripts/envs --skip $(LARGE_TEST_REPO_DIR) ./
 
 MYPY_ARGS := --ignore-missing-imports \
 			 --disallow-incomplete-defs \
@@ -44,6 +48,22 @@ test:
 	pytest -v --cov --ignore dev_scripts --ignore tests/gui --ignore tests/test_large_set.py
 
 
+.PHONY: test-large-requirements test-large-init
+test-large-requirements:  # fail early when git-lfs or xmllint is missing
+	@git-lfs --version || (echo "ERROR: you need to install 'git-lfs'" && false)
+	@xmllint --version || (echo "ERROR: you need to install 'xmllint'" && false)
+
+test-large-init: test-large-requirements  # fetch the submodule and its LFS objects
+	@echo "initializing 'test_docs_large' submodule"
+	git submodule init $(LARGE_TEST_REPO_DIR)
+	git submodule update $(LARGE_TEST_REPO_DIR)
+	git -C $(LARGE_TEST_REPO_DIR) lfs pull
+
+TEST_LARGE_RESULTS:=$(LARGE_TEST_REPO_DIR)/results/junit/commit_$(GIT_DESC).junit.xml
+.PHONY: test-large
+test-large: test-large-init  ## Run large test set
+	python -m pytest tests/test_large_set.py::TestLargeSet -v $(JUNIT_FLAGS) --junitxml=$(TEST_LARGE_RESULTS)
+
 # Makefile self-help borrowed from the securedrop-client project
 # Explaination of the below shell command should it ever break.
 # 1. Set the field separator to ": ##" and any make targets that might appear between : and ##
diff --git a/RELEASE.md b/RELEASE.md
index c08cfa6..83bdf46 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -2,6 +2,14 @@
 
 This section documents the release process. Unless you're a dangerzone developer making a release, you'll probably never need to follow it.
 
+## Large document testing
+
+In parallel with the QA process, the release candidate should be put through the large document tests on a dedicated machine, left to run overnight.
+
+Follow the instructions in `docs/developer/TESTING.md` to run the tests.
+
+These tests will identify any regressions or improvements in terms of document coverage.
+
 ## QA
 
 To ensure that new releases do not introduce regressions, and support existing
diff --git a/tests/__init__.py b/tests/__init__.py
index 06f55db..57e7d0b 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -22,7 +22,9 @@ test_docs = [
 ]
 
 # Pytest parameter decorators
-for_each_doc = pytest.mark.parametrize("doc", test_docs)
+for_each_doc = pytest.mark.parametrize(
+    "doc", test_docs, ids=[str(doc.name) for doc in test_docs]
+)
 
 
 @pytest.fixture
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 4a666b7..148918b 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -159,6 +159,10 @@ class TestCli:
                     stale_files = list(tmp_dir.iterdir())
                     assert not stale_files
 
+        # XXX Print stdout so that junitXML exports with output capturing
+        # actually include the stdout + stderr (they are combined into stdout)
+        print(result.stdout)
+
         return CLIResult.reclass_click_result(result, args)
 
 
diff --git a/tests/test_docs_large b/tests/test_docs_large
new file mode 160000
index 0000000..4cbf14a
--- /dev/null
+++ b/tests/test_docs_large
@@ -0,0 +1 @@
+Subproject commit 4cbf14ac31ac986ced60e83867aac8a6d2d4a81b
diff --git a/tests/test_large_set.py b/tests/test_large_set.py
new file mode 100644
index 0000000..9d7e650
--- /dev/null
+++ b/tests/test_large_set.py
@@ -0,0 +1,87 @@
+import os
+import re
+import subprocess
+import time
+from pathlib import Path
+from typing import List
+
+import pytest
+from _pytest.fixtures import FixtureRequest
+
+from dangerzone.document import SAFE_EXTENSION
+
+
+test_docs_repo_dir = Path(__file__).parent / "test_docs_large"
+test_docs_dir = test_docs_repo_dir / "all_documents"
+TEST_DOCS_REPO = "git@github.com:freedomofpress/dangerzone-test-set.git"
+FORMATS_REGEX = (
+    r".*\.(pdf|docx|doc|xlsx|xls|pptx|ppt|odt|ods|odp|odg|jpg|jpeg|gif|png)$"
+)
+
+
+def ensure_test_data_exists() -> None:
+    """Abort with exit status 1 if the test documents checkout is empty."""
+    if not os.listdir(test_docs_repo_dir):
+        raise SystemExit("Test data repository is empty. Skipping large tests.")
+
+
+def get_test_docs(min_size: int, max_size: int) -> List[Path]:
+    """Return the sorted test documents whose size is in (min_size, max_size]."""
+    ensure_test_data_exists()
+    return sorted(
+        doc
+        for doc in test_docs_dir.rglob("*")
+        if doc.is_file()
+        # Inclusive upper bound: a boundary-sized file lands in exactly one bucket.
+        and min_size < doc.stat().st_size <= max_size
+        and not doc.name.endswith(SAFE_EXTENSION)
+        and re.match(FORMATS_REGEX, doc.name)
+    )
+
+
+docs_10K = get_test_docs(min_size=0, max_size=10 * 2**10)  # up to 10 KiB
+docs_100K = get_test_docs(min_size=10 * 2**10, max_size=100 * 2**10)  # 10-100 KiB
+docs_10M = get_test_docs(min_size=100 * 2**10, max_size=10 * 2**20)  # 100 KiB-10 MiB
+docs_100M = get_test_docs(min_size=10 * 2**20, max_size=100 * 2**20)  # 10-100 MiB
+
+# Pytest parameter decorators: one per size bucket, with file names as test ids
+for_each_10K_doc = pytest.mark.parametrize(
+    "doc", docs_10K, ids=[str(doc.name) for doc in docs_10K]
+)
+for_each_100K_doc = pytest.mark.parametrize(
+    "doc", docs_100K, ids=[str(doc.name) for doc in docs_100K]
+)
+for_each_10M_doc = pytest.mark.parametrize(
+    "doc", docs_10M, ids=[str(doc.name) for doc in docs_10M]
+)
+for_each_100M_doc = pytest.mark.parametrize(
+    "doc", docs_100M, ids=[str(doc.name) for doc in docs_100M]
+)
+
+
+
+class TestLargeSet:
+    def run_doc_test(self, doc: Path, tmp_path: Path) -> None:
+        """Convert `doc` with dangerzone-cli; fail on a non-zero exit status."""
+        cmd = ["python", "dev_scripts/dangerzone-cli", "--output-filename"]
+        cmd += [str(tmp_path / "output.pdf"), "--ocr-lang", "eng", str(doc)]
+        p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        from strip_ansi import strip_ansi
+        print(strip_ansi(p.stdout.decode()))  # captured into the junit XML report
+        assert p.returncode == 0, f"dangerzone-cli exited with {p.returncode}"
+
+    @for_each_10K_doc
+    def test_10K_docs(self, doc: Path, tmp_path: Path) -> None:
+        self.run_doc_test(doc, tmp_path)
+
+    @for_each_100K_doc
+    def test_100K_docs(self, doc: Path, tmp_path: Path) -> None:
+        self.run_doc_test(doc, tmp_path)
+
+    @for_each_10M_doc
+    def test_10M_docs(self, doc: Path, tmp_path: Path) -> None:
+        self.run_doc_test(doc, tmp_path)
+
+    @for_each_100M_doc
+    def test_100M_docs(self, doc: Path, tmp_path: Path) -> None:
+        self.run_doc_test(doc, tmp_path)