mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00

Reporting script now parses JunitXML instead of a series of ".container_log" files. The script is in the changed submodule. Additionally, it makes failed tests actually fail so that this is recorded in the JunitXML report.
99 lines
2.9 KiB
Python
99 lines
2.9 KiB
Python
import os
|
|
import re
|
|
import subprocess
|
|
import time
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
import pytest
|
|
from _pytest.fixtures import FixtureRequest
|
|
|
|
from dangerzone.document import SAFE_EXTENSION
|
|
|
|
from .test_cli import TestCli
|
|
|
|
# Directory, next to this module, that holds the large test-document set.
test_docs_repo_dir = Path(__file__).parent.joinpath("test_docs_large")
# Sub-directory that get_test_docs() walks recursively to find documents.
test_docs_dir = test_docs_repo_dir.joinpath("all_documents")
# Git remote of the test-document repository.
TEST_DOCS_REPO = "git@github.com:freedomofpress/dangerzone-test-set.git"
# A file name must end in one of these extensions to be picked up as a test
# document. get_test_docs() applies it with re.match() and no flags, so the
# comparison is case-sensitive.
FORMATS_REGEX = (
    r".*\."
    r"(pdf|docx|doc|xlsx|xls|pptx|ppt|odt|ods|odp|odg|jpg|jpeg|gif|png)$"
)
|
|
|
|
|
|
def ensure_test_data_exists() -> None:
    """Abort the run when the large test-document repository has no content.

    The check inspects ``test_docs_repo_dir``. A directory that is missing
    altogether is treated the same way as an empty one, so an absent checkout
    yields the explanatory message rather than a ``FileNotFoundError``
    traceback from the directory listing.

    Raises:
        SystemExit: With exit status 1 when the directory is missing or empty.
    """
    if test_docs_repo_dir.is_dir() and any(test_docs_repo_dir.iterdir()):
        return

    print("Test data repository is empty. Skipping large tests.")
    # The ``exit()`` builtin is injected by the ``site`` module for interactive
    # sessions; raising SystemExit directly terminates with the same status
    # without depending on that helper being present.
    raise SystemExit(1)
|
|
|
|
|
|
def get_test_docs(min_size: int, max_size: int) -> List[Path]:
    """Collect the test documents whose size lies strictly between two bounds.

    ``ensure_test_data_exists()`` is called first. ``test_docs_dir`` is then
    walked recursively and an entry is kept only when it:

    * is a regular file,
    * has a size in bytes greater than ``min_size`` and smaller than
      ``max_size`` (both bounds are exclusive),
    * has a name that does not end with ``SAFE_EXTENSION``, and
    * has a name that matches ``FORMATS_REGEX``.

    Args:
        min_size: Exclusive lower size bound, in bytes.
        max_size: Exclusive upper size bound, in bytes.

    Returns:
        The matching paths in sorted order.
    """
    ensure_test_data_exists()

    supported_format = re.compile(FORMATS_REGEX)
    selected: List[Path] = []
    for candidate in test_docs_dir.rglob("*"):
        if not candidate.is_file():
            continue
        if not (min_size < candidate.stat().st_size < max_size):
            continue
        if candidate.name.endswith(SAFE_EXTENSION):
            continue
        if supported_format.match(candidate.name) is None:
            continue
        selected.append(candidate)

    selected.sort()
    return selected
|
|
|
|
|
|
# Test documents grouped into size buckets. The bounds are byte counts and are
# both exclusive (see get_test_docs()); each list is computed at import time.
_KIB = 2**10
_MIB = 2**20

docs_10K = get_test_docs(min_size=0, max_size=10 * _KIB)
docs_100K = get_test_docs(min_size=10 * _KIB, max_size=100 * _KIB)
docs_10M = get_test_docs(min_size=100 * _KIB, max_size=10 * _MIB)
docs_100M = get_test_docs(min_size=10 * _MIB, max_size=100 * _MIB)
|
|
|
|
# Pytest parameter decorators: each one hands a test every document of one size
# bucket through the ``doc`` argument and uses the file name as the test id.
for_each_10K_doc = pytest.mark.parametrize(
    "doc",
    docs_10K,
    ids=[path.name for path in docs_10K],
)
for_each_100K_doc = pytest.mark.parametrize(
    "doc",
    docs_100K,
    ids=[path.name for path in docs_100K],
)
for_each_10M_doc = pytest.mark.parametrize(
    "doc",
    docs_10M,
    ids=[path.name for path in docs_10M],
)
for_each_100M_doc = pytest.mark.parametrize(
    "doc",
    docs_100M,
    ids=[path.name for path in docs_100M],
)
|
|
|
|
|
|
class TestLargeSet(TestCli):
    """Run the dangerzone CLI over every document of the large test set.

    There is one parametrized test per size bucket; each test delegates to
    ``run_doc_test`` for a single document.
    """

    def run_doc_test(self, doc: Path, tmp_path: Path) -> None:
        """Convert ``doc`` with the CLI and fail on a non-zero exit status.

        The combined stdout/stderr of the CLI is printed with ANSI escape
        sequences stripped.

        Args:
            doc: Document handed to the CLI for conversion.
            tmp_path: Directory in which the CLI is told to write
                ``output.pdf``.

        Raises:
            AssertionError: When the CLI exits with a non-zero status.
        """
        import sys

        from strip_ansi import strip_ansi

        output_file_path = str(tmp_path / "output.pdf")
        result = subprocess.run(
            [
                # Launch the CLI with the interpreter that is running the
                # tests instead of whatever "python" resolves to on PATH.
                sys.executable,
                # Relative path: resolved against the current working
                # directory of the test run.
                "dev_scripts/dangerzone-cli",
                "--output-filename",
                output_file_path,
                "--ocr-lang",
                "eng",
                str(doc),
            ],
            stdout=subprocess.PIPE,
            # Fold stderr into stdout so a single stream is captured.
            stderr=subprocess.STDOUT,
        )

        # Undecodable bytes in the CLI output must not raise here and hide
        # the actual result of the conversion.
        print(strip_ansi(result.stdout.decode(errors="replace")))
        assert (
            result.returncode == 0
        ), f"dangerzone-cli exited with status {result.returncode} for {doc}"

    @for_each_10K_doc
    def test_10K_docs(self, doc: Path, tmp_path: Path) -> None:
        """Convert each document of the ``docs_10K`` bucket."""
        self.run_doc_test(doc, tmp_path)

    @for_each_100K_doc
    def test_100K_docs(self, doc: Path, tmp_path: Path) -> None:
        """Convert each document of the ``docs_100K`` bucket."""
        self.run_doc_test(doc, tmp_path)

    @for_each_10M_doc
    def test_10M_docs(self, doc: Path, tmp_path: Path) -> None:
        """Convert each document of the ``docs_10M`` bucket."""
        self.run_doc_test(doc, tmp_path)

    @for_each_100M_doc
    def test_100M_docs(self, doc: Path, tmp_path: Path) -> None:
        """Convert each document of the ``docs_100M`` bucket."""
        self.run_doc_test(doc, tmp_path)
|