diff --git a/CHANGELOG.md b/CHANGELOG.md index f808968..4d312d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or ### Changed - Feature: Add support for HWP/HWPX files (Hancom Office) for macOS Apple Silicon devices ([issue #498](https://github.com/freedomofpress/dangerzone/issues/498), thanks to [@OctopusET](https://github.com/OctopusET)) +- Replace Dangerzone document rendering engine from pdftoppm to PyMuPDF, essentially replacing a variety of tools (gm / tesseract / pdfunite / ps2pdf) ([issue #658](https://github.com/freedomofpress/dangerzone/issues/658)) ## Dangerzone 0.5.1 diff --git a/Dockerfile b/Dockerfile index cd2490a..c46325c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -55,14 +55,10 @@ FROM alpine:latest # Install dependencies RUN apk --no-cache -U upgrade && \ apk --no-cache add \ - ghostscript \ libreoffice \ openjdk8 \ - poppler-utils \ - poppler-data \ python3 \ py3-magic \ - tesseract-ocr \ font-noto-cjk COPY --from=pymupdf-build /usr/lib/python3.11/site-packages/fitz/ /usr/lib/python3.11/site-packages/fitz diff --git a/tests/test_ocr.py b/tests/test_ocr.py index 1e9f3a0..dc23ea5 100644 --- a/tests/test_ocr.py +++ b/tests/test_ocr.py @@ -1,5 +1,6 @@ import platform import subprocess +from pathlib import Path import pytest @@ -16,16 +17,30 @@ def test_ocr_ommisions() -> None: # Create the command that will list all the installed languages in the container # image. runtime = Container.get_runtime() - command = [runtime, "run", Container.CONTAINER_NAME, "tesseract", "--list-langs"] + command = [ + runtime, + "run", + Container.CONTAINER_NAME, + "find", + "/usr/share/tessdata/", + "-name", + "*.traineddata", + ] # Run the command, strip any extra whitespace, and remove the following first line # from the result: # # List of available languages in "/usr/share/tessdata/" ... 
- installed_langs = set( + installed_langs_filenames = ( subprocess.run(command, text=True, check=True, stdout=subprocess.PIPE) .stdout.strip() - .split("\n")[1:] + .split("\n") + ) + installed_langs = set( + [ + Path(filename).name.split(".traineddata")[0] + for filename in installed_langs_filenames + ] ) # Remove the "osd" and "equ" languages from the list of installed languages, since