mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Remove Dockerfile dependencies replaced by PyMuPDF
PyMuPDF replaced the need for almost all dependencies, which this commit now removes. We are also removing tesseract-ocr as a dependency since (to our surprise) PyMuPDF ships directly with tesseract binaries [1]. However, now that tesseract-ocr is no longer available as a standalone binary in the container, the `test_ocr.py` test needed to be changed. Fixes #658 [1]: https://github.com/freedomofpress/dangerzone/issues/658#issuecomment-1861033149
This commit is contained in:
parent
ee35e28aa6
commit
f676891482
3 changed files with 19 additions and 7 deletions
|
@ -13,6 +13,7 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or
|
||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
- Feature: Add support for HWP/HWPX files (Hancom Office) for macOS Apple Silicon devices ([issue #498](https://github.com/freedomofpress/dangerzone/issues/498), thanks to [@OctopusET](https://github.com/OctopusET))
|
- Feature: Add support for HWP/HWPX files (Hancom Office) for macOS Apple Silicon devices ([issue #498](https://github.com/freedomofpress/dangerzone/issues/498), thanks to [@OctopusET](https://github.com/OctopusET))
|
||||||
|
- Replace Dangerzone document rendering engine from pdftoppm to PyMuPDF, essentially replacing a variety of tools (gm / tesseract / pdfunite / ps2pdf) ([issue #658](https://github.com/freedomofpress/dangerzone/issues/658))
|
||||||
|
|
||||||
## Dangerzone 0.5.1
|
## Dangerzone 0.5.1
|
||||||
|
|
||||||
|
|
|
@ -55,14 +55,10 @@ FROM alpine:latest
|
||||||
# Install dependencies
|
# Install dependencies
|
||||||
RUN apk --no-cache -U upgrade && \
|
RUN apk --no-cache -U upgrade && \
|
||||||
apk --no-cache add \
|
apk --no-cache add \
|
||||||
ghostscript \
|
|
||||||
libreoffice \
|
libreoffice \
|
||||||
openjdk8 \
|
openjdk8 \
|
||||||
poppler-utils \
|
|
||||||
poppler-data \
|
|
||||||
python3 \
|
python3 \
|
||||||
py3-magic \
|
py3-magic \
|
||||||
tesseract-ocr \
|
|
||||||
font-noto-cjk
|
font-noto-cjk
|
||||||
|
|
||||||
COPY --from=pymupdf-build /usr/lib/python3.11/site-packages/fitz/ /usr/lib/python3.11/site-packages/fitz
|
COPY --from=pymupdf-build /usr/lib/python3.11/site-packages/fitz/ /usr/lib/python3.11/site-packages/fitz
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import platform
|
import platform
|
||||||
import subprocess
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
@ -16,16 +17,30 @@ def test_ocr_ommisions() -> None:
|
||||||
# Create the command that will list all the installed languages in the container
|
# Create the command that will list all the installed languages in the container
|
||||||
# image.
|
# image.
|
||||||
runtime = Container.get_runtime()
|
runtime = Container.get_runtime()
|
||||||
command = [runtime, "run", Container.CONTAINER_NAME, "tesseract", "--list-langs"]
|
command = [
|
||||||
|
runtime,
|
||||||
|
"run",
|
||||||
|
Container.CONTAINER_NAME,
|
||||||
|
"find",
|
||||||
|
"/usr/share/tessdata/",
|
||||||
|
"-name",
|
||||||
|
"*.traineddata",
|
||||||
|
]
|
||||||
|
|
||||||
# Run the command, strip any extra whitespace, and remove the following first line
|
# Run the command, strip any extra whitespace, and remove the following first line
|
||||||
# from the result:
|
# from the result:
|
||||||
#
|
#
|
||||||
# List of available languages in "/usr/share/tessdata/" ...
|
# List of available languages in "/usr/share/tessdata/" ...
|
||||||
installed_langs = set(
|
installed_langs_filenames = (
|
||||||
subprocess.run(command, text=True, check=True, stdout=subprocess.PIPE)
|
subprocess.run(command, text=True, check=True, stdout=subprocess.PIPE)
|
||||||
.stdout.strip()
|
.stdout.strip()
|
||||||
.split("\n")[1:]
|
.split("\n")
|
||||||
|
)
|
||||||
|
installed_langs = set(
|
||||||
|
[
|
||||||
|
Path(filename).name.split(".traineddata")[0]
|
||||||
|
for filename in installed_langs_filenames
|
||||||
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Remove the "osd" and "equ" languages from the list of installed languages, since
|
# Remove the "osd" and "equ" languages from the list of installed languages, since
|
||||||
|
|
Loading…
Reference in a new issue