dangerzone/container/Dockerfile
deeplow d28aa5a25b
Remove PDFtk dependency (replace w/ pdftoppm)
PDFtk actually isn't needed. It was being used for breaking a PDF
into pages but this is something that be replaced by the already present
'pdftoppm'. Furthermore, by removing this dependency we contribute to
reproducible builds and overall supply chain security because it was
obtained from gitlab with no signature verification or version pinning.

The replacement 'pdftoppm' enabled us to do a shortcut:
 - before: PDF -> PDF pages -> PNG images -> RGB images
 - after:  PDF -> PPM images -> RGB images

And this last conversion step is trivial since the RGB format we were
using is just a PPM file without the metadata in its header.
2023-01-23 14:00:57 +00:00

93 lines
2.5 KiB
Docker

FROM alpine:latest
# Install dependencies
RUN apk -U upgrade && \
apk add \
ghostscript \
graphicsmagick \
libreoffice \
openjdk8 \
poppler-utils \
python3 \
py3-magic \
sudo \
tesseract-ocr \
tesseract-ocr-data-afr \
tesseract-ocr-data-ara \
tesseract-ocr-data-aze \
tesseract-ocr-data-bel \
tesseract-ocr-data-ben \
tesseract-ocr-data-bul \
tesseract-ocr-data-cat \
tesseract-ocr-data-ces \
tesseract-ocr-data-chi_sim \
tesseract-ocr-data-chi_tra \
tesseract-ocr-data-chr \
tesseract-ocr-data-dan \
tesseract-ocr-data-deu \
tesseract-ocr-data-ell \
tesseract-ocr-data-enm \
tesseract-ocr-data-epo \
tesseract-ocr-data-equ \
tesseract-ocr-data-est \
tesseract-ocr-data-eus \
tesseract-ocr-data-fin \
tesseract-ocr-data-fra \
tesseract-ocr-data-frk \
tesseract-ocr-data-frm \
tesseract-ocr-data-glg \
tesseract-ocr-data-grc \
tesseract-ocr-data-heb \
tesseract-ocr-data-hin \
tesseract-ocr-data-hrv \
tesseract-ocr-data-hun \
tesseract-ocr-data-ind \
tesseract-ocr-data-isl \
tesseract-ocr-data-ita \
tesseract-ocr-data-ita_old \
tesseract-ocr-data-jpn \
tesseract-ocr-data-kan \
tesseract-ocr-data-kat \
tesseract-ocr-data-kor \
tesseract-ocr-data-lav \
tesseract-ocr-data-lit \
tesseract-ocr-data-mal \
tesseract-ocr-data-mkd \
tesseract-ocr-data-mlt \
tesseract-ocr-data-msa \
tesseract-ocr-data-nld \
tesseract-ocr-data-nor \
tesseract-ocr-data-pol \
tesseract-ocr-data-por \
tesseract-ocr-data-ron \
tesseract-ocr-data-rus \
tesseract-ocr-data-slk \
tesseract-ocr-data-slv \
tesseract-ocr-data-spa \
tesseract-ocr-data-spa_old \
tesseract-ocr-data-sqi \
tesseract-ocr-data-srp \
tesseract-ocr-data-swa \
tesseract-ocr-data-swe \
tesseract-ocr-data-tam \
tesseract-ocr-data-tel \
tesseract-ocr-data-tgl \
tesseract-ocr-data-tha \
tesseract-ocr-data-tur \
tesseract-ocr-data-ukr \
tesseract-ocr-data-vie
COPY dangerzone.py /usr/local/bin/
RUN chmod +x /usr/local/bin/dangerzone.py
# Add the unprivileged user
RUN adduser -s /bin/sh -D dangerzone
USER dangerzone
# /tmp/input_file is where the first convert expects the input file to be, and
# /tmp where it will write the pixel files
#
# /dangerzone is where the second script expects files to be put by the first one
#
# /safezone is where the wrapper eventually moves the sanitized files.
VOLUME /dangerzone /tmp/input_file /safezone