mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Multi-stage Dockerfile build
Breaks down the container build into multiple stages in order to speed up build times. Building PyMuPDF was taking too long, and this way its build stage can be cached. The original version was made by @apyrgio.
This commit is contained in:
parent
1cd87f73a8
commit
e0b092692d
1 changed file with 55 additions and 31 deletions
86
Dockerfile
86
Dockerfile
|
@ -1,9 +1,57 @@
|
||||||
FROM alpine:latest
|
###########################################
|
||||||
|
# Build PyMuPDF
|
||||||
|
|
||||||
|
FROM alpine:latest AS pymupdf-build
|
||||||
|
|
||||||
ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9
|
|
||||||
ARG H2ORESTART_CHECKSUM=5db816a1e57b510456633f55e693cb5ef3675ef8b35df4f31c90ab9d4c66071a
|
|
||||||
ARG REQUIREMENTS_TXT
|
ARG REQUIREMENTS_TXT
|
||||||
|
|
||||||
|
# Install PyMuPDF via hash-checked requirements file
|
||||||
|
COPY ${REQUIREMENTS_TXT} /tmp/requirements.txt
|
||||||
|
# Compiler toolchain, kernel headers and Python dev files for the pip install of requirements.txt
RUN apk --no-cache add g++ gcc linux-headers make py3-pip python3-dev
|
||||||
|
RUN pip install --break-system-packages --require-hashes -r /tmp/requirements.txt
|
||||||
|
|
||||||
|
|
||||||
|
###########################################
|
||||||
|
# Download Tesseract data
|
||||||
|
|
||||||
|
FROM alpine:latest AS tessdata-dl
|
||||||
|
ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9
|
||||||
|
|
||||||
|
# Download the trained models from the latest GitHub release of Tesseract, and
|
||||||
|
# store them under /usr/share/tessdata. This is basically what distro packages
|
||||||
|
# do under the hood.
|
||||||
|
#
|
||||||
|
# Because the GitHub release contains more files than just the trained models,
|
||||||
|
# we use `find` to fetch only the '*.traineddata' files in the top directory.
|
||||||
|
#
|
||||||
|
# Before we untar the models, we also check if the checksum is the expected one.
|
||||||
|
RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \
|
||||||
|
&& TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
|
||||||
|
| sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
|
||||||
|
&& wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
|
||||||
|
&& echo "$TESSDATA_CHECKSUM tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
|
||||||
|
&& tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
|
||||||
|
&& find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata/ \; \
|
||||||
|
&& cd .. && rm -r tessdata
|
||||||
|
|
||||||
|
|
||||||
|
###########################################
|
||||||
|
# Download H2ORestart
|
||||||
|
FROM alpine:latest AS h2orestart-dl
|
||||||
|
ARG H2ORESTART_CHECKSUM=5db816a1e57b510456633f55e693cb5ef3675ef8b35df4f31c90ab9d4c66071a
|
||||||
|
RUN mkdir /libreoffice_ext && cd libreoffice_ext \
|
||||||
|
&& H2ORESTART_FILENAME=h2orestart.oxt \
|
||||||
|
&& H2ORESTART_VERSION="v0.5.7" \
|
||||||
|
&& wget https://github.com/ebandal/H2Orestart/releases/download/$H2ORESTART_VERSION/$H2ORESTART_FILENAME \
|
||||||
|
&& echo "$H2ORESTART_CHECKSUM $H2ORESTART_FILENAME" | sha256sum -c \
|
||||||
|
&& install -dm777 "/usr/lib/libreoffice/share/extensions/"
|
||||||
|
|
||||||
|
|
||||||
|
###########################################
|
||||||
|
# Dangerzone image
|
||||||
|
|
||||||
|
FROM alpine:latest
|
||||||
|
|
||||||
# Install dependencies
|
# Install dependencies
|
||||||
RUN apk --no-cache -U upgrade && \
|
RUN apk --no-cache -U upgrade && \
|
||||||
apk --no-cache add \
|
apk --no-cache add \
|
||||||
|
@ -17,35 +65,11 @@ RUN apk --no-cache -U upgrade && \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
font-noto-cjk
|
font-noto-cjk
|
||||||
|
|
||||||
# Install PyMuPDF via hash-checked requirements file
|
# NOTE(review): this path hard-codes Python 3.11, but every stage is based on alpine:latest; confirm the base image still ships python3.11 or pin the Alpine version.
COPY --from=pymupdf-build /usr/lib/python3.11/site-packages/fitz/ /usr/lib/python3.11/site-packages/fitz
|
||||||
COPY ${REQUIREMENTS_TXT} /tmp/requirements.txt
|
COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata
|
||||||
RUN apk --no-cache add --virtual .builddeps linux-headers g++ gcc make python3-dev py3-pip \
|
COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext
|
||||||
&& pip install --break-system-packages --require-hashes -r /tmp/requirements.txt \
|
|
||||||
&& apk del .builddeps
|
|
||||||
|
|
||||||
# Download the trained models from the latest GitHub release of Tesseract, and
|
RUN install -dm777 "/usr/lib/libreoffice/share/extensions/"
|
||||||
# store them under /usr/share/tessdata. This is basically what distro packages
|
|
||||||
# do under the hood.
|
|
||||||
#
|
|
||||||
# Because the GitHub release contains more files than just the trained models,
|
|
||||||
# we use `find` to fetch only the '*.traineddata' files in the top directory.
|
|
||||||
#
|
|
||||||
# Before we untar the models, we also check if the checksum is the expected one.
|
|
||||||
RUN mkdir tessdata && cd tessdata \
|
|
||||||
&& TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
|
|
||||||
| sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
|
|
||||||
&& wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
|
|
||||||
&& echo "$TESSDATA_CHECKSUM tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
|
|
||||||
&& tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
|
|
||||||
&& find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata \; \
|
|
||||||
&& cd .. && rm -r tessdata
|
|
||||||
|
|
||||||
RUN mkdir /libreoffice_ext && cd libreoffice_ext \
|
|
||||||
&& H2ORESTART_FILENAME=h2orestart.oxt \
|
|
||||||
&& H2ORESTART_VERSION="v0.5.7" \
|
|
||||||
&& wget https://github.com/ebandal/H2Orestart/releases/download/$H2ORESTART_VERSION/$H2ORESTART_FILENAME \
|
|
||||||
&& echo "$H2ORESTART_CHECKSUM $H2ORESTART_FILENAME" | sha256sum -c \
|
|
||||||
&& install -dm777 "/usr/lib/libreoffice/share/extensions/"
|
|
||||||
|
|
||||||
ENV PYTHONPATH=/opt/dangerzone
|
ENV PYTHONPATH=/opt/dangerzone
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue