mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00

Change our Dockerfile and image build script to install PyMuPDF from the prebuilt Python wheels for musl. Note that this work is incomplete, because it doesn't handle the case of aarch64 wheels, which are still not prebuilt for musl. Closes #850
117 lines
4.5 KiB
Docker
117 lines
4.5 KiB
Docker
###########################################
# Build PyMuPDF
#
# Throwaway stage: PyMuPDF is installed here with pip, and later stages copy
# only the resulting site-packages directories out of it.

FROM alpine:latest AS pymupdf-build

# Path (inside the build context) of the requirements file to install from.
# It has no default, so it must be passed with `--build-arg`.
ARG REQUIREMENTS_TXT

# Install PyMuPDF via hash-checked requirements file
RUN apk --no-cache add py3-pip
COPY ${REQUIREMENTS_TXT} /tmp/requirements.txt
# `--require-hashes` makes pip refuse any package whose hash is not listed in
# the requirements file. `--break-system-packages` lets pip install into the
# system site-packages directory. `--no-cache-dir` avoids keeping pip's
# download cache in the layer.
RUN pip install -vv --no-cache-dir --break-system-packages --require-hashes -r /tmp/requirements.txt
|
|
|
|
|
|
###########################################
# Download Tesseract data

FROM alpine:latest AS tessdata-dl

# SHA-256 of the release tarball that is expected to be downloaded below.
ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9

# Let a failing `wget` in the version-detection pipeline fail the whole
# pipeline, instead of being masked by the exit status of `sed`.
SHELL ["/bin/ash", "-o", "pipefail", "-c"]

# Download the trained models from the latest GitHub release of Tesseract, and
# store them under /usr/share/tessdata. This is basically what distro packages
# do under the hood.
#
# Because the GitHub release contains more files than just the trained models,
# we use `find` to fetch only the '*.traineddata' files in the top directory.
#
# Before we untar the models, we also check if the checksum is the expected one.
# The checksum line uses the "<hash>  <file>" layout (two spaces) that
# `sha256sum -c` expects. We also stop early if no release tag could be parsed
# from the GitHub API response.
RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \
    && TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
        | sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
    && [ -n "$TESSDATA_VERSION" ] \
    && wget "https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz" \
    && echo "$TESSDATA_CHECKSUM  tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
    && tar -xzvf "tessdata_fast-$TESSDATA_VERSION.tar.gz" -C . \
    && find . -maxdepth 2 -name '*.traineddata' -exec cp {} /usr/share/tessdata/ \; \
    && cd .. && rm -r tessdata
|
|
|
|
|
|
###########################################
# Download H2ORestart
#
# Throwaway stage: fetches the H2Orestart LibreOffice extension into
# /libreoffice_ext, which is the only path later stages copy from here.

FROM alpine:latest AS h2orestart-dl

# SHA-256 of the h2orestart.oxt file for the version pinned below.
ARG H2ORESTART_CHECKSUM=5db816a1e57b510456633f55e693cb5ef3675ef8b35df4f31c90ab9d4c66071a

# Download the pinned release and verify it before it can be used. The
# checksum line uses the "<hash>  <file>" layout (two spaces) that
# `sha256sum -c` expects. The directory is entered by absolute path so the
# step does not depend on the current working directory.
RUN mkdir /libreoffice_ext && cd /libreoffice_ext \
    && H2ORESTART_FILENAME=h2orestart.oxt \
    && H2ORESTART_VERSION="v0.5.7" \
    && wget "https://github.com/ebandal/H2Orestart/releases/download/$H2ORESTART_VERSION/$H2ORESTART_FILENAME" \
    && echo "$H2ORESTART_CHECKSUM  $H2ORESTART_FILENAME" | sha256sum -c
|
|
|
|
|
|
###########################################
# Dangerzone image

FROM alpine:latest AS dangerzone-image

# Bring the base system up to date, then add the runtime dependencies.
RUN apk --no-cache -U upgrade \
    && apk --no-cache add \
        font-noto-cjk \
        libreoffice \
        openjdk8 \
        py3-magic \
        python3

# Pull in the artifacts prepared by the earlier stages: the PyMuPDF Python
# packages, the Tesseract trained models and the H2Orestart extension.
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDFb.libs/ /usr/lib/python3.12/site-packages/PyMuPDFb.libs
COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata
COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext

# Create the world-writable LibreOffice extensions directory, and lay out
# /opt/dangerzone/dangerzone as a Python package (empty __init__.py) that will
# hold the conversion code.
RUN install -dm777 "/usr/lib/libreoffice/share/extensions/" \
    && mkdir -p /opt/dangerzone/dangerzone \
    && touch /opt/dangerzone/dangerzone/__init__.py
COPY conversion /opt/dangerzone/dangerzone/conversion

# Create the unprivileged `dangerzone` user and group. Both IDs are fixed to
# 1000, because the OCI config refers to them by number.
#
# NOTE: The /home/dangerzone directory gets a tmpfs mounted over it, so
# nothing the image places in there will be persisted.
RUN addgroup -g 1000 dangerzone \
    && adduser -D -u 1000 -G dangerzone -h /home/dangerzone -s /bin/true dangerzone
|
|
|
|
###########################################
# gVisor wrapper image

FROM alpine:latest

# Update the base system and add Python.
RUN apk --no-cache -U upgrade \
    && apk --no-cache add python3

# Fetch the latest `runsc` release for this machine's architecture together
# with its published SHA-512 file, verify the binary against it, and install
# it read-only/executable under /usr/bin.
RUN GVISOR_URL="https://storage.googleapis.com/gvisor/releases/release/latest/$(uname -m)"; \
    wget "${GVISOR_URL}/runsc" "${GVISOR_URL}/runsc.sha512" \
    && sha512sum -c runsc.sha512 \
    && rm -f runsc.sha512 \
    && mv runsc /usr/bin/runsc \
    && chmod 555 /usr/bin/runsc

# Create the unprivileged `dangerzone` user and its group.
RUN addgroup dangerzone \
    && adduser -D -G dangerzone -h /home/dangerzone -s /bin/true dangerzone

# Everything from here on runs as the `dangerzone` user.
USER dangerzone

# Prepare two directories in the user's home: one that receives the Dangerzone
# image built by the previous stages, and one where gVisor will keep the state
# of its containers.
RUN mkdir /home/dangerzone/dangerzone-image /home/dangerzone/.containers

# Place the whole filesystem of the Dangerzone image under `rootfs`.
COPY --from=dangerzone-image / /home/dangerzone/dangerzone-image/rootfs

COPY gvisor_wrapper/entrypoint.py /

ENTRYPOINT ["/entrypoint.py"]
|