diff --git a/dangerzone-converter/.circleci/config.yml b/dangerzone-converter/.circleci/config.yml
new file mode 100644
index 0000000..ebee675
--- /dev/null
+++ b/dangerzone-converter/.circleci/config.yml
@@ -0,0 +1,108 @@
+version: 2.1
+
+executors:
+  docker-publisher:
+    environment:
+      IMAGE_NAME: flmcode/dangerzone
+    docker:
+      - image: circleci/python
+
+jobs:
+
+  build:
+    executor: docker-publisher
+    steps:
+      - checkout
+      - setup_remote_docker
+      - run:
+          name: Build docker image
+          command: docker build -t $IMAGE_NAME:latest .
+      - run:
+          name: Archive docker image
+          command: docker save -o image.tar $IMAGE_NAME
+      - persist_to_workspace:
+          root: .
+          paths:
+            - ./image.tar
+
+  publish-latest:
+    executor: docker-publisher
+    steps:
+      - attach_workspace:
+          at: /tmp/workspace
+      - setup_remote_docker
+      - run:
+          name: Load archived docker image
+          command: docker load -i /tmp/workspace/image.tar
+      - run:
+          name: Publish image to docker Hub
+          command: |
+            echo $DOCKERHUB_PASSWORD | docker login -u $DOCKERHUB_USERNAME --password-stdin
+            docker push $IMAGE_NAME:latest
+
+  publish-tag:
+    executor: docker-publisher
+    steps:
+      - attach_workspace:
+          at: /tmp/workspace
+      - setup_remote_docker
+      - run:
+          name: Load archived docker image
+          command: docker load -i /tmp/workspace/image.tar
+      - run:
+          name: Publish image to docker Hub
+          command: |
+            echo $DOCKERHUB_PASSWORD | docker login -u $DOCKERHUB_USERNAME --password-stdin
+            IMAGE_TAG=${CIRCLE_TAG/v/''}
+            docker tag $IMAGE_NAME:latest $IMAGE_NAME:$IMAGE_TAG
+            docker push $IMAGE_NAME:latest
+            docker push $IMAGE_NAME:$IMAGE_TAG
+
+workflows:
+  version: 2
+  build-stable:
+    jobs:
+      - build:
+          filters:
+            branches:
+              only: stable
+      - publish-latest:
+          requires:
+            - build
+          filters:
+            branches:
+              only: stable
+  build-tags:
+    jobs:
+      - build:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+      - publish-tag:
+          requires:
+            - build
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+  monthly:
+    triggers:
+      - schedule:
+          cron: "0 0 1 * *"
+          filters:
+            branches:
+              only: stable
+    jobs:
+      - build:
+          filters:
+            branches:
+              only: stable
+      - publish-latest:
+          requires:
+            - build
+          filters:
+            branches:
+              only: stable
diff --git a/dangerzone-converter/Dockerfile b/dangerzone-converter/Dockerfile
new file mode 100644
index 0000000..240c98b
--- /dev/null
+++ b/dangerzone-converter/Dockerfile
@@ -0,0 +1,100 @@
+FROM alpine:latest
+
+# Install dependencies
+RUN apk -U upgrade && \
+    apk add \
+    ghostscript \
+    graphicsmagick \
+    libreoffice \
+    openjdk8 \
+    poppler-utils \
+    py3-magic \
+    py3-pillow \
+    sudo \
+    tesseract-ocr \
+    tesseract-ocr-data-afr \
+    tesseract-ocr-data-ara \
+    tesseract-ocr-data-aze \
+    tesseract-ocr-data-bel \
+    tesseract-ocr-data-ben \
+    tesseract-ocr-data-bul \
+    tesseract-ocr-data-cat \
+    tesseract-ocr-data-ces \
+    tesseract-ocr-data-chi_sim \
+    tesseract-ocr-data-chi_tra \
+    tesseract-ocr-data-chr \
+    tesseract-ocr-data-dan \
+    tesseract-ocr-data-deu \
+    tesseract-ocr-data-ell \
+    tesseract-ocr-data-enm \
+    tesseract-ocr-data-epo \
+    tesseract-ocr-data-equ \
+    tesseract-ocr-data-est \
+    tesseract-ocr-data-eus \
+    tesseract-ocr-data-fin \
+    tesseract-ocr-data-fra \
+    tesseract-ocr-data-frk \
+    tesseract-ocr-data-frm \
+    tesseract-ocr-data-glg \
+    tesseract-ocr-data-grc \
+    tesseract-ocr-data-heb \
+    tesseract-ocr-data-hin \
+    tesseract-ocr-data-hrv \
+    tesseract-ocr-data-hun \
+    tesseract-ocr-data-ind \
+    tesseract-ocr-data-isl \
+    tesseract-ocr-data-ita \
+    tesseract-ocr-data-ita_old \
+    tesseract-ocr-data-jpn \
+    tesseract-ocr-data-kan \
+    tesseract-ocr-data-kat \
+    tesseract-ocr-data-kor \
+    tesseract-ocr-data-lav \
+    tesseract-ocr-data-lit \
+    tesseract-ocr-data-mal \
+    tesseract-ocr-data-mkd \
+    tesseract-ocr-data-mlt \
+    tesseract-ocr-data-msa \
+    tesseract-ocr-data-nld \
+    tesseract-ocr-data-nor \
+    tesseract-ocr-data-pol \
+    tesseract-ocr-data-por \
+    tesseract-ocr-data-ron \
+    tesseract-ocr-data-rus \
+    tesseract-ocr-data-slk \
+    tesseract-ocr-data-slv \
+    tesseract-ocr-data-spa \
+    tesseract-ocr-data-spa_old \
+    tesseract-ocr-data-sqi \
+    tesseract-ocr-data-srp \
+    tesseract-ocr-data-swa \
+    tesseract-ocr-data-swe \
+    tesseract-ocr-data-tam \
+    tesseract-ocr-data-tel \
+    tesseract-ocr-data-tgl \
+    tesseract-ocr-data-tha \
+    tesseract-ocr-data-tur \
+    tesseract-ocr-data-ukr \
+    tesseract-ocr-data-vie
+
+# Install pdftk
+RUN \
+    wget https://gitlab.com/pdftk-java/pdftk/-/jobs/924565145/artifacts/raw/build/libs/pdftk-all.jar && \
+    mv pdftk-all.jar /usr/local/bin && \
+    chmod +x /usr/local/bin/pdftk-all.jar && \
+    echo '#!/bin/sh' > /usr/local/bin/pdftk && \
+    echo '/usr/bin/java -jar "/usr/local/bin/pdftk-all.jar" "$@"' >> /usr/local/bin/pdftk && \
+    chmod +x /usr/local/bin/pdftk
+
+COPY scripts/* /usr/local/bin/
+
+# Add the unprivileged user
+RUN adduser -h /home/user -s /bin/sh -D user
+
+# /tmp/input_file is where the first convert expects the input file to be, and
+# /tmp where it will write the pixel files
+#
+# /dangerzone is where the second script expects files to be put by the first one
+#
+# /safezone is where the wrapper eventually moves the sanitized files.
+VOLUME /dangerzone /tmp/input_file /safezone
diff --git a/dangerzone-converter/LICENSE b/dangerzone-converter/LICENSE
new file mode 100644
index 0000000..ca63b31
--- /dev/null
+++ b/dangerzone-converter/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 First Look Media
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/dangerzone-converter/README.md b/dangerzone-converter/README.md
new file mode 100644
index 0000000..61f2978
--- /dev/null
+++ b/dangerzone-converter/README.md
@@ -0,0 +1,9 @@
+# dangerzone-converter
+
+This is the container for [dangerzone](https://github.com/firstlookmedia/dangerzone), which converts a potentially dangerous PDF, office document, or image into a safe PDF.
+
+## Development notes
+
+Issues are tracked [here](https://github.com/firstlookmedia/dangerzone/issues?q=is%3Aissue+is%3Aopen+label%3Acontainer), in the dangerzone repository using the `container` label.
+
+Containers are built in continuous integration when commits are pushed to the `stable` branch.
\ No newline at end of file
diff --git a/dangerzone-converter/scripts/document-to-pixels b/dangerzone-converter/scripts/document-to-pixels
new file mode 100755
index 0000000..864f5ff
--- /dev/null
+++ b/dangerzone-converter/scripts/document-to-pixels
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+# Remove this warning by setting the host in /etc/hosts:
+# sudo: unable to resolve host 8160b021d811: Temporary failure in name resolution
+echo 127.0.0.1 $(hostname) >> /etc/hosts
+
+# Record original permissions, and make document readable
+START_PERMISSIONS=$(stat /tmp/input_file | grep Access | grep Uid | cut -d"(" -f2 |cut -d"/" -f1)
+/bin/chmod 0644 /tmp/input_file
+
+# Do the conversion without root
+/usr/bin/sudo -u user /usr/local/bin/document-to-pixels-unpriv
+RETURN_CODE=$?
+
+# Restore original permissions
+/bin/chmod $START_PERMISSIONS /tmp/input_file
+
+# Check for failure
+if [ $RETURN_CODE -ne 0 ]; then
+    echo ""
+    exit $RETURN_CODE
+fi
+
+# Move converted files into /dangerzone
+/bin/mv /tmp/page-*.rgb /dangerzone
+/bin/mv /tmp/page-*.width /dangerzone
+/bin/mv /tmp/page-*.height /dangerzone
diff --git a/dangerzone-converter/scripts/document-to-pixels-unpriv b/dangerzone-converter/scripts/document-to-pixels-unpriv
new file mode 100755
index 0000000..d51a9ac
--- /dev/null
+++ b/dangerzone-converter/scripts/document-to-pixels-unpriv
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+"""Convert the document at /tmp/input_file into per-page RGB pixel data.
+
+The input is converted to PDF when necessary and split into single-page PDFs;
+each page is then written to /tmp as page-N.rgb, page-N.width and
+page-N.height. Messages are printed to stdout and any failure exits with
+status 1.
+"""
+import sys
+import subprocess
+import glob
+import shutil
+import os
+
+import magic
+from PIL import Image
+
+# Seconds an external tool may run before it is treated as hung
+TIMEOUT = 60
+
+# Supported MIME types and how each one is converted to PDF
+CONVERSIONS = {
+    # .pdf
+    "application/pdf": {"type": None},
+    # .docx
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "writer_pdf_Export",
+    },
+    # .doc
+    "application/msword": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "writer_pdf_Export",
+    },
+    # .docm
+    "application/vnd.ms-word.document.macroEnabled.12": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "writer_pdf_Export",
+    },
+    # .xlsx
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "calc_pdf_Export",
+    },
+    # .xls
+    "application/vnd.ms-excel": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "calc_pdf_Export",
+    },
+    # .pptx
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "impress_pdf_Export",
+    },
+    # .ppt
+    "application/vnd.ms-powerpoint": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "impress_pdf_Export",
+    },
+    # .odt
+    "application/vnd.oasis.opendocument.text": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "writer_pdf_Export",
+    },
+    # .odg
+    "application/vnd.oasis.opendocument.graphics": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "impress_pdf_Export",
+    },
+    # .odp
+    "application/vnd.oasis.opendocument.presentation": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "impress_pdf_Export",
+    },
+    # .ods
+    "application/vnd.oasis.opendocument.spreadsheet": {
+        "type": "libreoffice",
+        "libreoffice_output_filter": "calc_pdf_Export",
+    },
+    # .jpg
+    "image/jpeg": {"type": "convert"},
+    # .gif
+    "image/gif": {"type": "convert"},
+    # .png
+    "image/png": {"type": "convert"},
+    # .tif
+    "image/tiff": {"type": "convert"},
+    "image/x-tiff": {"type": "convert"},
+}
+
+
+def print_flush(s):
+    """Print s and flush stdout so output is not held back by buffering."""
+    print(s)
+    sys.stdout.flush()
+
+
+def run_or_exit(args, action, tool, failure_message):
+    """Run an external command, exiting with status 1 if it hangs or fails.
+
+    args is the argv list to run. action and tool fill in the message printed
+    when the command exceeds TIMEOUT seconds; failure_message prefixes the
+    message printed when it exits with a non-zero status.
+    """
+    try:
+        p = subprocess.run(args, timeout=TIMEOUT)
+    except subprocess.TimeoutExpired:
+        print_flush(f"Error {action}, {tool} timed out after {TIMEOUT} seconds")
+        sys.exit(1)
+    if p.returncode != 0:
+        # The command's output is not captured, so p.stdout is always None;
+        # report the exit status instead of printing "None".
+        print_flush(f"{failure_message}: exit status {p.returncode}")
+        sys.exit(1)
+
+
+def convert_to_pdf(conversion):
+    """Convert /tmp/input_file to PDF and return the path of the PDF.
+
+    conversion is an entry of CONVERSIONS. A file that is already a PDF is
+    returned as is.
+    """
+    if conversion["type"] is None:
+        return "/tmp/input_file"
+
+    if conversion["type"] == "libreoffice":
+        print_flush("Converting to PDF using LibreOffice")
+        args = [
+            "libreoffice",
+            "--headless",
+            "--convert-to",
+            f"pdf:{conversion['libreoffice_output_filter']}",
+            "--outdir",
+            "/tmp",
+            "/tmp/input_file",
+        ]
+        tool = "LibreOffice"
+    elif conversion["type"] == "convert":
+        print_flush("Converting to PDF using GraphicsMagick")
+        args = ["gm", "convert", "/tmp/input_file", "/tmp/input_file.pdf"]
+        tool = "GraphicsMagick"
+    else:
+        print_flush("Invalid conversion type")
+        sys.exit(1)
+
+    run_or_exit(args, "converting document to PDF", tool, "Conversion to PDF failed")
+    return "/tmp/input_file.pdf"
+
+
+def page_to_pixels(page):
+    """Convert /tmp/page-N.pdf (N = page) to .rgb, .width and .height files."""
+    filename_base = f"/tmp/page-{page}"
+    pdf_filename = f"{filename_base}.pdf"
+    png_filename = f"{filename_base}.png"
+    rgb_filename = f"{filename_base}.rgb"
+    width_filename = f"{filename_base}.width"
+    height_filename = f"{filename_base}.height"
+
+    print_flush(f"Converting page {page} to pixels")
+
+    # Convert to png
+    run_or_exit(
+        ["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
+        "converting from PDF to PNG",
+        "pdftocairo",
+        "Conversion from PDF to PNG failed",
+    )
+
+    # Save the width and height, closing the image before it is deleted
+    with Image.open(png_filename) as im:
+        width, height = im.size
+    with open(width_filename, "w") as f:
+        f.write(str(width))
+    with open(height_filename, "w") as f:
+        f.write(str(height))
+
+    # Convert to RGB pixels
+    run_or_exit(
+        ["gm", "convert", png_filename, "-depth", "8", f"rgb:{rgb_filename}"],
+        "converting from PNG to pixels",
+        "convert",
+        "Conversion from PNG to RGB failed",
+    )
+
+    # Delete the png
+    os.remove(png_filename)
+
+
+def main():
+    """Detect the type of /tmp/input_file and convert it to pixel data."""
+    # Detect MIME type
+    mime = magic.Magic(mime=True)
+    mime_type = mime.from_file("/tmp/input_file")
+
+    # Validate MIME type
+    if mime_type not in CONVERSIONS:
+        print_flush("The document format is not supported")
+        sys.exit(1)
+
+    # Convert input document to PDF
+    pdf_filename = convert_to_pdf(CONVERSIONS[mime_type])
+
+    # Separate PDF into pages
+    print_flush("")
+    print_flush("Separating document into pages")
+    run_or_exit(
+        ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"],
+        "separating document into pages",
+        "pdftk",
+        "Separating document into pages failed",
+    )
+
+    page_filenames = glob.glob("/tmp/page-*.pdf")
+    print_flush(f"Document has {len(page_filenames)} pages")
+    print_flush("")
+
+    # Convert to RGB pixel data
+    for page in range(1, len(page_filenames) + 1):
+        page_to_pixels(page)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dangerzone-converter/scripts/pixels-to-pdf b/dangerzone-converter/scripts/pixels-to-pdf
new file mode 100755
index 0000000..8af2ec2
--- /dev/null
+++ b/dangerzone-converter/scripts/pixels-to-pdf
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+# Remove this warning by setting the host in /etc/hosts:
+# sudo: unable to resolve host 8160b021d811: Temporary failure in name resolution
+echo 127.0.0.1 $(hostname) >> /etc/hosts
+
+# Do the conversion without root
+/usr/bin/sudo OCR="$OCR" OCR_LANGUAGE="$OCR_LANGUAGE" -u user /usr/local/bin/pixels-to-pdf-unpriv
+RETURN_CODE=$?
+if [ $RETURN_CODE -ne 0 ]; then
+    echo ""
+    exit $RETURN_CODE
+fi
+
+# Move converted files into /safezone
+/bin/mv /tmp/safe-output.pdf /safezone
+/bin/mv /tmp/safe-output-compressed.pdf /safezone
diff --git a/dangerzone-converter/scripts/pixels-to-pdf-unpriv b/dangerzone-converter/scripts/pixels-to-pdf-unpriv
new file mode 100755
index 0000000..876e6d6
--- /dev/null
+++ b/dangerzone-converter/scripts/pixels-to-pdf-unpriv
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""Rebuild a PDF from the page-N.rgb, .width and .height files in /dangerzone.
+
+Each page is converted to a PDF (through tesseract when the OCR environment
+variable is "1"), the pages are merged into /tmp/safe-output.pdf, and a
+compressed copy is written to /tmp/safe-output-compressed.pdf. Any failure
+exits with status 1.
+"""
+import glob
+import os
+import sys
+import subprocess
+
+# Seconds an external tool may run before it is treated as hung
+TIMEOUT = 60
+
+
+def print_flush(s=""):
+    """Print s and flush stdout so output is not held back by buffering."""
+    print(s)
+    sys.stdout.flush()
+
+
+def run_or_exit(args, action, tool, failure_message, timeout=TIMEOUT):
+    """Run an external command, exiting with status 1 if it hangs or fails.
+
+    args is the argv list to run. action and tool fill in the message printed
+    when the command exceeds timeout seconds; failure_message prefixes the
+    message printed when it exits with a non-zero status.
+    """
+    try:
+        p = subprocess.run(args, timeout=timeout)
+    except subprocess.TimeoutExpired:
+        print_flush(f"Error {action}, {tool} timed out after {timeout} seconds")
+        sys.exit(1)
+    if p.returncode != 0:
+        # The command's output is not captured, so p.stdout is always None;
+        # report the exit status instead of printing "None".
+        print_flush(f"{failure_message}: exit status {p.returncode}")
+        sys.exit(1)
+
+
+def page_to_pdf(page, ocr_language):
+    """Convert /dangerzone/page-N.rgb (N = page) into /tmp/page-N.pdf.
+
+    When ocr_language is not None the page goes through tesseract so the PDF is
+    searchable; otherwise GraphicsMagick writes the PDF directly.
+    """
+    filename_base = f"/dangerzone/page-{page}"
+    rgb_filename = f"{filename_base}.rgb"
+    width_filename = f"{filename_base}.width"
+    height_filename = f"{filename_base}.height"
+    png_filename = f"/tmp/page-{page}.png"
+    ocr_filename = f"/tmp/page-{page}"
+    pdf_filename = f"/tmp/page-{page}.pdf"
+
+    with open(width_filename) as f:
+        width = f.read().strip()
+    with open(height_filename) as f:
+        height = f.read().strip()
+
+    # The dimension files are produced from a potentially dangerous document,
+    # so only accept integers before they reach the gm command line
+    try:
+        size = f"{int(width)}x{int(height)}"
+    except ValueError:
+        print_flush(f"Page {page} has invalid dimensions")
+        sys.exit(1)
+
+    # Raw RGB data has no header, so the size and depth are passed explicitly
+    gm_convert = [
+        "gm",
+        "convert",
+        "-size",
+        size,
+        "-depth",
+        "8",
+        f"rgb:{rgb_filename}",
+    ]
+
+    if ocr_language is not None:
+        # OCR the document
+        print_flush(f"Converting page {page} from pixels to searchable PDF")
+        run_or_exit(
+            gm_convert + [f"png:{png_filename}"],
+            "converting pixels to PNG",
+            "convert",
+            f"Page {page} conversion failed",
+        )
+        run_or_exit(
+            [
+                "tesseract",
+                png_filename,
+                ocr_filename,
+                "-l",
+                ocr_language,
+                "--dpi",
+                "70",
+                "pdf",
+            ],
+            "converting PNG to searchable PDF",
+            "tesseract",
+            f"Page {page} conversion failed",
+        )
+    else:
+        # Don't OCR
+        print_flush(f"Converting page {page} from pixels to PDF")
+        run_or_exit(
+            gm_convert + [f"pdf:{pdf_filename}"],
+            "converting RGB to PDF",
+            "convert",
+            f"Page {page} conversion failed",
+        )
+
+
+def main():
+    """Convert every page to PDF, merge the pages and compress the result."""
+    num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
+    print_flush(f"Document has {num_pages} pages")
+    if num_pages == 0:
+        print_flush("No pages to convert")
+        sys.exit(1)
+
+    ocr_language = None
+    if os.environ.get("OCR") == "1":
+        ocr_language = os.environ.get("OCR_LANGUAGE")
+        if not ocr_language:
+            # A missing language would reach subprocess.run as None and raise
+            print_flush("OCR is enabled but OCR_LANGUAGE is not set")
+            sys.exit(1)
+
+    # Convert RGB files to PDF files
+    for page in range(1, num_pages + 1):
+        page_to_pdf(page, ocr_language)
+
+    print_flush()
+
+    # Merge pages into a single PDF
+    print_flush(f"Merging {num_pages} pages into a single PDF")
+    args = ["pdfunite"]
+    args.extend(f"/tmp/page-{page}.pdf" for page in range(1, num_pages + 1))
+    args.append("/tmp/safe-output.pdf")
+    run_or_exit(args, "merging pages into a single PDF", "pdfunite", "Merge failed")
+
+    # Compress
+    print_flush("Compressing PDF")
+    run_or_exit(
+        ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
+        "compressing PDF",
+        "ps2pdf",
+        "Compression failed",
+        timeout=num_pages * 3,
+    )
+
+
+if __name__ == "__main__":
+    main()