Move dangerzone-converter code into dangerzone repo

This commit is contained in:
Micah Lee 2021-06-28 16:31:11 -07:00
parent f4ff04ff80
commit 27bdcd408b
No known key found for this signature in database
GPG key ID: 403C2657CD994F73
8 changed files with 640 additions and 0 deletions

View file

@ -0,0 +1,108 @@
version: 2.1

# Shared executor used by every job in this config: a Python convenience image
# with IMAGE_NAME exported so each step refers to the same Docker Hub repository.
executors:
  docker-publisher:
    docker:
      - image: circleci/python
    environment:
      IMAGE_NAME: flmcode/dangerzone
jobs:
  # Build the container image and pass it to the publish jobs as a tarball
  # through the workspace.
  build:
    executor: docker-publisher
    steps:
      - checkout
      - setup_remote_docker
      - run:
          name: Build docker image
          command: docker build -t "$IMAGE_NAME:latest" .
      - run:
          name: Archive docker image
          command: docker save -o image.tar "$IMAGE_NAME"
      - persist_to_workspace:
          root: .
          paths:
            - ./image.tar

  # Load the image built by `build` and push it to Docker Hub as `latest`.
  publish-latest:
    executor: docker-publisher
    steps:
      - attach_workspace:
          at: /tmp/workspace
      - setup_remote_docker
      - run:
          name: Load archived docker image
          command: docker load -i /tmp/workspace/image.tar
      - run:
          name: Publish image to docker Hub
          # Credentials are quoted so that spaces or glob characters in the
          # password/username are passed through unchanged.
          command: |
            echo "$DOCKERHUB_PASSWORD" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
            docker push "$IMAGE_NAME:latest"

  # Load the image built by `build` and push it both as `latest` and under the
  # version derived from the git tag (tag `v1.2.3` -> image tag `1.2.3`).
  publish-tag:
    executor: docker-publisher
    steps:
      - attach_workspace:
          at: /tmp/workspace
      - setup_remote_docker
      - run:
          name: Load archived docker image
          command: docker load -i /tmp/workspace/image.tar
      - run:
          name: Publish image to docker Hub
          # ${CIRCLE_TAG#v} strips only a leading "v"; a "v" elsewhere in the
          # tag is left alone.
          command: |
            echo "$DOCKERHUB_PASSWORD" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
            IMAGE_TAG="${CIRCLE_TAG#v}"
            docker tag "$IMAGE_NAME:latest" "$IMAGE_NAME:$IMAGE_TAG"
            docker push "$IMAGE_NAME:latest"
            docker push "$IMAGE_NAME:$IMAGE_TAG"
workflows:
  version: 2

  # Every push to `stable`: build the image, then publish it as `latest`.
  build-stable:
    jobs:
      - build:
          filters:
            branches:
              only: stable
      - publish-latest:
          requires:
            - build
          filters:
            branches:
              only: stable

  # Every `v*` git tag (and no branch push): build, then publish under the
  # version tag as well as `latest`.
  build-tags:
    jobs:
      - build:
          filters:
            tags:
              only: /^v.*/
            branches:
              ignore: /.*/
      - publish-tag:
          requires:
            - build
          filters:
            tags:
              only: /^v.*/
            branches:
              ignore: /.*/

  # Scheduled rebuild of `stable` at 00:00 on the first day of each month
  # (CircleCI evaluates cron schedules in UTC), republished as `latest`.
  monthly:
    triggers:
      - schedule:
          cron: "0 0 1 * *"
          filters:
            branches:
              only: stable
    jobs:
      - build:
          filters:
            branches:
              only: stable
      - publish-latest:
          requires:
            - build
          filters:
            branches:
              only: stable

View file

@ -0,0 +1,100 @@
# NOTE(review): the base image floats on `latest`, so rebuilds are not
# reproducible. Consider pinning a release tag (or digest) and bumping it
# deliberately — confirm this fits the rebuild policy before changing.
FROM alpine:latest

# Install dependencies.
# `--no-cache` makes apk fetch the package index on the fly and keep nothing
# under /var/cache/apk, so the index is not baked into the image layer.
RUN apk --no-cache upgrade && \
    apk add --no-cache \
    ghostscript \
    graphicsmagick \
    libreoffice \
    openjdk8 \
    poppler-utils \
    py3-magic \
    py3-pillow \
    sudo \
    tesseract-ocr \
    tesseract-ocr-data-afr \
    tesseract-ocr-data-ara \
    tesseract-ocr-data-aze \
    tesseract-ocr-data-bel \
    tesseract-ocr-data-ben \
    tesseract-ocr-data-bul \
    tesseract-ocr-data-cat \
    tesseract-ocr-data-ces \
    tesseract-ocr-data-chi_sim \
    tesseract-ocr-data-chi_tra \
    tesseract-ocr-data-chr \
    tesseract-ocr-data-dan \
    tesseract-ocr-data-deu \
    tesseract-ocr-data-ell \
    tesseract-ocr-data-enm \
    tesseract-ocr-data-epo \
    tesseract-ocr-data-equ \
    tesseract-ocr-data-est \
    tesseract-ocr-data-eus \
    tesseract-ocr-data-fin \
    tesseract-ocr-data-fra \
    tesseract-ocr-data-frk \
    tesseract-ocr-data-frm \
    tesseract-ocr-data-glg \
    tesseract-ocr-data-grc \
    tesseract-ocr-data-heb \
    tesseract-ocr-data-hin \
    tesseract-ocr-data-hrv \
    tesseract-ocr-data-hun \
    tesseract-ocr-data-ind \
    tesseract-ocr-data-isl \
    tesseract-ocr-data-ita \
    tesseract-ocr-data-ita_old \
    tesseract-ocr-data-jpn \
    tesseract-ocr-data-kan \
    tesseract-ocr-data-kat \
    tesseract-ocr-data-kor \
    tesseract-ocr-data-lav \
    tesseract-ocr-data-lit \
    tesseract-ocr-data-mal \
    tesseract-ocr-data-mkd \
    tesseract-ocr-data-mlt \
    tesseract-ocr-data-msa \
    tesseract-ocr-data-nld \
    tesseract-ocr-data-nor \
    tesseract-ocr-data-pol \
    tesseract-ocr-data-por \
    tesseract-ocr-data-ron \
    tesseract-ocr-data-rus \
    tesseract-ocr-data-slk \
    tesseract-ocr-data-slv \
    tesseract-ocr-data-spa \
    tesseract-ocr-data-spa_old \
    tesseract-ocr-data-sqi \
    tesseract-ocr-data-srp \
    tesseract-ocr-data-swa \
    tesseract-ocr-data-swe \
    tesseract-ocr-data-tam \
    tesseract-ocr-data-tel \
    tesseract-ocr-data-tgl \
    tesseract-ocr-data-tha \
    tesseract-ocr-data-tur \
    tesseract-ocr-data-ukr \
    tesseract-ocr-data-vie
# Install pdftk.
# The jar is downloaded into /usr/local/bin and exposed as a `pdftk` command
# through a small /bin/sh wrapper. The wrapper `exec`s java so that the JVM
# replaces the shell process: a caller that kills `pdftk` (for example after a
# timeout) then terminates the JVM itself instead of leaving it orphaned.
# NOTE(review): the jar comes from a CI job artifact URL and is not verified
# against a checksum; consider pinning a sha256 and checking it here.
RUN \
    wget https://gitlab.com/pdftk-java/pdftk/-/jobs/924565145/artifacts/raw/build/libs/pdftk-all.jar && \
    mv pdftk-all.jar /usr/local/bin && \
    chmod +x /usr/local/bin/pdftk-all.jar && \
    echo '#!/bin/sh' > /usr/local/bin/pdftk && \
    echo 'exec /usr/bin/java -jar "/usr/local/bin/pdftk-all.jar" "$@"' >> /usr/local/bin/pdftk && \
    chmod +x /usr/local/bin/pdftk
# Install everything matched by scripts/* into /usr/local/bin, which is where
# the wrapper scripts look for the *-unpriv converters they launch.
COPY scripts/* /usr/local/bin/
# Add the unprivileged user (home /home/user, shell /bin/sh, -D: no password).
# The wrapper scripts run the actual conversions as this user via
# `sudo -u user`.
RUN adduser -h /home/user -s /bin/sh -D user
# /tmp/input_file is where the first conversion step expects the input
# document to be; it writes the per-page pixel files into /tmp.
#
# /dangerzone is where the first step's wrapper moves those pixel files, and
# where the second step expects to find them.
#
# /safezone is where the second step's wrapper moves the sanitized PDFs.
VOLUME /dangerzone /tmp/input_file /safezone

View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 First Look Media
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,9 @@
# dangerzone-converter
This is the container for [dangerzone](https://github.com/firstlookmedia/dangerzone), which converts a potentially dangerous PDF, office document, or image into a safe PDF.
## Development notes
Issues are tracked [here](https://github.com/firstlookmedia/dangerzone/issues?q=is%3Aissue+is%3Aopen+label%3Acontainer), in the dangerzone repository using the `container` label.
Containers are built in continuous integration when commits are pushed to the `stable` branch.

View file

@ -0,0 +1,27 @@
#!/bin/sh
# Privileged wrapper for the first conversion step.
#
# Makes /tmp/input_file readable, runs document-to-pixels-unpriv as the
# unprivileged `user`, restores the input file's original mode, and on success
# moves the generated page files from /tmp into /dangerzone. On failure it
# exits with the converter's exit code.

# Remove this warning by setting the host in /etc/hosts:
# sudo: unable to resolve host 8160b021d811: Temporary failure in name resolution
echo "127.0.0.1 $(hostname)" >> /etc/hosts

# Record original permissions, and make document readable.
# `stat -c '%a'` prints the octal mode directly, so nothing depends on the
# layout of stat's human-readable output.
START_PERMISSIONS=$(stat -c '%a' /tmp/input_file)
/bin/chmod 0644 /tmp/input_file

# Do the conversion without root
/usr/bin/sudo -u user /usr/local/bin/document-to-pixels-unpriv
RETURN_CODE=$?

# Restore original permissions (done before the failure check so the mode is
# restored on both the success and the failure path)
/bin/chmod "$START_PERMISSIONS" /tmp/input_file

# Check for failure
if [ "$RETURN_CODE" -ne 0 ]; then
    echo ""
    exit "$RETURN_CODE"
fi

# Move converted files into /dangerzone
/bin/mv /tmp/page-*.rgb /dangerzone
/bin/mv /tmp/page-*.width /dangerzone
/bin/mv /tmp/page-*.height /dangerzone

View file

@ -0,0 +1,219 @@
#!/usr/bin/env python3
import sys
import subprocess
import glob
import shutil
import os
import magic
from PIL import Image
def print_flush(s):
    """Print ``s`` followed by a newline and flush stdout immediately."""
    print(s, flush=True)
def _run_or_exit(args, timeout_message, failure_message, timeout=60):
    """Run an external command and abort the script if it does not succeed.

    Args:
        args: Command line as a list of strings, passed to ``subprocess.run``.
        timeout_message: Text printed if the command runs longer than ``timeout``.
        failure_message: Prefix printed if the command exits non-zero; the
            exit code is appended to it.
        timeout: Number of seconds to wait for the command.

    Exits the process with status 1 on timeout or on a non-zero exit code.
    """
    try:
        p = subprocess.run(args, timeout=timeout)
    except subprocess.TimeoutExpired:
        print_flush(timeout_message)
        sys.exit(1)

    if p.returncode != 0:
        # The command's output is not captured, so p.stdout is always None
        # here; report the exit code instead of printing "None".
        print_flush(f"{failure_message}: exit code {p.returncode}")
        sys.exit(1)


def main():
    """Convert /tmp/input_file into raw RGB pixel data, one set of files per page.

    Steps:
      1. Detect the MIME type of /tmp/input_file and reject unsupported types.
      2. Unless the input already is a PDF, convert it to /tmp/input_file.pdf
         using LibreOffice (office documents) or GraphicsMagick (images).
      3. Split the PDF into /tmp/page-N.pdf files with pdftk.
      4. For each page, render a PNG with pdftocairo, write its dimensions to
         /tmp/page-N.width and /tmp/page-N.height, convert the PNG to 8-bit
         raw RGB in /tmp/page-N.rgb, and delete the PNG.

    Exits with status 1 if the format is unsupported or any external tool
    times out (60 seconds each) or fails.
    """
    # Supported MIME types and how each is turned into a PDF. "type" is None
    # when the input already is a PDF.
    conversions = {
        # .pdf
        "application/pdf": {"type": None},
        # .docx
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .doc
        "application/msword": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .docm
        "application/vnd.ms-word.document.macroEnabled.12": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .xlsx
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .xls
        "application/vnd.ms-excel": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .pptx
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .ppt
        "application/vnd.ms-powerpoint": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .odt
        "application/vnd.oasis.opendocument.text": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .odg
        "application/vnd.oasis.opendocument.graphics": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .odp
        "application/vnd.oasis.opendocument.presentation": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .ods
        "application/vnd.oasis.opendocument.spreadsheet": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .jpg
        "image/jpeg": {"type": "convert"},
        # .gif
        "image/gif": {"type": "convert"},
        # .png
        "image/png": {"type": "convert"},
        # .tif
        "image/tiff": {"type": "convert"},
        "image/x-tiff": {"type": "convert"},
    }

    # Detect MIME type
    mime = magic.Magic(mime=True)
    mime_type = mime.from_file("/tmp/input_file")

    # Validate MIME type
    if mime_type not in conversions:
        print_flush("The document format is not supported")
        sys.exit(1)

    # Convert input document to PDF
    conversion = conversions[mime_type]
    if conversion["type"] is None:
        pdf_filename = "/tmp/input_file"
    elif conversion["type"] == "libreoffice":
        print_flush("Converting to PDF using LibreOffice")
        _run_or_exit(
            [
                "libreoffice",
                "--headless",
                "--convert-to",
                f"pdf:{conversion['libreoffice_output_filter']}",
                "--outdir",
                "/tmp",
                "/tmp/input_file",
            ],
            "Error converting document to PDF, LibreOffice timed out after 60 seconds",
            "Conversion to PDF failed",
        )
        pdf_filename = "/tmp/input_file.pdf"
    elif conversion["type"] == "convert":
        print_flush("Converting to PDF using GraphicsMagick")
        _run_or_exit(
            ["gm", "convert", "/tmp/input_file", "/tmp/input_file.pdf"],
            "Error converting document to PDF, GraphicsMagick timed out after 60 seconds",
            "Conversion to PDF failed",
        )
        pdf_filename = "/tmp/input_file.pdf"
    else:
        print_flush("Invalid conversion type")
        sys.exit(1)

    # Separate PDF into pages
    print_flush("")
    print_flush("Separating document into pages")
    _run_or_exit(
        ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"],
        "Error separating document into pages, pdftk timed out after 60 seconds",
        "Separating document into pages failed",
    )

    page_filenames = glob.glob("/tmp/page-*.pdf")
    print_flush(f"Document has {len(page_filenames)} pages")
    print_flush("")

    # Convert to RGB pixel data. Pages are addressed by number rather than by
    # iterating over the glob result, so they are processed in page order.
    for page in range(1, len(page_filenames) + 1):
        filename_base = f"/tmp/page-{page}"
        page_pdf_filename = f"{filename_base}.pdf"
        png_filename = f"{filename_base}.png"
        rgb_filename = f"{filename_base}.rgb"
        width_filename = f"{filename_base}.width"
        height_filename = f"{filename_base}.height"

        print_flush(f"Converting page {page} to pixels")

        # Convert to png; with -singlefile, pdftocairo writes <filename_base>.png
        _run_or_exit(
            ["pdftocairo", page_pdf_filename, "-png", "-singlefile", filename_base],
            "Error converting from PDF to PNG, pdftocairo timed out after 60 seconds",
            "Conversion from PDF to PNG failed",
        )

        # Save the width and height; the context manager closes the image file
        with Image.open(png_filename) as im:
            width, height = im.size
        with open(width_filename, "w") as f:
            f.write(str(width))
        with open(height_filename, "w") as f:
            f.write(str(height))

        # Convert to RGB pixels
        _run_or_exit(
            ["gm", "convert", png_filename, "-depth", "8", f"rgb:{rgb_filename}"],
            "Error converting from PNG to pixels, convert timed out after 60 seconds",
            "Conversion from PNG to RGB failed",
        )

        # Delete the png
        os.remove(png_filename)
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,17 @@
#!/bin/sh
# Privileged wrapper for the second conversion step.
#
# Runs pixels-to-pdf-unpriv as the unprivileged `user`, passing the OCR and
# OCR_LANGUAGE settings through sudo, and on success moves the resulting PDFs
# from /tmp into /safezone. On failure it exits with the converter's exit code.

# Remove this warning by setting the host in /etc/hosts:
# sudo: unable to resolve host 8160b021d811: Temporary failure in name resolution
echo "127.0.0.1 $(hostname)" >> /etc/hosts

# Do the conversion without root.
# The values are quoted so that each VAR=value stays a single argument even if
# a value is empty or contains whitespace.
/usr/bin/sudo OCR="$OCR" OCR_LANGUAGE="$OCR_LANGUAGE" -u user /usr/local/bin/pixels-to-pdf-unpriv
RETURN_CODE=$?
if [ "$RETURN_CODE" -ne 0 ]; then
    echo ""
    exit "$RETURN_CODE"
fi

# Move converted files into /safezone
/bin/mv /tmp/safe-output.pdf /safezone
/bin/mv /tmp/safe-output-compressed.pdf /safezone

View file

@ -0,0 +1,139 @@
#!/usr/bin/env python3
import glob
import os
import sys
import subprocess
def print_flush(s=""):
    """Print ``s`` (an empty line by default) and flush stdout immediately."""
    print(s, flush=True)
def _run_or_exit(args, timeout_message, failure_message, timeout=60):
    """Run an external command and abort the script if it does not succeed.

    Args:
        args: Command line as a list of strings, passed to ``subprocess.run``.
        timeout_message: Text printed if the command runs longer than ``timeout``.
        failure_message: Prefix printed if the command exits non-zero; the
            exit code is appended to it.
        timeout: Number of seconds to wait for the command.

    Exits the process with status 1 on timeout or on a non-zero exit code.
    """
    try:
        p = subprocess.run(args, timeout=timeout)
    except subprocess.TimeoutExpired:
        print_flush(timeout_message)
        sys.exit(1)

    if p.returncode != 0:
        # The command's output is not captured, so p.stdout is always None
        # here; report the exit code instead of printing "None".
        print_flush(f"{failure_message}: exit code {p.returncode}")
        sys.exit(1)


def main():
    """Rebuild a PDF from the raw RGB page files found in /dangerzone.

    For every /dangerzone/page-N.rgb (with matching .width and .height files)
    a /tmp/page-N.pdf is produced: through a PNG and tesseract when the OCR
    environment variable is "1" (using the language in OCR_LANGUAGE), or
    directly with GraphicsMagick otherwise. The pages are then merged into
    /tmp/safe-output.pdf with pdfunite and compressed into
    /tmp/safe-output-compressed.pdf with ps2pdf.

    Exits with status 1 if there are no pages, if OCR is requested without
    OCR_LANGUAGE, or if any external tool times out or fails.
    """
    num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
    print_flush(f"Document has {num_pages} pages")

    # Without any page there is nothing to merge; fail with a clear message
    # rather than letting pdfunite reject an empty input list.
    if num_pages == 0:
        print_flush("No pages found to convert")
        sys.exit(1)

    # These settings do not change between pages, so read them once.
    ocr_enabled = os.environ.get("OCR") == "1"
    ocr_language = os.environ.get("OCR_LANGUAGE")
    if ocr_enabled and not ocr_language:
        # A missing language would otherwise put None into tesseract's
        # argument list and crash subprocess.run with a TypeError.
        print_flush("OCR is enabled but OCR_LANGUAGE is not set")
        sys.exit(1)

    # Convert RGB files to PDF files
    for page in range(1, num_pages + 1):
        filename_base = f"/dangerzone/page-{page}"
        rgb_filename = f"{filename_base}.rgb"
        width_filename = f"{filename_base}.width"
        height_filename = f"{filename_base}.height"
        png_filename = f"/tmp/page-{page}.png"
        ocr_filename = f"/tmp/page-{page}"
        pdf_filename = f"/tmp/page-{page}.pdf"

        # Raw RGB data carries no dimensions; they come from the side files.
        with open(width_filename) as f:
            width = f.read().strip()
        with open(height_filename) as f:
            height = f.read().strip()

        if ocr_enabled:
            # OCR the document
            print_flush(f"Converting page {page} from pixels to searchable PDF")

            _run_or_exit(
                [
                    "gm",
                    "convert",
                    "-size",
                    f"{width}x{height}",
                    "-depth",
                    "8",
                    f"rgb:{rgb_filename}",
                    f"png:{png_filename}",
                ],
                "Error converting pixels to PNG, convert timed out after 60 seconds",
                f"Page {page} conversion failed",
            )

            # tesseract takes an output base name; the trailing "pdf" argument
            # selects PDF output.
            _run_or_exit(
                [
                    "tesseract",
                    png_filename,
                    ocr_filename,
                    "-l",
                    ocr_language,
                    "--dpi",
                    "70",
                    "pdf",
                ],
                "Error converting PNG to searchable PDF, tesseract timed out after 60 seconds",
                f"Page {page} conversion failed",
            )
        else:
            # Don't OCR
            print_flush(f"Converting page {page} from pixels to PDF")

            _run_or_exit(
                [
                    "gm",
                    "convert",
                    "-size",
                    f"{width}x{height}",
                    "-depth",
                    "8",
                    f"rgb:{rgb_filename}",
                    f"pdf:{pdf_filename}",
                ],
                "Error converting RGB to PDF, convert timed out after 60 seconds",
                f"Page {page} conversion failed",
            )

    print_flush()

    # Merge pages into a single PDF (inputs in page order, output path last)
    print_flush(f"Merging {num_pages} pages into a single PDF")
    merge_args = ["pdfunite"]
    merge_args.extend(f"/tmp/page-{page}.pdf" for page in range(1, num_pages + 1))
    merge_args.append("/tmp/safe-output.pdf")
    _run_or_exit(
        merge_args,
        "Error merging pages into a single PDF, pdfunite timed out after 60 seconds",
        "Merge failed",
    )

    # Compress; the time budget scales with the number of pages
    print_flush("Compressing PDF")
    compress_timeout = num_pages * 3
    _run_or_exit(
        ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
        f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds",
        "Compression failed",
        timeout=compress_timeout,
    )
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()