From 3e1ad6376c0e95efd475e772bc27f39e1bcb390e Mon Sep 17 00:00:00 2001
From: Micah Lee
Date: Tue, 7 Jan 2020 10:22:11 -0800
Subject: [PATCH] Add support for converting docx to pdf using headless libreoffice

---
Review notes (free-form text in this position is ignored by git am):
- document-to-pixels: the LibreOffice failure message no longer interpolates
  $PAGE, which is not assigned anywhere before that line of the script. The
  message still contains "conversion failed", the string tasks.py looks for.
- document-to-pixels: $MIME_TYPE and $ORIGINAL_PDF are now quoted, and the
  conversion is also treated as failed when /tmp/input_file.pdf is missing.
- die() prints to stderr, while tasks.py scans the string returned by
  execute_podman() -- TODO confirm that string includes the container's stderr.
- The index line for document-to-pixels still carries the blob id from before
  these review fixes.

 dangerzone/tasks.py                |  6 ++++++
 share/container/Containerfile      |  2 +-
 share/container/document-to-pixels | 28 +++++++++++++++++++++++++---
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/dangerzone/tasks.py b/dangerzone/tasks.py
index 31ba048..378ba80 100644
--- a/dangerzone/tasks.py
+++ b/dangerzone/tasks.py
@@ -95,6 +95,12 @@ class ConvertToPixels(TaskBase):
         ]
         output = self.execute_podman(args)
 
+        # Did we hit an error?
+        for line in output.split("\n"):
+            if "conversion failed" in line or "The document format is not supported" in line:
+                self.task_failed.emit(output)
+                return
+
         # How many pages was that?
         num_pages = None
         for line in output.split("\n"):
diff --git a/share/container/Containerfile b/share/container/Containerfile
index eab9432..d36d79b 100644
--- a/share/container/Containerfile
+++ b/share/container/Containerfile
@@ -1,7 +1,7 @@
 FROM ubuntu:18.04
 
 RUN apt-get update && \
-    apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr
+    apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr libreoffice
 
 # TODO: when we support OCR in other languages, we need tesseract-ocr-all
 
diff --git a/share/container/document-to-pixels b/share/container/document-to-pixels
index 5154723..76d80d4 100755
--- a/share/container/document-to-pixels
+++ b/share/container/document-to-pixels
@@ -1,11 +1,33 @@
 #!/bin/bash
 
-IMG_DEPTH=8
+die() {
+    echo "$1" >&2
+    exit 1
+}
+
+# Detect the mime type
+MIME_TYPE=$(file -b --mime-type /tmp/input_file)
+echo "Document MIME type is $MIME_TYPE"
+echo
+
+# Use PDFs as-is; convert .docx to PDF with headless LibreOffice first
+if [ "$MIME_TYPE" = "application/pdf" ]; then
+    ORIGINAL_PDF=/tmp/input_file
+elif [ "$MIME_TYPE" = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ]; then
+    echo "Converting document to PDF"
+    CONVERT_MSGS=$(libreoffice --headless --convert-to pdf:writer_pdf_Export --outdir /tmp /tmp/input_file 2>&1)
+    if [ $? -ne 0 ] || [ ! -f /tmp/input_file.pdf ]; then
+        die "Document conversion failed: $CONVERT_MSGS"
+    fi
+    ORIGINAL_PDF=/tmp/input_file.pdf
+else
+    die "The document format is not supported"
+fi
 
 echo "Separating document into pages"
+pdfseparate "$ORIGINAL_PDF" /tmp/page-%d.pdf
 
-pdfseparate /tmp/input_file /tmp/page-%d.pdf
-
+IMG_DEPTH=8
 NUM_PAGES=$(find /tmp/page-*.pdf |wc -l)
 echo "Document has $NUM_PAGES pages"
 echo