From 3e1ad6376c0e95efd475e772bc27f39e1bcb390e Mon Sep 17 00:00:00 2001
From: Micah Lee <micah@micahflee.com>
Date: Tue, 7 Jan 2020 10:22:11 -0800
Subject: [PATCH] Add support for converting docx to pdf using headless
 libreoffice

---
 dangerzone/tasks.py                |  6 ++++++
 share/container/Containerfile      |  2 +-
 share/container/document-to-pixels | 28 +++++++++++++++++++++++++---
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/dangerzone/tasks.py b/dangerzone/tasks.py
index 31ba048..378ba80 100644
--- a/dangerzone/tasks.py
+++ b/dangerzone/tasks.py
@@ -95,6 +95,12 @@ class ConvertToPixels(TaskBase):
         ]
         output = self.execute_podman(args)
 
+        # Did we hit an error?
+        for line in output.split("\n"):
+            if "conversion failed" in line or "The document format is not supported" in line:
+                self.task_failed.emit(output)
+                return
+
         # How many pages was that?
         num_pages = None
         for line in output.split("\n"):
diff --git a/share/container/Containerfile b/share/container/Containerfile
index eab9432..d36d79b 100644
--- a/share/container/Containerfile
+++ b/share/container/Containerfile
@@ -1,7 +1,7 @@
 FROM ubuntu:18.04
 
 RUN apt-get update && \
-    apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr
+    apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr libreoffice
 
 # TODO: when we support OCR in other languages, we need tesseract-ocr-all
 
diff --git a/share/container/document-to-pixels b/share/container/document-to-pixels
index 5154723..76d80d4 100755
--- a/share/container/document-to-pixels
+++ b/share/container/document-to-pixels
@@ -1,11 +1,33 @@
 #!/bin/bash
 
-IMG_DEPTH=8
+# die MESSAGE: print MESSAGE to stderr and exit with status 1.
+die() {
+    echo "$1" >&2
+    exit 1
+}
+
+# Detect the MIME type of the input document
+MIME_TYPE=$(file -b --mime-type /tmp/input_file)
+echo "Document MIME type is $MIME_TYPE"
+echo
+
+# PDFs are used as-is; .docx files are first converted to PDF with headless LibreOffice
+if [ "$MIME_TYPE" = "application/pdf" ]; then
+    ORIGINAL_PDF=/tmp/input_file
+elif [ "$MIME_TYPE" = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ]; then
+    echo "Converting document to PDF"
+    # Fail on a non-zero exit, and also if no PDF was written despite a zero exit status
+    if ! CONVERT_MSGS=$(libreoffice --headless --convert-to pdf:writer_pdf_Export --outdir /tmp /tmp/input_file 2>&1) || [ ! -f /tmp/input_file.pdf ]; then
+        die "Document conversion failed: $CONVERT_MSGS"
+    fi
+    ORIGINAL_PDF=/tmp/input_file.pdf
+else
+    die "The document format is not supported"
+fi
 
 echo "Separating document into pages"
+pdfseparate $ORIGINAL_PDF /tmp/page-%d.pdf
 
-pdfseparate /tmp/input_file /tmp/page-%d.pdf
-
+IMG_DEPTH=8
 NUM_PAGES=$(find /tmp/page-*.pdf |wc -l)
 echo "Document has $NUM_PAGES pages"
 echo