Add support for converting docx to pdf using headless libreoffice

2025-04-28 18:02:38 +02:00 · 2020-01-07 10:22:11 -08:00 · 2020-01-07 10:22:11 -08:00 · 3e1ad6376c
commit 3e1ad6376c
parent ab425feef5
3 changed files with 32 additions and 4 deletions
--- a/dangerzone/tasks.py
+++ b/dangerzone/tasks.py
@ -95,6 +95,12 @@ class ConvertToPixels(TaskBase):
        ]
        output = self.execute_podman(args)
        # Did we hit an error?
        for line in output.split("\n"):
            if "conversion failed" in line or "The document format is not supported" in line:
                self.task_failed.emit(output)
                return
        # How many pages was that?
        num_pages = None
        for line in output.split("\n"):
--- a/share/container/Containerfile
+++ b/share/container/Containerfile
@ -1,7 +1,7 @@
 FROM ubuntu:18.04
 RUN apt-get update && \
-    apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr
+    apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr libreoffice
 # TODO: when we support OCR in other languages, we need tesseract-ocr-all
--- a/share/container/document-to-pixels
+++ b/share/container/document-to-pixels
@ -1,11 +1,33 @@
 #!/bin/bash
-IMG_DEPTH=8
+die() {
    echo "$1" >&2
    exit 1
 }
 # Detect the mime type of the input so we know how to obtain a PDF from it.
 MIME_TYPE=$(file -b --mime-type /tmp/input_file)
 echo "Document MIME type is $MIME_TYPE"
 echo
 # Choose the PDF that will be split into pages: a PDF is used as-is, a .docx
 # is first converted with headless LibreOffice, anything else is rejected.
 # $MIME_TYPE is quoted so an empty or multi-word value cannot break `[`.
 if [ "$MIME_TYPE" = "application/pdf" ]; then
    ORIGINAL_PDF=/tmp/input_file
 elif [ "$MIME_TYPE" = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ]; then
    # .docx
    echo "Converting document to PDF"
    CONVERT_MSGS=$(libreoffice --headless --convert-to pdf:writer_pdf_Export --outdir /tmp /tmp/input_file 2>&1)
    # Besides the exit status, make sure the converted file actually exists,
    # since the following steps read it unconditionally.
    if [ $? -ne 0 ] || [ ! -f /tmp/input_file.pdf ]; then
        # No page number exists yet at this stage, so report a document-level
        # failure. The phrase "conversion failed" must stay in the message:
        # dangerzone/tasks.py scans the output for it to detect errors.
        die "Document conversion failed: $CONVERT_MSGS"
    fi
    ORIGINAL_PDF=/tmp/input_file.pdf
 else
    die "The document format is not supported"
 fi
 echo "Separating document into pages"
 pdfseparate $ORIGINAL_PDF /tmp/page-%d.pdf
-pdfseparate /tmp/input_file /tmp/page-%d.pdf
+IMG_DEPTH=8
 NUM_PAGES=$(find /tmp/page-*.pdf |wc -l)
 echo "Document has $NUM_PAGES pages"
 echo