Add support for converting docx to pdf using headless libreoffice

This commit is contained in:
Micah Lee 2020-01-07 10:22:11 -08:00
parent ab425feef5
commit 3e1ad6376c
No known key found for this signature in database
GPG key ID: 403C2657CD994F73
3 changed files with 32 additions and 4 deletions

View file

@ -95,6 +95,12 @@ class ConvertToPixels(TaskBase):
]
output = self.execute_podman(args)
# Did we hit an error?
for line in output.split("\n"):
if "conversion failed" in line or "The document format is not supported" in line:
self.task_failed.emit(output)
return
# How many pages was that?
num_pages = None
for line in output.split("\n"):

View file

@ -1,7 +1,7 @@
FROM ubuntu:18.04
RUN apt-get update && \
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr libreoffice
# TODO: when we support OCR in other languages, we need tesseract-ocr-all

View file

@ -1,11 +1,33 @@
#!/bin/bash
IMG_DEPTH=8
# Abort the script: print the given message on stderr and exit with status 1.
#   $1 - message to report
die() {
    local msg="$1"
    echo "$msg" 1>&2
    exit 1
}
# Detect the MIME type of the input document with `file` and report it.
# MIME_TYPE is what the format check further down keys off.
MIME_TYPE=$(file -b --mime-type /tmp/input_file)
echo "Document MIME type is $MIME_TYPE"
echo
# .docx
# Choose the PDF that will be split into pages:
#   - PDFs are used as-is
#   - .docx files are first converted to PDF with headless LibreOffice
#   - anything else is rejected
# MIME_TYPE is quoted so that an empty value takes the "not supported" branch
# instead of making the `[` test itself fail with a syntax error.
if [ "$MIME_TYPE" = "application/pdf" ]; then
    ORIGINAL_PDF=/tmp/input_file
elif [ "$MIME_TYPE" = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ]; then
    # .docx
    echo "Converting document to PDF"
    ORIGINAL_PDF=/tmp/input_file.pdf
    # Treat the conversion as failed when LibreOffice exits non-zero OR when
    # the expected output file is missing. The message must keep the phrase
    # "conversion failed": the caller scans the output for it.
    if ! CONVERT_MSGS=$(libreoffice --headless --convert-to pdf:writer_pdf_Export --outdir /tmp /tmp/input_file 2>&1) || [ ! -f "$ORIGINAL_PDF" ]; then
        die "Document conversion failed: $CONVERT_MSGS"
    fi
else
    die "The document format is not supported"
fi
echo "Separating document into pages"
pdfseparate $ORIGINAL_PDF /tmp/page-%d.pdf
pdfseparate /tmp/input_file /tmp/page-%d.pdf
IMG_DEPTH=8
NUM_PAGES=$(find /tmp/page-*.pdf |wc -l)
echo "Document has $NUM_PAGES pages"
echo