mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 09:52:37 +02:00
Add support for converting docx to pdf using headless libreoffice
This commit is contained in:
parent
ab425feef5
commit
3e1ad6376c
3 changed files with 32 additions and 4 deletions
|
@ -95,6 +95,12 @@ class ConvertToPixels(TaskBase):
|
|||
]
|
||||
output = self.execute_podman(args)
|
||||
|
||||
# Did we hit an error?
|
||||
for line in output.split("\n"):
|
||||
if "conversion failed" in line or "The document format is not supported" in line:
|
||||
self.task_failed.emit(output)
|
||||
return
|
||||
|
||||
# How many pages was that?
|
||||
num_pages = None
|
||||
for line in output.split("\n"):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
FROM ubuntu:18.04
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr
|
||||
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr libreoffice
|
||||
|
||||
# TODO: when we support OCR in other languages, we need tesseract-ocr-all
|
||||
|
||||
|
|
|
@ -1,11 +1,33 @@
|
|||
#!/bin/bash
|
||||
|
||||
IMG_DEPTH=8
|
||||
die() {
|
||||
echo "$1" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Detect the mime type
|
||||
MIME_TYPE=$(file -b --mime-type /tmp/input_file)
|
||||
echo "Document MIME type is $MIME_TYPE"
|
||||
echo
|
||||
|
||||
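# Pick the PDF to split into pages: PDFs are used as-is, .docx is first converted to PDF with headless LibreOffice, anything else is rejected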
|
||||
if [ $MIME_TYPE = "application/pdf" ]; then
|
||||
ORIGINAL_PDF=/tmp/input_file
|
||||
elif [ $MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ]; then
|
||||
echo "Converting document to PDF"
|
||||
CONVERT_MSGS=$(libreoffice --headless --convert-to pdf:writer_pdf_Export --outdir /tmp /tmp/input_file 2>&1)
|
||||
if [ $? -ne 0 ]; then
|
||||
die "Page $PAGE conversion failed: $CONVERT_MSGS"
|
||||
fi
|
||||
ORIGINAL_PDF=/tmp/input_file.pdf
|
||||
else
|
||||
die "The document format is not supported"
|
||||
fi
|
||||
|
||||
echo "Separating document into pages"
|
||||
pdfseparate $ORIGINAL_PDF /tmp/page-%d.pdf
|
||||
|
||||
pdfseparate /tmp/input_file /tmp/page-%d.pdf
|
||||
|
||||
IMG_DEPTH=8
|
||||
NUM_PAGES=$(find /tmp/page-*.pdf |wc -l)
|
||||
echo "Document has $NUM_PAGES pages"
|
||||
echo
|
||||
|
|
Loading…
Reference in a new issue