mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Add support for converting docx to pdf using headless libreoffice
This commit is contained in:
parent
ab425feef5
commit
3e1ad6376c
3 changed files with 32 additions and 4 deletions
|
@ -95,6 +95,12 @@ class ConvertToPixels(TaskBase):
|
||||||
]
|
]
|
||||||
output = self.execute_podman(args)
|
output = self.execute_podman(args)
|
||||||
|
|
||||||
|
# Did we hit an error?
|
||||||
|
for line in output.split("\n"):
|
||||||
|
if "conversion failed" in line or "The document format is not supported" in line:
|
||||||
|
self.task_failed.emit(output)
|
||||||
|
return
|
||||||
|
|
||||||
# How many pages was that?
|
# How many pages was that?
|
||||||
num_pages = None
|
num_pages = None
|
||||||
for line in output.split("\n"):
|
for line in output.split("\n"):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
FROM ubuntu:18.04
|
FROM ubuntu:18.04
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr
|
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr libreoffice
|
||||||
|
|
||||||
# TODO: when we support OCR in other languages, we need tesseract-ocr-all
|
# TODO: when we support OCR in other languages, we need tesseract-ocr-all
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,33 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
IMG_DEPTH=8
|
die() {
|
||||||
|
echo "$1" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# Detect the mime type
|
||||||
|
MIME_TYPE=$(file -b --mime-type /tmp/input_file)
|
||||||
|
echo "Documet MIME type is $MIME_TYPE"
|
||||||
|
echo
|
||||||
|
|
||||||
|
# .docx
|
||||||
|
if [ $MIME_TYPE = "application/pdf" ]; then
|
||||||
|
ORIGINAL_PDF=/tmp/input_file
|
||||||
|
elif [ $MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ]; then
|
||||||
|
echo "Converting document to PDF"
|
||||||
|
CONVERT_MSGS=$(libreoffice --headless --convert-to pdf:writer_pdf_Export --outdir /tmp /tmp/input_file 2>&1)
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
die "Page $PAGE conversion failed: $CONVERT_MSGS"
|
||||||
|
fi
|
||||||
|
ORIGINAL_PDF=/tmp/input_file.pdf
|
||||||
|
else
|
||||||
|
die "The document format is not supported"
|
||||||
|
fi
|
||||||
|
|
||||||
echo "Separating document into pages"
|
echo "Separating document into pages"
|
||||||
|
pdfseparate $ORIGINAL_PDF /tmp/page-%d.pdf
|
||||||
|
|
||||||
pdfseparate /tmp/input_file /tmp/page-%d.pdf
|
IMG_DEPTH=8
|
||||||
|
|
||||||
NUM_PAGES=$(find /tmp/page-*.pdf |wc -l)
|
NUM_PAGES=$(find /tmp/page-*.pdf |wc -l)
|
||||||
echo "Document has $NUM_PAGES pages"
|
echo "Document has $NUM_PAGES pages"
|
||||||
echo
|
echo
|
||||||
|
|
Loading…
Reference in a new issue