mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 09:52:37 +02:00
Support English-language OCR
This commit is contained in:
parent
db23ced6c5
commit
ab425feef5
2 changed files with 9 additions and 3 deletions
|
@ -1,7 +1,9 @@
|
|||
FROM ubuntu:18.04
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr tesseract-ocr-all
|
||||
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr
|
||||
|
||||
# TODO: when we support OCR in other languages, we need tesseract-ocr-all
|
||||
|
||||
# Fix imagemagick policy to allow writing PDFs
|
||||
RUN sed -i '/rights="none" pattern="PDF"/c\<policy domain="coder" rights="read|write" pattern="PDF" />' /etc/ImageMagick-6/policy.xml
|
||||
|
|
|
@ -19,6 +19,7 @@ for RGB_FILENAME in $(find /dangerzone/page-*.rgb); do
|
|||
WIDTH_FILENAME=${FILENAME_BASE}.width
|
||||
HEIGHT_FILENAME=${FILENAME_BASE}.height
|
||||
PNG_FILENAME=/tmp/$(basename $FILENAME_BASE).png
|
||||
OCR_FILENAME=/tmp/$(basename $FILENAME_BASE)
|
||||
PDF_FILENAME=/tmp/$(basename $FILENAME_BASE).pdf
|
||||
|
||||
echo "Converting page $PAGE from pixels to PNG"
|
||||
|
@ -31,13 +32,16 @@ for RGB_FILENAME in $(find /dangerzone/page-*.rgb); do
|
|||
die "Page $PAGE conversion failed: $CONVERT_MSGS"
|
||||
fi
|
||||
|
||||
echo "Converting page $PAGE from PNG to PDF"
|
||||
CONVERT_MSGS=$(convert "$PNG_FILENAME" "$PDF_FILENAME" 2>&1)
|
||||
echo "Converting page $PAGE from PNG to searchable PDF"
|
||||
CONVERT_MSGS=$(tesseract $PNG_FILENAME $OCR_FILENAME pdf 2>&1)
|
||||
if [ $? -ne 0 ]; then
|
||||
die "Page $PAGE conversion failed: $CONVERT_MSGS"
|
||||
fi
|
||||
|
||||
echo
|
||||
done
|
||||
|
||||
echo
|
||||
echo "Merging $NUM_PAGES pages into a single PDF"
|
||||
|
||||
# Put PDF filenames into an array
|
||||
|
|
Loading…
Reference in a new issue