Support English-language OCR

This commit is contained in:
Micah Lee 2020-01-07 09:52:04 -08:00
parent db23ced6c5
commit ab425feef5
No known key found for this signature in database
GPG key ID: 403C2657CD994F73
2 changed files with 9 additions and 3 deletions

View file

@ -1,7 +1,9 @@
FROM ubuntu:18.04
RUN apt-get update && \
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr tesseract-ocr-all
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr
# TODO: when we support OCR in other languages, we need tesseract-ocr-all
# Fix imagemagick policy to allow writing PDFs
RUN sed -i '/rights="none" pattern="PDF"/c\<policy domain="coder" rights="read|write" pattern="PDF" />' /etc/ImageMagick-6/policy.xml

View file

@ -19,6 +19,7 @@ for RGB_FILENAME in $(find /dangerzone/page-*.rgb); do
WIDTH_FILENAME=${FILENAME_BASE}.width
HEIGHT_FILENAME=${FILENAME_BASE}.height
PNG_FILENAME=/tmp/$(basename $FILENAME_BASE).png
OCR_FILENAME=/tmp/$(basename $FILENAME_BASE)
PDF_FILENAME=/tmp/$(basename $FILENAME_BASE).pdf
echo "Converting page $PAGE from pixels to PNG"
@ -31,13 +32,16 @@ for RGB_FILENAME in $(find /dangerzone/page-*.rgb); do
die "Page $PAGE conversion failed: $CONVERT_MSGS"
fi
echo "Converting page $PAGE from PNG to PDF"
CONVERT_MSGS=$(convert "$PNG_FILENAME" "$PDF_FILENAME" 2>&1)
echo "Converting page $PAGE from PNG to searchable PDF"
CONVERT_MSGS=$(tesseract $PNG_FILENAME $OCR_FILENAME pdf 2>&1)
if [ $? -ne 0 ]; then
die "Page $PAGE conversion failed: $CONVERT_MSGS"
fi
echo
done
echo
echo "Merging $NUM_PAGES pages into a single PDF"
# Put PDF filenames into an array