mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Support English-language OCR
This commit is contained in:
parent
db23ced6c5
commit
ab425feef5
2 changed files with 9 additions and 3 deletions
|
@@ -1,7 +1,9 @@
|
||||||
FROM ubuntu:18.04
|
FROM ubuntu:18.04
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr tesseract-ocr-all
|
apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr
|
||||||
|
|
||||||
|
# TODO: when we support OCR in other languages, we need tesseract-ocr-all
|
||||||
|
|
||||||
# Fix imagemagick policy to allow writing PDFs
|
# Fix imagemagick policy to allow writing PDFs
|
||||||
RUN sed -i '/rights="none" pattern="PDF"/c\<policy domain="coder" rights="read|write" pattern="PDF" />' /etc/ImageMagick-6/policy.xml
|
RUN sed -i '/rights="none" pattern="PDF"/c\<policy domain="coder" rights="read|write" pattern="PDF" />' /etc/ImageMagick-6/policy.xml
|
||||||
|
|
|
@@ -19,6 +19,7 @@ for RGB_FILENAME in $(find /dangerzone/page-*.rgb); do
|
||||||
WIDTH_FILENAME=${FILENAME_BASE}.width
|
WIDTH_FILENAME=${FILENAME_BASE}.width
|
||||||
HEIGHT_FILENAME=${FILENAME_BASE}.height
|
HEIGHT_FILENAME=${FILENAME_BASE}.height
|
||||||
PNG_FILENAME=/tmp/$(basename $FILENAME_BASE).png
|
PNG_FILENAME=/tmp/$(basename $FILENAME_BASE).png
|
||||||
|
OCR_FILENAME=/tmp/$(basename $FILENAME_BASE)
|
||||||
PDF_FILENAME=/tmp/$(basename $FILENAME_BASE).pdf
|
PDF_FILENAME=/tmp/$(basename $FILENAME_BASE).pdf
|
||||||
|
|
||||||
echo "Converting page $PAGE from pixels to PNG"
|
echo "Converting page $PAGE from pixels to PNG"
|
||||||
|
@@ -31,13 +32,16 @@ for RGB_FILENAME in $(find /dangerzone/page-*.rgb); do
|
||||||
die "Page $PAGE conversion failed: $CONVERT_MSGS"
|
die "Page $PAGE conversion failed: $CONVERT_MSGS"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Converting page $PAGE from PNG to PDF"
|
echo "Converting page $PAGE from PNG to searchable PDF"
|
||||||
CONVERT_MSGS=$(convert "$PNG_FILENAME" "$PDF_FILENAME" 2>&1)
|
CONVERT_MSGS=$(tesseract $PNG_FILENAME $OCR_FILENAME pdf 2>&1)
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
die "Page $PAGE conversion failed: $CONVERT_MSGS"
|
die "Page $PAGE conversion failed: $CONVERT_MSGS"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
echo
|
||||||
done
|
done
|
||||||
|
|
||||||
|
echo
|
||||||
echo "Merging $NUM_PAGES pages into a single PDF"
|
echo "Merging $NUM_PAGES pages into a single PDF"
|
||||||
|
|
||||||
# Put PDF filenames into an array
|
# Put PDF filenames into an array
|
||||||
|
|
Loading…
Reference in a new issue