From ab425feef54da83648bbd503ed5a749374776b63 Mon Sep 17 00:00:00 2001 From: Micah Lee Date: Tue, 7 Jan 2020 09:52:04 -0800 Subject: [PATCH] Support english-language OCR --- share/container/Containerfile | 4 +++- share/container/pixels-to-pdf | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/share/container/Containerfile b/share/container/Containerfile index ecccfbb..eab9432 100644 --- a/share/container/Containerfile +++ b/share/container/Containerfile @@ -1,7 +1,9 @@ FROM ubuntu:18.04 RUN apt-get update && \ - apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr tesseract-ocr-all + apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr + +# TODO: when we support OCR in other languages, we need tesseract-ocr-all # Fix imagemagick policy to allow writing PDFs RUN sed -i '/rights="none" pattern="PDF"/c\' /etc/ImageMagick-6/policy.xml diff --git a/share/container/pixels-to-pdf b/share/container/pixels-to-pdf index ebab8fb..63b78d9 100755 --- a/share/container/pixels-to-pdf +++ b/share/container/pixels-to-pdf @@ -19,6 +19,7 @@ for RGB_FILENAME in $(find /dangerzone/page-*.rgb); do WIDTH_FILENAME=${FILENAME_BASE}.width HEIGHT_FILENAME=${FILENAME_BASE}.height PNG_FILENAME=/tmp/$(basename $FILENAME_BASE).png + OCR_FILENAME=/tmp/$(basename $FILENAME_BASE) PDF_FILENAME=/tmp/$(basename $FILENAME_BASE).pdf echo "Converting page $PAGE from pixels to PNG" @@ -31,13 +32,16 @@ for RGB_FILENAME in $(find /dangerzone/page-*.rgb); do die "Page $PAGE conversion failed: $CONVERT_MSGS" fi - echo "Converting page $PAGE from PNG to PDF" - CONVERT_MSGS=$(convert "$PNG_FILENAME" "$PDF_FILENAME" 2>&1) + echo "Converting page $PAGE from PNG to searchable PDF" + CONVERT_MSGS=$(tesseract $PNG_FILENAME $OCR_FILENAME pdf 2>&1) if [ $? -ne 0 ]; then die "Page $PAGE conversion failed: $CONVERT_MSGS" fi + + echo done +echo echo "Merging $NUM_PAGES pages into a single PDF" # Put PDF filenames into an array