From 911a511e11080bbd9db1839a7b50a0995b4116f8 Mon Sep 17 00:00:00 2001 From: Micah Lee Date: Thu, 9 Jan 2020 11:30:37 -0800 Subject: [PATCH] Remove container in preparation of making it a git submodule --- share/container/Containerfile | 10 ---- share/container/document-to-pixels | 72 ----------------------------- share/container/pixels-to-pdf | 73 ------------------------------ 3 files changed, 155 deletions(-) delete mode 100644 share/container/Containerfile delete mode 100755 share/container/document-to-pixels delete mode 100755 share/container/pixels-to-pdf diff --git a/share/container/Containerfile b/share/container/Containerfile deleted file mode 100644 index 5b23f01..0000000 --- a/share/container/Containerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM ubuntu:18.04 - -RUN apt-get update && \ - apt-get install -y file poppler-utils imagemagick ghostscript tesseract-ocr tesseract-ocr-all libreoffice - -# Fix imagemagick policy to allow writing PDFs -RUN sed -i '/rights="none" pattern="PDF"/c\' /etc/ImageMagick-6/policy.xml - -COPY document-to-pixels /usr/local/bin/document-to-pixels -COPY pixels-to-pdf /usr/local/bin/pixels-to-pdf diff --git a/share/container/document-to-pixels b/share/container/document-to-pixels deleted file mode 100755 index 7296775..0000000 --- a/share/container/document-to-pixels +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -die() { - echo "$1" >&2 - exit 1 -} - -# Detect the mime type -MIME_TYPE=$(file -b --mime-type /tmp/input_file) -echo "Documet MIME type is $MIME_TYPE" -echo - -# .pdf -if [ $MIME_TYPE = "application/pdf" ]; then - ORIGINAL_PDF=/tmp/input_file -# .docx -elif [ $MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ]; then - echo "Converting DOCX to PDF" - CONVERT_MSGS=$(libreoffice --headless --convert-to pdf:writer_pdf_Export --outdir /tmp /tmp/input_file 2>&1) - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed: $CONVERT_MSGS" - fi - ORIGINAL_PDF=/tmp/input_file.pdf -# .xlsx -elif [ $MIME_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ]; then - echo "Converting XLSX to PDF" - CONVERT_MSGS=$(libreoffice --headless --convert-to pdf:calc_pdf_Export --outdir /tmp /tmp/input_file 2>&1) - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed: $CONVERT_MSGS" - fi - ORIGINAL_PDF=/tmp/input_file.pdf -else - die "The document format is not supported" -fi - -echo "Separating document into pages" -CONVERT_MSGS=$(pdfseparate $ORIGINAL_PDF /tmp/page-%d.pdf 2>&1) -if [ $? -ne 0 ]; then - die "Separating document into pages failed: $CONVERT_MSGS" -fi - -IMG_DEPTH=8 -NUM_PAGES=$(find /tmp/page-*.pdf |wc -l) -echo "Document has $NUM_PAGES pages" -echo - -for PAGE in $(seq 1 $NUM_PAGES); do - FILENAME=/tmp/page-$PAGE.pdf - FILENAME_BASE=/tmp/page-$PAGE - - echo "Converting page $PAGE to pixels" - - # Convert to png - pdftocairo "$FILENAME" -png -singlefile "$FILENAME_BASE" - - # Get the width and height - IMG_WIDTH=$(identify -format "%w" "$FILENAME_BASE.png") - IMG_HEIGHT=$(identify -format "%h" "$FILENAME_BASE.png") - echo $IMG_WIDTH > $FILENAME_BASE.width - echo $IMG_HEIGHT > $FILENAME_BASE.height - - # Convert to rgb - convert "$FILENAME_BASE.png" -depth $IMG_DEPTH rgb:"$FILENAME_BASE.rgb" - - # Delete the png - rm "$FILENAME_BASE.png" - - # Move files needed for the next step to the mounted volume - mv "$FILENAME_BASE.rgb" /dangerzone - mv "$FILENAME_BASE.width" /dangerzone - mv "$FILENAME_BASE.height" /dangerzone -done diff --git a/share/container/pixels-to-pdf b/share/container/pixels-to-pdf deleted file mode 100755 index 2f1fcba..0000000 --- a/share/container/pixels-to-pdf +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -die() { - echo "$1" >&2 - exit 1 -} - -IMG_DEPTH=8 -NUM_PAGES=$(find /dangerzone/page-*.rgb |wc -l) - -echo "Document has $NUM_PAGES pages" -echo - -# Convert rgb files to png files -for PAGE in $(seq 1 $NUM_PAGES); do - FILENAME_BASE=/dangerzone/page-$PAGE - RGB_FILENAME=${FILENAME_BASE}.rgb - WIDTH_FILENAME=${FILENAME_BASE}.width - HEIGHT_FILENAME=${FILENAME_BASE}.height - PNG_FILENAME=/tmp/page-$PAGE.png - OCR_FILENAME=/tmp/page-$PAGE - PDF_FILENAME=/tmp/page-$PAGE.pdf - - IMG_WIDTH=$(cat $WIDTH_FILENAME) - IMG_HEIGHT=$(cat $HEIGHT_FILENAME) - - if [ $OCR = "1" ]; then - - echo "Converting page $PAGE from pixels to searchable PDF" - - CONVERT_MSGS=$(convert -size "${IMG_WIDTH}x${IMG_HEIGHT}" -depth ${IMG_DEPTH} rgb:"$RGB_FILENAME" png:"$PNG_FILENAME" 2>&1) - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed: $CONVERT_MSGS" - fi - - CONVERT_MSGS=$(tesseract $PNG_FILENAME $OCR_FILENAME pdf -l $OCR_LANGUAGE 2>&1) - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed: $CONVERT_MSGS" - fi - - else - - echo "Converting page $PAGE from pixels to PDF" - - CONVERT_MSGS=$(convert -size "${IMG_WIDTH}x${IMG_HEIGHT}" -depth ${IMG_DEPTH} rgb:"$RGB_FILENAME" png:"$PNG_FILENAME" 2>&1) - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed: $CONVERT_MSGS" - fi - - CONVERT_MSGS=$(convert $PNG_FILENAME $PDF_FILENAME 2>&1) - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed: $CONVERT_MSGS" - fi - - fi -done - -echo -echo "Merging $NUM_PAGES pages into a single PDF" - -# Put PDF filenames into an array -declare -a args -for PAGE in $(seq 1 $NUM_PAGES); do - args+=("/tmp/page-$PAGE.pdf") -done -args+=("/safezone/safe-output.pdf") - -# Merge them -pdfunite "${args[@]}" - -# Compress -echo "Compressing PDF" -ps2pdf /safezone/safe-output.pdf /safezone/safe-output-compressed.pdf