From e172274873f978420a02d46a3c357838fe62a8c4 Mon Sep 17 00:00:00 2001
From: Micah Lee
Date: Fri, 30 Jul 2021 15:00:16 -0700
Subject: [PATCH] Refactor dangerzone-converter to have both containers share code

---
 dangerzone-converter/Dockerfile               |   1 +
 dangerzone-converter/scripts/dangerzone.py    | 386 ++++++++++++++++++
 .../scripts/document-to-pixels                |   2 +-
 .../scripts/document-to-pixels-unpriv         | 219 ----------
 dangerzone-converter/scripts/pixels-to-pdf    |   2 +-
 .../scripts/pixels-to-pdf-unpriv              | 139 -------
 6 files changed, 389 insertions(+), 360 deletions(-)
 create mode 100644 dangerzone-converter/scripts/dangerzone.py
 delete mode 100755 dangerzone-converter/scripts/document-to-pixels-unpriv
 delete mode 100755 dangerzone-converter/scripts/pixels-to-pdf-unpriv

diff --git a/dangerzone-converter/Dockerfile b/dangerzone-converter/Dockerfile
index 240c98b..bea4bfe 100644
--- a/dangerzone-converter/Dockerfile
+++ b/dangerzone-converter/Dockerfile
@@ -8,6 +8,7 @@ RUN apk -U upgrade && \
     libreoffice \
     openjdk8 \
     poppler-utils \
+    python3 \
     py3-magic \
     py3-pillow \
     sudo \
diff --git a/dangerzone-converter/scripts/dangerzone.py b/dangerzone-converter/scripts/dangerzone.py
new file mode 100644
index 0000000..c17f467
--- /dev/null
+++ b/dangerzone-converter/scripts/dangerzone.py
@@ -0,0 +1,386 @@
+#!/usr/bin/env python3
+"""Shared conversion code for both dangerzone-converter containers.
+
+Usage: dangerzone.py document-to-pixels | pixels-to-pdf
+"""
+import sys
+import subprocess
+import glob
+import os
+
+import magic
+from PIL import Image
+
+
+class DangerzoneConverter:
+    """Convert a document to raw pixels, and raw pixels back to a PDF.
+
+    Both public methods return 0 on success and 1 on failure, after
+    printing a message that describes the failure.
+    """
+
+    # How each supported MIME type is turned into a PDF before the PDF is
+    # split into pages. A "type" of None means the input is already a PDF.
+    CONVERSIONS = {
+        # .pdf
+        "application/pdf": {"type": None},
+        # .docx
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "writer_pdf_Export",
+        },
+        # .doc
+        "application/msword": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "writer_pdf_Export",
+        },
+        # .docm
+        "application/vnd.ms-word.document.macroEnabled.12": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "writer_pdf_Export",
+        },
+        # .xlsx
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "calc_pdf_Export",
+        },
+        # .xls
+        "application/vnd.ms-excel": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "calc_pdf_Export",
+        },
+        # .pptx
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "impress_pdf_Export",
+        },
+        # .ppt
+        "application/vnd.ms-powerpoint": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "impress_pdf_Export",
+        },
+        # .odt
+        "application/vnd.oasis.opendocument.text": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "writer_pdf_Export",
+        },
+        # .odg
+        "application/vnd.oasis.opendocument.graphics": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "impress_pdf_Export",
+        },
+        # .odp
+        "application/vnd.oasis.opendocument.presentation": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "impress_pdf_Export",
+        },
+        # .ods
+        "application/vnd.oasis.opendocument.spreadsheet": {
+            "type": "libreoffice",
+            "libreoffice_output_filter": "calc_pdf_Export",
+        },
+        # .jpg
+        "image/jpeg": {"type": "convert"},
+        # .gif
+        "image/gif": {"type": "convert"},
+        # .png
+        "image/png": {"type": "convert"},
+        # .tif
+        "image/tiff": {"type": "convert"},
+        "image/x-tiff": {"type": "convert"},
+    }
+
+    def __init__(self):
+        pass
+
+    def document_to_pixels(self):
+        """Convert /tmp/input_file to one raw RGB file per page.
+
+        For each page N this writes /tmp/page-N.rgb, plus /tmp/page-N.width
+        and /tmp/page-N.height holding the page's size in pixels.
+
+        Returns 0 on success and 1 on failure.
+        """
+        # Detect MIME type
+        mime = magic.Magic(mime=True)
+        mime_type = mime.from_file("/tmp/input_file")
+
+        # Validate MIME type
+        if mime_type not in self.CONVERSIONS:
+            self._print("The document format is not supported")
+            return 1
+
+        # Convert input document to PDF
+        conversion = self.CONVERSIONS[mime_type]
+        if conversion["type"] is None:
+            pdf_filename = "/tmp/input_file"
+        elif conversion["type"] == "libreoffice":
+            self._print("Converting to PDF using LibreOffice")
+            args = [
+                "libreoffice",
+                "--headless",
+                "--convert-to",
+                f"pdf:{conversion['libreoffice_output_filter']}",
+                "--outdir",
+                "/tmp",
+                "/tmp/input_file",
+            ]
+            if not self._run(
+                args,
+                "Error converting document to PDF, LibreOffice timed out after 60 seconds",
+                "Conversion to PDF failed",
+            ):
+                return 1
+            pdf_filename = "/tmp/input_file.pdf"
+        elif conversion["type"] == "convert":
+            self._print("Converting to PDF using GraphicsMagick")
+            args = [
+                "gm",
+                "convert",
+                "/tmp/input_file",
+                "/tmp/input_file.pdf",
+            ]
+            if not self._run(
+                args,
+                "Error converting document to PDF, GraphicsMagick timed out after 60 seconds",
+                "Conversion to PDF failed",
+            ):
+                return 1
+            pdf_filename = "/tmp/input_file.pdf"
+        else:
+            self._print("Invalid conversion type")
+            return 1
+
+        # Separate PDF into pages
+        self._print("")
+        self._print("Separating document into pages")
+        args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"]
+        if not self._run(
+            args,
+            "Error separating document into pages, pdftk timed out after 60 seconds",
+            "Separating document into pages failed",
+        ):
+            return 1
+
+        page_filenames = glob.glob("/tmp/page-*.pdf")
+        self._print(f"Document has {len(page_filenames)} pages")
+        self._print("")
+
+        # Convert to RGB pixel data
+        for page in range(1, len(page_filenames) + 1):
+            pdf_filename = f"/tmp/page-{page}.pdf"
+            png_filename = f"/tmp/page-{page}.png"
+            rgb_filename = f"/tmp/page-{page}.rgb"
+            width_filename = f"/tmp/page-{page}.width"
+            height_filename = f"/tmp/page-{page}.height"
+            filename_base = f"/tmp/page-{page}"
+
+            self._print(f"Converting page {page} to pixels")
+
+            # Convert to png
+            if not self._run(
+                ["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
+                "Error converting from PDF to PNG, pdftocairo timed out after 60 seconds",
+                "Conversion from PDF to PNG failed",
+            ):
+                return 1
+
+            # Save the width and height
+            with Image.open(png_filename) as im:
+                width, height = im.size
+            with open(width_filename, "w") as f:
+                f.write(str(width))
+            with open(height_filename, "w") as f:
+                f.write(str(height))
+
+            # Convert to RGB pixels
+            if not self._run(
+                ["gm", "convert", png_filename, "-depth", "8", f"rgb:{rgb_filename}"],
+                "Error converting from PNG to pixels, convert timed out after 60 seconds",
+                "Conversion from PNG to RGB failed",
+            ):
+                return 1
+
+            # Delete the png
+            os.remove(png_filename)
+
+        return 0
+
+    def pixels_to_pdf(self):
+        """Convert the pixel data in /dangerzone into a single PDF.
+
+        Reads /dangerzone/page-N.rgb, .width and .height for each page and
+        writes /tmp/safe-output.pdf and /tmp/safe-output-compressed.pdf.
+        Pages are run through tesseract when the OCR environment variable
+        is "1", using the language named by OCR_LANGUAGE.
+
+        Returns 0 on success and 1 on failure.
+        """
+        num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
+        self._print(f"Document has {num_pages} pages")
+
+        ocr = os.environ.get("OCR") == "1"
+        ocr_language = os.environ.get("OCR_LANGUAGE")
+        if ocr and not ocr_language:
+            # subprocess.run() raises TypeError if an argument is None
+            self._print("OCR was requested but OCR_LANGUAGE is not set")
+            return 1
+
+        # Convert RGB files to PDF files
+        for page in range(1, num_pages + 1):
+            filename_base = f"/dangerzone/page-{page}"
+            rgb_filename = f"{filename_base}.rgb"
+            width_filename = f"{filename_base}.width"
+            height_filename = f"{filename_base}.height"
+            png_filename = f"/tmp/page-{page}.png"
+            ocr_filename = f"/tmp/page-{page}"
+            pdf_filename = f"/tmp/page-{page}.pdf"
+
+            with open(width_filename) as f:
+                width = f.read().strip()
+            with open(height_filename) as f:
+                height = f.read().strip()
+
+            if ocr:
+                # OCR the document
+                self._print(f"Converting page {page} from pixels to searchable PDF")
+
+                args = [
+                    "gm",
+                    "convert",
+                    "-size",
+                    f"{width}x{height}",
+                    "-depth",
+                    "8",
+                    f"rgb:{rgb_filename}",
+                    f"png:{png_filename}",
+                ]
+                if not self._run(
+                    args,
+                    "Error converting pixels to PNG, convert timed out after 60 seconds",
+                    f"Page {page} conversion failed",
+                ):
+                    return 1
+
+                args = [
+                    "tesseract",
+                    png_filename,
+                    ocr_filename,
+                    "-l",
+                    ocr_language,
+                    "--dpi",
+                    "70",
+                    "pdf",
+                ]
+                if not self._run(
+                    args,
+                    "Error converting PNG to searchable PDF, tesseract timed out after 60 seconds",
+                    f"Page {page} conversion failed",
+                ):
+                    return 1
+
+            else:
+                # Don't OCR
+                self._print(f"Converting page {page} from pixels to PDF")
+
+                args = [
+                    "gm",
+                    "convert",
+                    "-size",
+                    f"{width}x{height}",
+                    "-depth",
+                    "8",
+                    f"rgb:{rgb_filename}",
+                    f"pdf:{pdf_filename}",
+                ]
+                if not self._run(
+                    args,
+                    "Error converting RGB to PDF, convert timed out after 60 seconds",
+                    f"Page {page} conversion failed",
+                ):
+                    return 1
+
+        self._print()
+
+        # Merge pages into a single PDF
+        self._print(f"Merging {num_pages} pages into a single PDF")
+        args = ["pdfunite"]
+        for page in range(1, num_pages + 1):
+            args.append(f"/tmp/page-{page}.pdf")
+        args.append("/tmp/safe-output.pdf")
+        if not self._run(
+            args,
+            "Error merging pages into a single PDF, pdfunite timed out after 60 seconds",
+            "Merge failed",
+        ):
+            return 1
+
+        # Compress
+        self._print("Compressing PDF")
+        compress_timeout = num_pages * 3
+        if not self._run(
+            ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
+            f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds",
+            "Compression failed",
+            timeout=compress_timeout,
+        ):
+            return 1
+
+        return 0
+
+    def _run(self, args, timeout_message, failure_message, timeout=60):
+        """Run an external command and report a timeout or a failure.
+
+        The command's stdout and stderr are captured so that they can be
+        included in the failure message; without capturing, the stdout
+        attribute of the completed process is always None.
+
+        Returns True if the command exited with status 0, else False.
+        """
+        try:
+            p = subprocess.run(
+                args,
+                timeout=timeout,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                universal_newlines=True,
+                errors="replace",
+            )
+        except subprocess.TimeoutExpired:
+            self._print(timeout_message)
+            return False
+        if p.returncode != 0:
+            self._print(f"{failure_message}: {p.stdout.strip()}")
+            return False
+        return True
+
+    def _print(self, s=""):
+        """Print s and flush stdout."""
+        print(s, flush=True)
+
+
+def main():
+    """Dispatch on the command given in sys.argv[1].
+
+    Returns the converter's result (0 or 1), or -1 on a usage error.
+    """
+    usage = f"Usage: {sys.argv[0]} [document-to-pixels]|[pixels-to-pdf]"
+    if len(sys.argv) != 2:
+        print(usage)
+        return -1
+
+    converter = DangerzoneConverter()
+
+    if sys.argv[1] == "document-to-pixels":
+        return converter.document_to_pixels()
+
+    if sys.argv[1] == "pixels-to-pdf":
+        return converter.pixels_to_pdf()
+
+    # Unknown command: explain instead of failing silently
+    print(usage)
+    return -1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/dangerzone-converter/scripts/document-to-pixels b/dangerzone-converter/scripts/document-to-pixels
index b2b3bff..56cba1d 100755
--- a/dangerzone-converter/scripts/document-to-pixels
+++ b/dangerzone-converter/scripts/document-to-pixels
@@ -10,7 +10,7 @@ START_PERMISSIONS=$(stat /tmp/input_file | grep Access | grep Uid | cut -d"(" -f
 
 # Do the conversion without root
 # /usr/bin/sudo -u user /usr/local/bin/document-to-pixels-unpriv
-/usr/local/bin/document-to-pixels-unpriv
+/usr/bin/python3 /usr/local/bin/dangerzone.py document-to-pixels
 RETURN_CODE=$?
# Restore original permissions diff --git a/dangerzone-converter/scripts/document-to-pixels-unpriv b/dangerzone-converter/scripts/document-to-pixels-unpriv deleted file mode 100755 index d51a9ac..0000000 --- a/dangerzone-converter/scripts/document-to-pixels-unpriv +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 -import sys -import subprocess -import glob -import shutil -import os - -import magic -from PIL import Image - - -def print_flush(s): - print(s) - sys.stdout.flush() - - -def main(): - conversions = { - # .pdf - "application/pdf": {"type": None}, - # .docx - "application/vnd.openxmlformats-officedocument.wordprocessingml.document": { - "type": "libreoffice", - "libreoffice_output_filter": "writer_pdf_Export", - }, - # .doc - "application/msword": { - "type": "libreoffice", - "libreoffice_output_filter": "writer_pdf_Export", - }, - # .docm - "application/vnd.ms-word.document.macroEnabled.12": { - "type": "libreoffice", - "libreoffice_output_filter": "writer_pdf_Export", - }, - # .xlsx - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": { - "type": "libreoffice", - "libreoffice_output_filter": "calc_pdf_Export", - }, - # .xls - "application/vnd.ms-excel": { - "type": "libreoffice", - "libreoffice_output_filter": "calc_pdf_Export", - }, - # .pptx - "application/vnd.openxmlformats-officedocument.presentationml.presentation": { - "type": "libreoffice", - "libreoffice_output_filter": "impress_pdf_Export", - }, - # .ppt - "application/vnd.ms-powerpoint": { - "type": "libreoffice", - "libreoffice_output_filter": "impress_pdf_Export", - }, - # .odt - "application/vnd.oasis.opendocument.text": { - "type": "libreoffice", - "libreoffice_output_filter": "writer_pdf_Export", - }, - # .odg - "application/vnd.oasis.opendocument.graphics": { - "type": "libreoffice", - "libreoffice_output_filter": "impress_pdf_Export", - }, - # .odp - "application/vnd.oasis.opendocument.presentation": { - "type": "libreoffice", - "libreoffice_output_filter": 
"impress_pdf_Export", - }, - # .ops - "application/vnd.oasis.opendocument.spreadsheet": { - "type": "libreoffice", - "libreoffice_output_filter": "calc_pdf_Export", - }, - # .jpg - "image/jpeg": {"type": "convert"}, - # .gif - "image/gif": {"type": "convert"}, - # .png - "image/png": {"type": "convert"}, - # .tif - "image/tiff": {"type": "convert"}, - "image/x-tiff": {"type": "convert"}, - } - - # Detect MIME type - mime = magic.Magic(mime=True) - mime_type = mime.from_file("/tmp/input_file") - - # Validate MIME type - if mime_type not in conversions: - print_flush("The document format is not supported") - sys.exit(1) - - # Convert input document to PDF - conversion = conversions[mime_type] - if conversion["type"] is None: - pdf_filename = "/tmp/input_file" - elif conversion["type"] == "libreoffice": - print_flush(f"Converting to PDF using LibreOffice") - args = [ - "libreoffice", - "--headless", - "--convert-to", - f"pdf:{conversion['libreoffice_output_filter']}", - "--outdir", - "/tmp", - "/tmp/input_file", - ] - try: - p = subprocess.run(args, timeout=60) - except subprocess.TimeoutExpired: - print_flush( - "Error converting document to PDF, LibreOffice timed out after 60 seconds" - ) - sys.exit(1) - - if p.returncode != 0: - print_flush(f"Conversion to PDF failed: {p.stdout}") - sys.exit(1) - pdf_filename = "/tmp/input_file.pdf" - elif conversion["type"] == "convert": - print_flush(f"Converting to PDF using GraphicsMagick") - args = [ - "gm", - "convert", - "/tmp/input_file", - "/tmp/input_file.pdf", - ] - try: - p = subprocess.run(args, timeout=60) - except subprocess.TimeoutExpired: - print_flush( - "Error converting document to PDF, GraphicsMagick timed out after 60 seconds" - ) - sys.exit(1) - if p.returncode != 0: - print_flush(f"Conversion to PDF failed: {p.stdout}") - sys.exit(1) - pdf_filename = "/tmp/input_file.pdf" - else: - print_flush("Invalid conversion type") - sys.exit(1) - - # Separate PDF into pages - print_flush("") - print_flush(f"Separating 
document into pages") - args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"] - try: - p = subprocess.run(args, timeout=60) - except subprocess.TimeoutExpired: - print_flush( - "Error separating document into pages, pdfseparate timed out after 60 seconds" - ) - sys.exit(1) - if p.returncode != 0: - print_flush(f"Separating document into pages failed: {p.stdout}") - sys.exit(1) - - page_filenames = glob.glob("/tmp/page-*.pdf") - print_flush(f"Document has {len(page_filenames)} pages") - print_flush("") - - # Convert to RGB pixel data - for page in range(1, len(page_filenames) + 1): - pdf_filename = f"/tmp/page-{page}.pdf" - png_filename = f"/tmp/page-{page}.png" - rgb_filename = f"/tmp/page-{page}.rgb" - width_filename = f"/tmp/page-{page}.width" - height_filename = f"/tmp/page-{page}.height" - filename_base = f"/tmp/page-{page}" - - print_flush(f"Converting page {page} to pixels") - - # Convert to png - try: - p = subprocess.run( - ["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base], - timeout=60, - ) - except subprocess.TimeoutExpired: - print_flush( - "Error converting from PDF to PNG, pdftocairo timed out after 60 seconds" - ) - sys.exit(1) - if p.returncode != 0: - print_flush(f"Conversion from PDF to PNG failed: {p.stdout}") - sys.exit(1) - - # Save the width and height - im = Image.open(png_filename) - width, height = im.size - with open(width_filename, "w") as f: - f.write(str(width)) - with open(height_filename, "w") as f: - f.write(str(height)) - - # Convert to RGB pixels - try: - p = subprocess.run( - ["gm", "convert", png_filename, "-depth", "8", f"rgb:{rgb_filename}"], - timeout=60, - ) - except subprocess.TimeoutExpired: - print_flush( - "Error converting from PNG to pixels, convert timed out after 60 seconds" - ) - sys.exit(1) - if p.returncode != 0: - print_flush(f"Conversion from PNG to RGB failed: {p.stdout}") - sys.exit(1) - - # Delete the png - os.remove(png_filename) - - -if __name__ == "__main__": - main() diff 
--git a/dangerzone-converter/scripts/pixels-to-pdf b/dangerzone-converter/scripts/pixels-to-pdf index 7adde40..01ebcb2 100755 --- a/dangerzone-converter/scripts/pixels-to-pdf +++ b/dangerzone-converter/scripts/pixels-to-pdf @@ -6,7 +6,7 @@ echo 127.0.0.1 $(hostname) >> /etc/hosts # Do the conversion without root # /usr/bin/sudo OCR=$OCR OCR_LANGUAGE=$OCR_LANGUAGE -u user /usr/local/bin/pixels-to-pdf-unpriv -OCR=$OCR OCR_LANGUAGE=$OCR_LANGUAGE /usr/local/bin/pixels-to-pdf-unpriv +OCR=$OCR OCR_LANGUAGE=$OCR_LANGUAGE /usr/bin/python3 /usr/local/bin/dangerzone.py pixels-to-pdf RETURN_CODE=$? if [ $RETURN_CODE -ne 0 ]; then echo "" diff --git a/dangerzone-converter/scripts/pixels-to-pdf-unpriv b/dangerzone-converter/scripts/pixels-to-pdf-unpriv deleted file mode 100755 index 876e6d6..0000000 --- a/dangerzone-converter/scripts/pixels-to-pdf-unpriv +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python3 -import glob -import os -import sys -import subprocess - - -def print_flush(s=""): - print(s) - sys.stdout.flush() - - -def main(): - num_pages = len(glob.glob("/dangerzone/page-*.rgb")) - print_flush(f"Document has {num_pages} pages") - - # Convert RGB files to PDF files - for page in range(1, num_pages + 1): - filename_base = f"/dangerzone/page-{page}" - rgb_filename = f"{filename_base}.rgb" - width_filename = f"{filename_base}.width" - height_filename = f"{filename_base}.height" - png_filename = f"/tmp/page-{page}.png" - ocr_filename = f"/tmp/page-{page}" - pdf_filename = f"/tmp/page-{page}.pdf" - - with open(width_filename) as f: - width = f.read().strip() - with open(height_filename) as f: - height = f.read().strip() - - if os.environ.get("OCR") == "1": - # OCR the document - print_flush(f"Converting page {page} from pixels to searchable PDF") - - args = [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"png:{png_filename}", - ] - try: - p = subprocess.run(args, timeout=60) - except subprocess.TimeoutExpired: - 
print_flush( - "Error converting pixels to PNG, convert timed out after 60 seconds" - ) - sys.exit(1) - if p.returncode != 0: - print_flush(f"Page {page} conversion failed: {p.stdout}") - sys.exit(1) - - args = [ - "tesseract", - png_filename, - ocr_filename, - "-l", - os.environ.get("OCR_LANGUAGE"), - "--dpi", - "70", - "pdf" - ] - try: - p = subprocess.run(args, timeout=60) - except subprocess.TimeoutExpired: - print_flush( - "Error converting PNG to searchable PDF, tesseract timed out after 60 seconds" - ) - sys.exit(1) - if p.returncode != 0: - print_flush(f"Page {page} conversion failed: {p.stdout}") - sys.exit(1) - - else: - # Don't OCR - print_flush(f"Converting page {page} from pixels to PDF") - - args = [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"pdf:{pdf_filename}", - ] - try: - p = subprocess.run(args, timeout=60) - except subprocess.TimeoutExpired: - print_flush( - "Error converting RGB to PDF, convert timed out after 60 seconds" - ) - sys.exit(1) - if p.returncode != 0: - print_flush(f"Page {page} conversion failed: {p.stdout}") - sys.exit(1) - - print_flush() - - # Merge pages into a single PDF - print_flush(f"Merging {num_pages} pages into a single PDF") - args = ["pdfunite"] - for page in range(1, num_pages + 1): - args.append(f"/tmp/page-{page}.pdf") - args.append(f"/tmp/safe-output.pdf") - try: - p = subprocess.run(args, timeout=60) - except subprocess.TimeoutExpired: - print_flush( - "Error merging pages into a single PDF, pdfunite timed out after 60 seconds" - ) - sys.exit(1) - if p.returncode != 0: - print_flush(f"Merge failed: {p.stdout}") - sys.exit(1) - - # Compress - print_flush("Compressing PDF") - compress_timeout = num_pages * 3 - try: - p = subprocess.run( - ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"], - timeout=compress_timeout, - ) - except subprocess.TimeoutExpired: - print_flush(f"Error compressing PDF, ps2pdf timed out after {compress_timeout} 
seconds") - sys.exit(1) - if p.returncode != 0: - print_flush(f"Compression failed: {p.stdout}") - sys.exit(1) - - -if __name__ == "__main__": - main()