Refactor container to output JSON status updates, and make CLI work with it

This commit is contained in:
Micah Lee 2021-08-05 15:00:18 -07:00
parent 450320de6f
commit 5545252ca5
No known key found for this signature in database
GPG key ID: 403C2657CD994F73
2 changed files with 218 additions and 56 deletions

View file

@ -1,8 +1,22 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""
Here are the steps, with progress bar percentages for each step:
document_to_pixels:
- 0%-3%: Convert document into a PDF (skipped if the input file is a PDF)
- 3%-5%: Split PDF into individual pages, and count those pages
- 5%-50%: Convert each page into pixels (each page takes 45/n%, where n is the number of pages)
pixels_to_pdf:
- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages)
- 95%-97%: Merge the per-page PDFs into a single PDF
- 97%-100%: Compress the final PDF
"""
import sys import sys
import subprocess import subprocess
import glob import glob
import os import os
import json
import magic import magic
from PIL import Image from PIL import Image
@ -13,6 +27,8 @@ class DangerzoneConverter:
pass pass
def document_to_pixels(self): def document_to_pixels(self):
percentage = 0.0
conversions = { conversions = {
# .pdf # .pdf
"application/pdf": {"type": None}, "application/pdf": {"type": None},
@ -88,7 +104,7 @@ class DangerzoneConverter:
# Validate MIME type # Validate MIME type
if mime_type not in conversions: if mime_type not in conversions:
self._print("The document format is not supported") self.output(True, "The document format is not supported", percentage)
return 1 return 1
# Convert input document to PDF # Convert input document to PDF
@ -96,7 +112,7 @@ class DangerzoneConverter:
if conversion["type"] is None: if conversion["type"] is None:
pdf_filename = "/tmp/input_file" pdf_filename = "/tmp/input_file"
elif conversion["type"] == "libreoffice": elif conversion["type"] == "libreoffice":
self._print(f"Converting to PDF using LibreOffice") self.output(False, "Converting to PDF using LibreOffice", percentage)
args = [ args = [
"libreoffice", "libreoffice",
"--headless", "--headless",
@ -107,19 +123,30 @@ class DangerzoneConverter:
"/tmp/input_file", "/tmp/input_file",
] ]
try: try:
p = subprocess.run(args, timeout=60) p = subprocess.run(
args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=60,
)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
"Error converting document to PDF, LibreOffice timed out after 60 seconds" True,
"Error converting document to PDF, LibreOffice timed out after 60 seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Conversion to PDF failed: {p.stdout}") self.output(
True,
f"Conversion to PDF with LibreOffice failed",
percentage,
)
return 1 return 1
pdf_filename = "/tmp/input_file.pdf" pdf_filename = "/tmp/input_file.pdf"
elif conversion["type"] == "convert": elif conversion["type"] == "convert":
self._print(f"Converting to PDF using GraphicsMagick") self.output(False, "Converting to PDF using GraphicsMagick", percentage)
args = [ args = [
"gm", "gm",
"convert", "convert",
@ -127,40 +154,69 @@ class DangerzoneConverter:
"/tmp/input_file.pdf", "/tmp/input_file.pdf",
] ]
try: try:
p = subprocess.run(args, timeout=60) p = subprocess.run(
args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=60,
)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
"Error converting document to PDF, GraphicsMagick timed out after 60 seconds" True,
"Error converting document to PDF, GraphicsMagick timed out after 60 seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Conversion to PDF failed: {p.stdout}") self.output(
True,
"Conversion to PDF with GraphicsMagick failed",
percentage,
)
return 1 return 1
pdf_filename = "/tmp/input_file.pdf" pdf_filename = "/tmp/input_file.pdf"
else: else:
self._print("Invalid conversion type") self.output(
True,
"Invalid conversion type",
percentage,
)
return 1 return 1
percentage += 3
# Separate PDF into pages # Separate PDF into pages
self._print("") self.output(
self._print(f"Separating document into pages") False,
"Separating document into pages",
percentage,
)
args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"] args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"]
try: try:
p = subprocess.run(args, timeout=60) p = subprocess.run(
args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=60
)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
"Error separating document into pages, pdfseparate timed out after 60 seconds" True,
"Error separating document into pages, pdfseparate timed out after 60 seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Separating document into pages failed: {p.stdout}") self.output(
True,
"Separating document into pages failed",
percentage,
)
return 1 return 1
page_filenames = glob.glob("/tmp/page-*.pdf") page_filenames = glob.glob("/tmp/page-*.pdf")
self._print(f"Document has {len(page_filenames)} pages")
self._print("") percentage += 2
# Convert to RGB pixel data # Convert to RGB pixel data
percentage_per_page = 45.0 / len(page_filenames)
for page in range(1, len(page_filenames) + 1): for page in range(1, len(page_filenames) + 1):
pdf_filename = f"/tmp/page-{page}.pdf" pdf_filename = f"/tmp/page-{page}.pdf"
png_filename = f"/tmp/page-{page}.png" png_filename = f"/tmp/page-{page}.png"
@ -169,21 +225,33 @@ class DangerzoneConverter:
height_filename = f"/tmp/page-{page}.height" height_filename = f"/tmp/page-{page}.height"
filename_base = f"/tmp/page-{page}" filename_base = f"/tmp/page-{page}"
self._print(f"Converting page {page} to pixels") self.output(
False,
f"Converting page {page}/{len(page_filenames)} to pixels",
percentage,
)
# Convert to png # Convert to png
try: try:
p = subprocess.run( p = subprocess.run(
["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base], ["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=60, timeout=60,
) )
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
"Error converting from PDF to PNG, pdftocairo timed out after 60 seconds" True,
"Error converting from PDF to PNG, pdftocairo timed out after 60 seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Conversion from PDF to PNG failed: {p.stdout}") self.output(
True,
"Conversion from PDF to PNG failed",
percentage,
)
return 1 return 1
# Save the width and height # Save the width and height
@ -208,24 +276,39 @@ class DangerzoneConverter:
timeout=60, timeout=60,
) )
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
"Error converting from PNG to pixels, convert timed out after 60 seconds" True,
"Error converting from PNG to pixels, convert timed out after 60 seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Conversion from PNG to RGB failed: {p.stdout}") self.output(
True,
"Conversion from PNG to RGB failed",
percentage,
)
return 1 return 1
# Delete the png # Delete the png
os.remove(png_filename) os.remove(png_filename)
percentage += percentage_per_page
self.output(
False,
"Converted document to pixels",
percentage,
)
return 0 return 0
def pixels_to_pdf(self): def pixels_to_pdf(self):
percentage = 50.0
num_pages = len(glob.glob("/dangerzone/page-*.rgb")) num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
self._print(f"Document has {num_pages} pages")
# Convert RGB files to PDF files # Convert RGB files to PDF files
percentage_per_page = 45.0 / num_pages
for page in range(1, num_pages + 1): for page in range(1, num_pages + 1):
filename_base = f"/dangerzone/page-{page}" filename_base = f"/dangerzone/page-{page}"
rgb_filename = f"{filename_base}.rgb" rgb_filename = f"{filename_base}.rgb"
@ -242,7 +325,11 @@ class DangerzoneConverter:
if os.environ.get("OCR") == "1": if os.environ.get("OCR") == "1":
# OCR the document # OCR the document
self._print(f"Converting page {page} from pixels to searchable PDF") self.output(
False,
f"Converting page {page}/{num_pages} from pixels to searchable PDF",
percentage,
)
args = [ args = [
"gm", "gm",
@ -255,14 +342,25 @@ class DangerzoneConverter:
f"png:{png_filename}", f"png:{png_filename}",
] ]
try: try:
p = subprocess.run(args, timeout=60) p = subprocess.run(
args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=60,
)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
"Error converting pixels to PNG, convert timed out after 60 seconds" True,
"Error converting pixels to PNG, convert timed out after 60 seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Page {page} conversion failed: {p.stdout}") self.output(
True,
f"Page {page}/{num_pages} conversion to PNG failed",
percentage,
)
return 1 return 1
args = [ args = [
@ -276,19 +374,34 @@ class DangerzoneConverter:
"pdf", "pdf",
] ]
try: try:
p = subprocess.run(args, timeout=60) p = subprocess.run(
args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=60,
)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
"Error converting PNG to searchable PDF, tesseract timed out after 60 seconds" True,
"Error converting PNG to searchable PDF, tesseract timed out after 60 seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Page {page} conversion failed: {p.stdout}") self.output(
True,
f"Page {page}/{num_pages} OCR failed",
percentage,
)
return 1 return 1
else: else:
# Don't OCR # Don't OCR
self._print(f"Converting page {page} from pixels to PDF") self.output(
False,
f"Converting page {page}/{num_pages} from pixels to PDF",
percentage,
)
args = [ args = [
"gm", "gm",
@ -301,56 +414,96 @@ class DangerzoneConverter:
f"pdf:{pdf_filename}", f"pdf:{pdf_filename}",
] ]
try: try:
p = subprocess.run(args, timeout=60) p = subprocess.run(
args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=60,
)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
"Error converting RGB to PDF, convert timed out after 60 seconds" True,
"Error converting RGB to PDF, convert timed out after 60 seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Page {page} conversion failed: {p.stdout}") self.output(
True,
f"Page {page}/{num_pages} conversion to PDF failed",
percentage,
)
return 1 return 1
self._print() percentage += percentage_per_page
# Merge pages into a single PDF # Merge pages into a single PDF
self._print(f"Merging {num_pages} pages into a single PDF") self.output(
False,
f"Merging {num_pages} pages into a single PDF",
percentage,
)
args = ["pdfunite"] args = ["pdfunite"]
for page in range(1, num_pages + 1): for page in range(1, num_pages + 1):
args.append(f"/tmp/page-{page}.pdf") args.append(f"/tmp/page-{page}.pdf")
args.append(f"/tmp/safe-output.pdf") args.append(f"/tmp/safe-output.pdf")
try: try:
p = subprocess.run(args, timeout=60) p = subprocess.run(
args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=60
)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
"Error merging pages into a single PDF, pdfunite timed out after 60 seconds" True,
"Error merging pages into a single PDF, pdfunite timed out after 60 seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Merge failed: {p.stdout}") self.output(
True,
"Merging pages into a single PDF failed",
percentage,
)
return 1 return 1
percentage += 2
# Compress # Compress
self._print("Compressing PDF") self.output(
False,
f"Compressing PDF",
percentage,
)
compress_timeout = num_pages * 3 compress_timeout = num_pages * 3
try: try:
p = subprocess.run( p = subprocess.run(
["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"], ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=compress_timeout, timeout=compress_timeout,
) )
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self._print( self.output(
f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds" True,
f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds",
percentage,
) )
return 1 return 1
if p.returncode != 0: if p.returncode != 0:
self._print(f"Compression failed: {p.stdout}") self.output(
True,
f"Compressing PDF failed",
percentage,
)
return 1 return 1
percentage = 100.0
self.output(False, "Safe PDF created", percentage)
return 0 return 0
def _print(self, s=""): def output(self, error, text, percentage):
print(s) print(json.dumps({"error": error, "text": text, "percentage": int(percentage)}))
sys.stdout.flush() sys.stdout.flush()

View file

@ -1,5 +1,5 @@
import os import os
import shutil import json
import click import click
from colorama import Fore, Style from colorama import Fore, Style
@ -90,7 +90,16 @@ def cli_main(output_filename, ocr_lang, filename):
print_header("Converting document to safe PDF") print_header("Converting document to safe PDF")
def stdout_callback(line): def stdout_callback(line):
print(line.rstrip()) try:
status = json.loads(line)
s = Style.BRIGHT + Fore.CYAN + f"{status['percentage']}% "
if status["error"]:
s += Style.RESET_ALL + Fore.RED + status["text"]
else:
s += Style.RESET_ALL + status["text"]
click.echo(s)
except:
click.echo(f"Invalid JSON returned from container: {line}")
if convert( if convert(
global_common, global_common,