dangerzone/container/dangerzone.py
2022-11-10 09:58:45 +00:00

538 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Here are the steps, with progress bar percentages for each step:
document_to_pixels:
- 0%-3%: Convert document into a PDF (skipped if the input file is a PDF)
- 3%-5%: Split PDF into individual pages, and count those pages
- 5%-50%: Convert each page into pixels (each page takes 45/n%, where n is the number of pages)
pixels_to_pdf:
- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages)
- 95%-97%: Merge the per-page PDFs into a single PDF
- 97%-100%: Compress the final PDF
"""
import glob
import json
import os
import shutil
import subprocess
import sys
from typing import Dict, Optional
import magic
from PIL import Image
def output(error: bool, text: str, percentage: float) -> None:
    """Print one progress/status message as a single JSON line on stdout.

    The emitted object has the keys ``error``, ``text`` and ``percentage``;
    the percentage is truncated to an integer.

    Args:
        error: True when the message reports a failure.
        text: Human-readable description of the current step or failure.
        percentage: Overall progress so far, on a 0-100 scale.
    """
    # NOTE: this is a module-level function, so it must not take ``self``;
    # every caller in this file passes exactly (error, text, percentage).
    message = {"error": error, "text": text, "percentage": int(percentage)}
    print(json.dumps(message))
    # Flush right away so each message is written out immediately rather
    # than sitting in the stdout buffer.
    sys.stdout.flush()
def document_to_pixels() -> int:
    """Convert ``/tmp/input_file`` into raw RGB pixel data, one file per page.

    Steps (progress is reported through ``output``):
      1. Detect the MIME type and, unless the input is already a PDF, convert
         it to PDF with LibreOffice or GraphicsMagick.
      2. Split the PDF into single-page PDFs with ``pdftk``.
      3. Render every page to PNG with ``pdftocairo``, record its width and
         height, and convert it to 8-bit RGB data with GraphicsMagick.
      4. Move the ``.rgb``, ``.width`` and ``.height`` files to ``/dangerzone``.

    Returns:
        0 on success, 1 if any step fails or times out (an error message is
        emitted through ``output`` before returning).
    """
    percentage: float = 0.0

    conversions: Dict[str, Dict[str, Optional[str]]] = {
        # .pdf
        "application/pdf": {"type": None},
        # .docx
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .doc
        "application/msword": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .docm
        "application/vnd.ms-word.document.macroEnabled.12": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .xlsx
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .xls
        "application/vnd.ms-excel": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .pptx
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .ppt
        "application/vnd.ms-powerpoint": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .odt
        "application/vnd.oasis.opendocument.text": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .odg
        "application/vnd.oasis.opendocument.graphics": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .odp
        "application/vnd.oasis.opendocument.presentation": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .ods
        "application/vnd.oasis.opendocument.spreadsheet": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .jpg
        "image/jpeg": {"type": "convert"},
        # .gif
        "image/gif": {"type": "convert"},
        # .png
        "image/png": {"type": "convert"},
        # .tif
        "image/tiff": {"type": "convert"},
        "image/x-tiff": {"type": "convert"},
    }

    # Detect MIME type
    mime = magic.Magic(mime=True)
    mime_type = mime.from_file("/tmp/input_file")

    # Validate MIME type
    if mime_type not in conversions:
        output(True, "The document format is not supported", percentage)
        return 1

    # Convert input document to PDF
    conversion = conversions[mime_type]
    if conversion["type"] is None:
        # Already a PDF: use the input file as-is.
        pdf_filename = "/tmp/input_file"
    elif conversion["type"] == "libreoffice":
        output(False, "Converting to PDF using LibreOffice", percentage)
        args = [
            "libreoffice",
            "--headless",
            "--convert-to",
            f"pdf:{conversion['libreoffice_output_filter']}",
            "--outdir",
            "/tmp",
            "/tmp/input_file",
        ]
        try:
            p = subprocess.run(
                args,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=60,
            )
        except subprocess.TimeoutExpired:
            output(
                True,
                "Error converting document to PDF, LibreOffice timed out after 60 seconds",
                percentage,
            )
            return 1

        if p.returncode != 0:
            output(True, "Conversion to PDF with LibreOffice failed", percentage)
            return 1

        pdf_filename = "/tmp/input_file.pdf"
    elif conversion["type"] == "convert":
        output(False, "Converting to PDF using GraphicsMagick", percentage)
        args = [
            "gm",
            "convert",
            "/tmp/input_file",
            "/tmp/input_file.pdf",
        ]
        try:
            p = subprocess.run(
                args,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=60,
            )
        except subprocess.TimeoutExpired:
            output(
                True,
                "Error converting document to PDF, GraphicsMagick timed out after 60 seconds",
                percentage,
            )
            return 1

        if p.returncode != 0:
            output(True, "Conversion to PDF with GraphicsMagick failed", percentage)
            return 1

        pdf_filename = "/tmp/input_file.pdf"
    else:
        output(True, "Invalid conversion type", percentage)
        return 1

    percentage += 3

    # Separate PDF into pages
    output(False, "Separating document into pages", percentage)
    args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"]
    try:
        p = subprocess.run(
            args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=60
        )
    except subprocess.TimeoutExpired:
        # The tool being run here is pdftk, so name it in the message.
        output(
            True,
            "Error separating document into pages, pdftk timed out after 60 seconds",
            percentage,
        )
        return 1

    if p.returncode != 0:
        output(True, "Separating document into pages failed", percentage)
        return 1

    page_filenames = glob.glob("/tmp/page-*.pdf")
    num_pages = len(page_filenames)
    if num_pages == 0:
        # Without this guard the per-page percentage below would divide by
        # zero and the script would die with a traceback instead of a
        # reported error.
        output(True, "Separating document into pages failed", percentage)
        return 1

    percentage += 2

    # Convert to RGB pixel data
    percentage_per_page = 45.0 / num_pages
    for page in range(1, num_pages + 1):
        filename_base = f"/tmp/page-{page}"
        pdf_filename = f"{filename_base}.pdf"
        png_filename = f"{filename_base}.png"
        rgb_filename = f"{filename_base}.rgb"
        width_filename = f"{filename_base}.width"
        height_filename = f"{filename_base}.height"

        output(
            False,
            f"Converting page {page}/{num_pages} to pixels",
            percentage,
        )

        # Convert to png
        try:
            p = subprocess.run(
                ["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=60,
            )
        except subprocess.TimeoutExpired:
            output(
                True,
                "Error converting from PDF to PNG, pdftocairo timed out after 60 seconds",
                percentage,
            )
            return 1

        if p.returncode != 0:
            output(True, "Conversion from PDF to PNG failed", percentage)
            return 1

        # Save the width and height; the context manager closes the image
        # file once its size has been read.
        with Image.open(png_filename) as im:
            width, height = im.size
        with open(width_filename, "w") as f:
            f.write(str(width))
        with open(height_filename, "w") as f:
            f.write(str(height))

        # Convert to RGB pixels. stdout/stderr are discarded like for every
        # other tool here, so that nothing but the JSON lines printed by
        # output() ends up on this script's stdout.
        try:
            p = subprocess.run(
                [
                    "gm",
                    "convert",
                    png_filename,
                    "-depth",
                    "8",
                    f"rgb:{rgb_filename}",
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=60,
            )
        except subprocess.TimeoutExpired:
            output(
                True,
                "Error converting from PNG to pixels, convert timed out after 60 seconds",
                percentage,
            )
            return 1

        if p.returncode != 0:
            output(True, "Conversion from PNG to RGB failed", percentage)
            return 1

        # Delete the png
        os.remove(png_filename)

        percentage += percentage_per_page

    output(False, "Converted document to pixels", percentage)

    # Move converted files into /dangerzone
    for filename in (
        glob.glob("/tmp/page-*.rgb")
        + glob.glob("/tmp/page-*.width")
        + glob.glob("/tmp/page-*.height")
    ):
        shutil.move(filename, "/dangerzone")

    return 0
def pixels_to_pdf() -> int:
    """Rebuild a PDF from the per-page RGB pixel files in ``/dangerzone``.

    For every ``/dangerzone/page-N.rgb`` (with matching ``.width`` and
    ``.height`` files) a single-page PDF is written to ``/tmp``: through
    GraphicsMagick + tesseract when the ``OCR`` environment variable is
    ``"1"`` (the language comes from ``OCR_LANGUAGE``), or through
    GraphicsMagick alone otherwise. The pages are then merged with
    ``pdfunite``, compressed with ``ps2pdf``, and both the merged and the
    compressed PDF are moved to ``/safezone``.

    Returns:
        0 on success, 1 if any step fails or times out (an error message is
        emitted through ``output`` before returning).
    """
    percentage: float = 50.0

    num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
    if num_pages == 0:
        # Without this guard the per-page percentage below would divide by
        # zero and the script would die with a traceback instead of a
        # reported error.
        output(True, "No pages of pixel data were found to convert", percentage)
        return 1

    # These settings do not change between pages, so read them once.
    ocr_enabled = os.environ.get("OCR") == "1"
    ocr_language = os.environ.get("OCR_LANGUAGE")
    if ocr_enabled and not ocr_language:
        # A missing language would put None into the tesseract argument
        # list, which subprocess.run rejects with a TypeError.
        output(True, "OCR was requested but no OCR language was given", percentage)
        return 1

    # Convert RGB files to PDF files
    percentage_per_page = 45.0 / num_pages
    for page in range(1, num_pages + 1):
        filename_base = f"/dangerzone/page-{page}"
        rgb_filename = f"{filename_base}.rgb"
        width_filename = f"{filename_base}.width"
        height_filename = f"{filename_base}.height"
        png_filename = f"/tmp/page-{page}.png"
        ocr_filename = f"/tmp/page-{page}"
        pdf_filename = f"/tmp/page-{page}.pdf"

        with open(width_filename) as f:
            width = f.read().strip()
        with open(height_filename) as f:
            height = f.read().strip()

        if ocr_enabled:
            # OCR the document
            output(
                False,
                f"Converting page {page}/{num_pages} from pixels to searchable PDF",
                percentage,
            )

            args = [
                "gm",
                "convert",
                "-size",
                f"{width}x{height}",
                "-depth",
                "8",
                f"rgb:{rgb_filename}",
                f"png:{png_filename}",
            ]
            try:
                p = subprocess.run(
                    args,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=60,
                )
            except subprocess.TimeoutExpired:
                output(
                    True,
                    "Error converting pixels to PNG, convert timed out after 60 seconds",
                    percentage,
                )
                return 1

            if p.returncode != 0:
                output(
                    True,
                    f"Page {page}/{num_pages} conversion to PNG failed",
                    percentage,
                )
                return 1

            # ocr_filename is passed without an extension, as the output
            # base name for tesseract.
            args = [
                "tesseract",
                png_filename,
                ocr_filename,
                "-l",
                str(ocr_language),
                "--dpi",
                "70",
                "pdf",
            ]
            try:
                p = subprocess.run(
                    args,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=60,
                )
            except subprocess.TimeoutExpired:
                output(
                    True,
                    "Error converting PNG to searchable PDF, tesseract timed out after 60 seconds",
                    percentage,
                )
                return 1

            if p.returncode != 0:
                output(True, f"Page {page}/{num_pages} OCR failed", percentage)
                return 1

        else:
            # Don't OCR
            output(
                False,
                f"Converting page {page}/{num_pages} from pixels to PDF",
                percentage,
            )

            args = [
                "gm",
                "convert",
                "-size",
                f"{width}x{height}",
                "-depth",
                "8",
                f"rgb:{rgb_filename}",
                f"pdf:{pdf_filename}",
            ]
            try:
                p = subprocess.run(
                    args,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=60,
                )
            except subprocess.TimeoutExpired:
                output(
                    True,
                    "Error converting RGB to PDF, convert timed out after 60 seconds",
                    percentage,
                )
                return 1

            if p.returncode != 0:
                output(
                    True,
                    f"Page {page}/{num_pages} conversion to PDF failed",
                    percentage,
                )
                return 1

        percentage += percentage_per_page

    # Merge pages into a single PDF
    output(
        False,
        f"Merging {num_pages} pages into a single PDF",
        percentage,
    )
    args = ["pdfunite"]
    args.extend(f"/tmp/page-{page}.pdf" for page in range(1, num_pages + 1))
    args.append("/tmp/safe-output.pdf")
    try:
        p = subprocess.run(
            args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=60
        )
    except subprocess.TimeoutExpired:
        output(
            True,
            "Error merging pages into a single PDF, pdfunite timed out after 60 seconds",
            percentage,
        )
        return 1

    if p.returncode != 0:
        output(True, "Merging pages into a single PDF failed", percentage)
        return 1

    percentage += 2

    # Compress
    output(False, "Compressing PDF", percentage)

    # The compression time budget scales with the document: 3 seconds per page.
    compress_timeout = num_pages * 3
    try:
        p = subprocess.run(
            ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=compress_timeout,
        )
    except subprocess.TimeoutExpired:
        output(
            True,
            f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds",
            percentage,
        )
        return 1

    if p.returncode != 0:
        output(True, "Compressing PDF failed", percentage)
        return 1

    percentage = 100.0
    output(False, "Safe PDF created", percentage)

    # Move converted files into /safezone
    shutil.move("/tmp/safe-output.pdf", "/safezone")
    shutil.move("/tmp/safe-output-compressed.pdf", "/safezone")

    return 0
def main() -> int:
    """Dispatch to the conversion step named by the single CLI argument.

    Returns:
        The status returned by the selected step, or -1 when the argument
        count is wrong (a usage line is printed) or the step name is unknown.
    """
    step_args = sys.argv[1:]
    if len(step_args) != 1:
        print(f"Usage: {sys.argv[0]} [document-to-pixels]|[pixels-to-pdf]")
        return -1

    (step,) = step_args
    if step == "document-to-pixels":
        status = document_to_pixels()
    elif step == "pixels-to-pdf":
        status = pixels_to_pdf()
    else:
        # Unknown step name: fail without printing anything.
        status = -1
    return status
# When executed as a script, run main() and use its return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)