Refactor dangerzone-converter to have both containers share code

This commit is contained in:
Micah Lee 2021-07-30 15:00:16 -07:00
parent ee5acf64b2
commit e172274873
No known key found for this signature in database
GPG key ID: 403C2657CD994F73
6 changed files with 377 additions and 360 deletions

View file

@ -8,6 +8,7 @@ RUN apk -U upgrade && \
libreoffice \
openjdk8 \
poppler-utils \
python3 \
py3-magic \
py3-pillow \
sudo \

View file

@ -0,0 +1,374 @@
#!/usr/bin/env python3
import sys
import subprocess
import glob
import os
import magic
from PIL import Image
class DangerzoneConverter:
def __init__(self):
pass
def document_to_pixels(self):
conversions = {
# .pdf
"application/pdf": {"type": None},
# .docx
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
"type": "libreoffice",
"libreoffice_output_filter": "writer_pdf_Export",
},
# .doc
"application/msword": {
"type": "libreoffice",
"libreoffice_output_filter": "writer_pdf_Export",
},
# .docm
"application/vnd.ms-word.document.macroEnabled.12": {
"type": "libreoffice",
"libreoffice_output_filter": "writer_pdf_Export",
},
# .xlsx
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
"type": "libreoffice",
"libreoffice_output_filter": "calc_pdf_Export",
},
# .xls
"application/vnd.ms-excel": {
"type": "libreoffice",
"libreoffice_output_filter": "calc_pdf_Export",
},
# .pptx
"application/vnd.openxmlformats-officedocument.presentationml.presentation": {
"type": "libreoffice",
"libreoffice_output_filter": "impress_pdf_Export",
},
# .ppt
"application/vnd.ms-powerpoint": {
"type": "libreoffice",
"libreoffice_output_filter": "impress_pdf_Export",
},
# .odt
"application/vnd.oasis.opendocument.text": {
"type": "libreoffice",
"libreoffice_output_filter": "writer_pdf_Export",
},
# .odg
"application/vnd.oasis.opendocument.graphics": {
"type": "libreoffice",
"libreoffice_output_filter": "impress_pdf_Export",
},
# .odp
"application/vnd.oasis.opendocument.presentation": {
"type": "libreoffice",
"libreoffice_output_filter": "impress_pdf_Export",
},
# .ops
"application/vnd.oasis.opendocument.spreadsheet": {
"type": "libreoffice",
"libreoffice_output_filter": "calc_pdf_Export",
},
# .jpg
"image/jpeg": {"type": "convert"},
# .gif
"image/gif": {"type": "convert"},
# .png
"image/png": {"type": "convert"},
# .tif
"image/tiff": {"type": "convert"},
"image/x-tiff": {"type": "convert"},
}
# Detect MIME type
mime = magic.Magic(mime=True)
mime_type = mime.from_file("/tmp/input_file")
# Validate MIME type
if mime_type not in conversions:
self._print("The document format is not supported")
return 1
# Convert input document to PDF
conversion = conversions[mime_type]
if conversion["type"] is None:
pdf_filename = "/tmp/input_file"
elif conversion["type"] == "libreoffice":
self._print(f"Converting to PDF using LibreOffice")
args = [
"libreoffice",
"--headless",
"--convert-to",
f"pdf:{conversion['libreoffice_output_filter']}",
"--outdir",
"/tmp",
"/tmp/input_file",
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
self._print(
"Error converting document to PDF, LibreOffice timed out after 60 seconds"
)
return 1
if p.returncode != 0:
self._print(f"Conversion to PDF failed: {p.stdout}")
return 1
pdf_filename = "/tmp/input_file.pdf"
elif conversion["type"] == "convert":
self._print(f"Converting to PDF using GraphicsMagick")
args = [
"gm",
"convert",
"/tmp/input_file",
"/tmp/input_file.pdf",
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
self._print(
"Error converting document to PDF, GraphicsMagick timed out after 60 seconds"
)
return 1
if p.returncode != 0:
self._print(f"Conversion to PDF failed: {p.stdout}")
return 1
pdf_filename = "/tmp/input_file.pdf"
else:
self._print("Invalid conversion type")
return 1
# Separate PDF into pages
self._print("")
self._print(f"Separating document into pages")
args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
self._print(
"Error separating document into pages, pdfseparate timed out after 60 seconds"
)
return 1
if p.returncode != 0:
self._print(f"Separating document into pages failed: {p.stdout}")
return 1
page_filenames = glob.glob("/tmp/page-*.pdf")
self._print(f"Document has {len(page_filenames)} pages")
self._print("")
# Convert to RGB pixel data
for page in range(1, len(page_filenames) + 1):
pdf_filename = f"/tmp/page-{page}.pdf"
png_filename = f"/tmp/page-{page}.png"
rgb_filename = f"/tmp/page-{page}.rgb"
width_filename = f"/tmp/page-{page}.width"
height_filename = f"/tmp/page-{page}.height"
filename_base = f"/tmp/page-{page}"
self._print(f"Converting page {page} to pixels")
# Convert to png
try:
p = subprocess.run(
["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
timeout=60,
)
except subprocess.TimeoutExpired:
self._print(
"Error converting from PDF to PNG, pdftocairo timed out after 60 seconds"
)
return 1
if p.returncode != 0:
self._print(f"Conversion from PDF to PNG failed: {p.stdout}")
return 1
# Save the width and height
im = Image.open(png_filename)
width, height = im.size
with open(width_filename, "w") as f:
f.write(str(width))
with open(height_filename, "w") as f:
f.write(str(height))
# Convert to RGB pixels
try:
p = subprocess.run(
[
"gm",
"convert",
png_filename,
"-depth",
"8",
f"rgb:{rgb_filename}",
],
timeout=60,
)
except subprocess.TimeoutExpired:
self._print(
"Error converting from PNG to pixels, convert timed out after 60 seconds"
)
return 1
if p.returncode != 0:
self._print(f"Conversion from PNG to RGB failed: {p.stdout}")
return 1
# Delete the png
os.remove(png_filename)
return 0
def pixels_to_pdf(self):
num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
self._print(f"Document has {num_pages} pages")
# Convert RGB files to PDF files
for page in range(1, num_pages + 1):
filename_base = f"/dangerzone/page-{page}"
rgb_filename = f"{filename_base}.rgb"
width_filename = f"{filename_base}.width"
height_filename = f"{filename_base}.height"
png_filename = f"/tmp/page-{page}.png"
ocr_filename = f"/tmp/page-{page}"
pdf_filename = f"/tmp/page-{page}.pdf"
with open(width_filename) as f:
width = f.read().strip()
with open(height_filename) as f:
height = f.read().strip()
if os.environ.get("OCR") == "1":
# OCR the document
self._print(f"Converting page {page} from pixels to searchable PDF")
args = [
"gm",
"convert",
"-size",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"png:{png_filename}",
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
self._print(
"Error converting pixels to PNG, convert timed out after 60 seconds"
)
return 1
if p.returncode != 0:
self._print(f"Page {page} conversion failed: {p.stdout}")
return 1
args = [
"tesseract",
png_filename,
ocr_filename,
"-l",
os.environ.get("OCR_LANGUAGE"),
"--dpi",
"70",
"pdf",
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
self._print(
"Error converting PNG to searchable PDF, tesseract timed out after 60 seconds"
)
return 1
if p.returncode != 0:
self._print(f"Page {page} conversion failed: {p.stdout}")
return 1
else:
# Don't OCR
self._print(f"Converting page {page} from pixels to PDF")
args = [
"gm",
"convert",
"-size",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"pdf:{pdf_filename}",
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
self._print(
"Error converting RGB to PDF, convert timed out after 60 seconds"
)
return 1
if p.returncode != 0:
self._print(f"Page {page} conversion failed: {p.stdout}")
return 1
self._print()
# Merge pages into a single PDF
self._print(f"Merging {num_pages} pages into a single PDF")
args = ["pdfunite"]
for page in range(1, num_pages + 1):
args.append(f"/tmp/page-{page}.pdf")
args.append(f"/tmp/safe-output.pdf")
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
self._print(
"Error merging pages into a single PDF, pdfunite timed out after 60 seconds"
)
return 1
if p.returncode != 0:
self._print(f"Merge failed: {p.stdout}")
return 1
# Compress
self._print("Compressing PDF")
compress_timeout = num_pages * 3
try:
p = subprocess.run(
["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
timeout=compress_timeout,
)
except subprocess.TimeoutExpired:
self._print(
f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds"
)
return 1
if p.returncode != 0:
self._print(f"Compression failed: {p.stdout}")
return 1
return 0
def _print(self, s=""):
print(s)
sys.stdout.flush()
def main():
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} [document-to-pixels]|[pixels-to-pdf]")
return -1
converter = DangerzoneConverter()
if sys.argv[1] == "document-to-pixels":
return converter.document_to_pixels()
if sys.argv[1] == "pixels-to-pdf":
return converter.pixels_to_pdf()
return -1
if __name__ == "__main__":
sys.exit(main())

View file

@ -10,7 +10,7 @@ START_PERMISSIONS=$(stat /tmp/input_file | grep Access | grep Uid | cut -d"(" -f
# Do the conversion without root
# /usr/bin/sudo -u user /usr/local/bin/document-to-pixels-unpriv
/usr/local/bin/document-to-pixels-unpriv
/usr/bin/python3 /usr/local/bin/dangerzone.py document-to-pixels
RETURN_CODE=$?
# Restore original permissions

View file

@ -1,219 +0,0 @@
#!/usr/bin/env python3
import sys
import subprocess
import glob
import shutil
import os
import magic
from PIL import Image
def print_flush(s):
print(s)
sys.stdout.flush()
def main():
conversions = {
# .pdf
"application/pdf": {"type": None},
# .docx
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
"type": "libreoffice",
"libreoffice_output_filter": "writer_pdf_Export",
},
# .doc
"application/msword": {
"type": "libreoffice",
"libreoffice_output_filter": "writer_pdf_Export",
},
# .docm
"application/vnd.ms-word.document.macroEnabled.12": {
"type": "libreoffice",
"libreoffice_output_filter": "writer_pdf_Export",
},
# .xlsx
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
"type": "libreoffice",
"libreoffice_output_filter": "calc_pdf_Export",
},
# .xls
"application/vnd.ms-excel": {
"type": "libreoffice",
"libreoffice_output_filter": "calc_pdf_Export",
},
# .pptx
"application/vnd.openxmlformats-officedocument.presentationml.presentation": {
"type": "libreoffice",
"libreoffice_output_filter": "impress_pdf_Export",
},
# .ppt
"application/vnd.ms-powerpoint": {
"type": "libreoffice",
"libreoffice_output_filter": "impress_pdf_Export",
},
# .odt
"application/vnd.oasis.opendocument.text": {
"type": "libreoffice",
"libreoffice_output_filter": "writer_pdf_Export",
},
# .odg
"application/vnd.oasis.opendocument.graphics": {
"type": "libreoffice",
"libreoffice_output_filter": "impress_pdf_Export",
},
# .odp
"application/vnd.oasis.opendocument.presentation": {
"type": "libreoffice",
"libreoffice_output_filter": "impress_pdf_Export",
},
# .ops
"application/vnd.oasis.opendocument.spreadsheet": {
"type": "libreoffice",
"libreoffice_output_filter": "calc_pdf_Export",
},
# .jpg
"image/jpeg": {"type": "convert"},
# .gif
"image/gif": {"type": "convert"},
# .png
"image/png": {"type": "convert"},
# .tif
"image/tiff": {"type": "convert"},
"image/x-tiff": {"type": "convert"},
}
# Detect MIME type
mime = magic.Magic(mime=True)
mime_type = mime.from_file("/tmp/input_file")
# Validate MIME type
if mime_type not in conversions:
print_flush("The document format is not supported")
sys.exit(1)
# Convert input document to PDF
conversion = conversions[mime_type]
if conversion["type"] is None:
pdf_filename = "/tmp/input_file"
elif conversion["type"] == "libreoffice":
print_flush(f"Converting to PDF using LibreOffice")
args = [
"libreoffice",
"--headless",
"--convert-to",
f"pdf:{conversion['libreoffice_output_filter']}",
"--outdir",
"/tmp",
"/tmp/input_file",
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
print_flush(
"Error converting document to PDF, LibreOffice timed out after 60 seconds"
)
sys.exit(1)
if p.returncode != 0:
print_flush(f"Conversion to PDF failed: {p.stdout}")
sys.exit(1)
pdf_filename = "/tmp/input_file.pdf"
elif conversion["type"] == "convert":
print_flush(f"Converting to PDF using GraphicsMagick")
args = [
"gm",
"convert",
"/tmp/input_file",
"/tmp/input_file.pdf",
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
print_flush(
"Error converting document to PDF, GraphicsMagick timed out after 60 seconds"
)
sys.exit(1)
if p.returncode != 0:
print_flush(f"Conversion to PDF failed: {p.stdout}")
sys.exit(1)
pdf_filename = "/tmp/input_file.pdf"
else:
print_flush("Invalid conversion type")
sys.exit(1)
# Separate PDF into pages
print_flush("")
print_flush(f"Separating document into pages")
args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
print_flush(
"Error separating document into pages, pdfseparate timed out after 60 seconds"
)
sys.exit(1)
if p.returncode != 0:
print_flush(f"Separating document into pages failed: {p.stdout}")
sys.exit(1)
page_filenames = glob.glob("/tmp/page-*.pdf")
print_flush(f"Document has {len(page_filenames)} pages")
print_flush("")
# Convert to RGB pixel data
for page in range(1, len(page_filenames) + 1):
pdf_filename = f"/tmp/page-{page}.pdf"
png_filename = f"/tmp/page-{page}.png"
rgb_filename = f"/tmp/page-{page}.rgb"
width_filename = f"/tmp/page-{page}.width"
height_filename = f"/tmp/page-{page}.height"
filename_base = f"/tmp/page-{page}"
print_flush(f"Converting page {page} to pixels")
# Convert to png
try:
p = subprocess.run(
["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
timeout=60,
)
except subprocess.TimeoutExpired:
print_flush(
"Error converting from PDF to PNG, pdftocairo timed out after 60 seconds"
)
sys.exit(1)
if p.returncode != 0:
print_flush(f"Conversion from PDF to PNG failed: {p.stdout}")
sys.exit(1)
# Save the width and height
im = Image.open(png_filename)
width, height = im.size
with open(width_filename, "w") as f:
f.write(str(width))
with open(height_filename, "w") as f:
f.write(str(height))
# Convert to RGB pixels
try:
p = subprocess.run(
["gm", "convert", png_filename, "-depth", "8", f"rgb:{rgb_filename}"],
timeout=60,
)
except subprocess.TimeoutExpired:
print_flush(
"Error converting from PNG to pixels, convert timed out after 60 seconds"
)
sys.exit(1)
if p.returncode != 0:
print_flush(f"Conversion from PNG to RGB failed: {p.stdout}")
sys.exit(1)
# Delete the png
os.remove(png_filename)
if __name__ == "__main__":
main()

View file

@ -6,7 +6,7 @@ echo 127.0.0.1 $(hostname) >> /etc/hosts
# Do the conversion without root
# /usr/bin/sudo OCR=$OCR OCR_LANGUAGE=$OCR_LANGUAGE -u user /usr/local/bin/pixels-to-pdf-unpriv
OCR=$OCR OCR_LANGUAGE=$OCR_LANGUAGE /usr/local/bin/pixels-to-pdf-unpriv
OCR=$OCR OCR_LANGUAGE=$OCR_LANGUAGE /usr/bin/python3 /usr/local/bin/dangerzone.py pixels-to-pdf
RETURN_CODE=$?
if [ $RETURN_CODE -ne 0 ]; then
echo ""

View file

@ -1,139 +0,0 @@
#!/usr/bin/env python3
import glob
import os
import sys
import subprocess
def print_flush(s=""):
print(s)
sys.stdout.flush()
def main():
num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
print_flush(f"Document has {num_pages} pages")
# Convert RGB files to PDF files
for page in range(1, num_pages + 1):
filename_base = f"/dangerzone/page-{page}"
rgb_filename = f"{filename_base}.rgb"
width_filename = f"{filename_base}.width"
height_filename = f"{filename_base}.height"
png_filename = f"/tmp/page-{page}.png"
ocr_filename = f"/tmp/page-{page}"
pdf_filename = f"/tmp/page-{page}.pdf"
with open(width_filename) as f:
width = f.read().strip()
with open(height_filename) as f:
height = f.read().strip()
if os.environ.get("OCR") == "1":
# OCR the document
print_flush(f"Converting page {page} from pixels to searchable PDF")
args = [
"gm",
"convert",
"-size",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"png:{png_filename}",
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
print_flush(
"Error converting pixels to PNG, convert timed out after 60 seconds"
)
sys.exit(1)
if p.returncode != 0:
print_flush(f"Page {page} conversion failed: {p.stdout}")
sys.exit(1)
args = [
"tesseract",
png_filename,
ocr_filename,
"-l",
os.environ.get("OCR_LANGUAGE"),
"--dpi",
"70",
"pdf"
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
print_flush(
"Error converting PNG to searchable PDF, tesseract timed out after 60 seconds"
)
sys.exit(1)
if p.returncode != 0:
print_flush(f"Page {page} conversion failed: {p.stdout}")
sys.exit(1)
else:
# Don't OCR
print_flush(f"Converting page {page} from pixels to PDF")
args = [
"gm",
"convert",
"-size",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"pdf:{pdf_filename}",
]
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
print_flush(
"Error converting RGB to PDF, convert timed out after 60 seconds"
)
sys.exit(1)
if p.returncode != 0:
print_flush(f"Page {page} conversion failed: {p.stdout}")
sys.exit(1)
print_flush()
# Merge pages into a single PDF
print_flush(f"Merging {num_pages} pages into a single PDF")
args = ["pdfunite"]
for page in range(1, num_pages + 1):
args.append(f"/tmp/page-{page}.pdf")
args.append(f"/tmp/safe-output.pdf")
try:
p = subprocess.run(args, timeout=60)
except subprocess.TimeoutExpired:
print_flush(
"Error merging pages into a single PDF, pdfunite timed out after 60 seconds"
)
sys.exit(1)
if p.returncode != 0:
print_flush(f"Merge failed: {p.stdout}")
sys.exit(1)
# Compress
print_flush("Compressing PDF")
compress_timeout = num_pages * 3
try:
p = subprocess.run(
["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
timeout=compress_timeout,
)
except subprocess.TimeoutExpired:
print_flush(f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds")
sys.exit(1)
if p.returncode != 0:
print_flush(f"Compression failed: {p.stdout}")
sys.exit(1)
if __name__ == "__main__":
main()