dangerzone/dangerzone/conversion/doc_to_pixels.py
Moon Sungjoon a453c890a0
Fix dynamic loading of LibreOffice extensions
HWPX MIME type is recognized as 'application/zip' with current version of file command (file-5.44).
It will be recognized as 'application/hwp+zip' when new version of file is released.

For a temporary fix, when MIME type of file is 'application/zip',
check the file type again (without the MIME option).
And then check if it's 'Zip data (MIME type "application/hwp+zip"?)' or not.
2023-08-01 14:28:36 +01:00

339 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Here are the steps, with progress bar percentages:
- 0%-3%: Convert document into a PDF (skipped if the input file is a PDF)
- 3%-5%: Split PDF into individual pages, and count those pages
- 5%-50%: Convert each page into pixels (each page takes 45/n%, where n is the number of pages)
"""
import asyncio
import glob
import os
import re
import shutil
import sys
from typing import Dict, Optional
import magic
from .common import DangerzoneConverter, run_command
class DocumentToPixels(DangerzoneConverter):
async def convert(self) -> None:
conversions: Dict[str, Dict[str, Optional[str]]] = {
# .pdf
"application/pdf": {"type": None},
# .docx
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
"type": "libreoffice",
},
# .doc
"application/msword": {
"type": "libreoffice",
},
# .docm
"application/vnd.ms-word.document.macroEnabled.12": {
"type": "libreoffice",
},
# .xlsx
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
"type": "libreoffice",
},
# .xls
"application/vnd.ms-excel": {
"type": "libreoffice",
},
# .pptx
"application/vnd.openxmlformats-officedocument.presentationml.presentation": {
"type": "libreoffice",
},
# .ppt
"application/vnd.ms-powerpoint": {
"type": "libreoffice",
},
# .odt
"application/vnd.oasis.opendocument.text": {
"type": "libreoffice",
},
# .odg
"application/vnd.oasis.opendocument.graphics": {
"type": "libreoffice",
},
# .odp
"application/vnd.oasis.opendocument.presentation": {
"type": "libreoffice",
},
# .ods
"application/vnd.oasis.opendocument.spreadsheet": {
"type": "libreoffice",
},
# .ods / .ots
"application/vnd.oasis.opendocument.spreadsheet-template": {
"type": "libreoffice",
},
# .odt / .ott
"application/vnd.oasis.opendocument.text-template": {
"type": "libreoffice",
},
# .hwp
"application/haansofthwp": {
"type": "libreoffice",
"libreoffice_ext": "h2orestart.oxt",
},
"application/x-hwp": {
"type": "libreoffice",
"libreoffice_ext": "h2orestart.oxt",
},
# .hwpx
"application/hwp+zip": {
"type": "libreoffice",
"libreoffice_ext": "h2orestart.oxt",
},
# At least .odt, .docx, .odg, .odp, .ods, and .pptx
"application/zip": {
"type": "libreoffice",
},
# At least .doc, .docx, .odg, .odp, .odt, .pdf, .ppt, .pptx, .xls, and .xlsx
"application/octet-stream": {
"type": "libreoffice",
},
# At least .doc, .ppt, and .xls
"application/x-ole-storage": {
"type": "libreoffice",
},
# .jpg
"image/jpeg": {"type": "convert"},
# .gif
"image/gif": {"type": "convert"},
# .png
"image/png": {"type": "convert"},
# .tif
"image/tiff": {"type": "convert"},
"image/x-tiff": {"type": "convert"},
}
# Detect MIME type
try:
mime = magic.Magic(mime=True)
mime_type = mime.from_file("/tmp/input_file")
except TypeError:
mime_type = magic.detect_from_filename("/tmp/input_file").mime_type
# Validate MIME type
if mime_type not in conversions:
raise ValueError("The document format is not supported")
# Temporary fix for the HWPX format
if mime_type == "application/zip":
file_type = magic.from_file("/tmp/input_file")
hwpx_file_type = 'Zip data (MIME type "application/hwp+zip"?)'
if file_type == hwpx_file_type:
mime_type = "application/hwp+zip"
# Get file size (in MiB)
size = os.path.getsize("/tmp/input_file") / 1024**2
# Calculate timeout for the first few file operations. The difference with the
# subsequent ones is that we don't know the number of pages, before we have a
# PDF at hand, so we rely on size heuristics.
timeout = self.calculate_timeout(size)
# Convert input document to PDF
conversion = conversions[mime_type]
if conversion["type"] is None:
pdf_filename = "/tmp/input_file"
elif conversion["type"] == "libreoffice":
libreoffice_ext = conversion.get("libreoffice_ext", None)
if libreoffice_ext:
await self.install_libreoffice_ext(libreoffice_ext)
self.update_progress("Converting to PDF using LibreOffice")
args = [
"libreoffice",
"--headless",
"--safe-mode",
"--convert-to",
"pdf",
"--outdir",
"/tmp",
"/tmp/input_file",
]
await run_command(
args,
error_message="Conversion to PDF with LibreOffice failed",
timeout_message=(
"Error converting document to PDF, LibreOffice timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)
pdf_filename = "/tmp/input_file.pdf"
elif conversion["type"] == "convert":
self.update_progress("Converting to PDF using GraphicsMagick")
args = [
"gm",
"convert",
"/tmp/input_file",
"/tmp/input_file.pdf",
]
await run_command(
args,
error_message="Conversion to PDF with GraphicsMagick failed",
timeout_message=(
"Error converting document to PDF, GraphicsMagick timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)
pdf_filename = "/tmp/input_file.pdf"
else:
raise ValueError(
f"Invalid conversion type {conversion['type']} for MIME type {mime_type}"
)
self.percentage += 3
# Obtain number of pages
self.update_progress("Calculating number of pages")
stdout, _ = await run_command(
["pdfinfo", pdf_filename],
error_message="PDF file is corrupted",
timeout_message=(
f"Extracting metadata from PDF timed out after {timeout} second"
),
timeout=timeout,
)
search = re.search(r"Pages:\s*(\d+)\s*\n", stdout.decode())
if search is not None:
num_pages: int = int(search.group(1))
else:
raise ValueError("Number of pages could not be extracted from PDF")
# Get a more precise timeout, based on the number of pages
timeout = self.calculate_timeout(size, num_pages)
def pdftoppm_progress_callback(line: bytes) -> None:
"""Function called for every line the 'pdftoppm' command outputs
Sample pdftoppm output:
$ pdftoppm sample.pdf /tmp/safe -progress
1 4 /tmp/safe-1.ppm
2 4 /tmp/safe-2.ppm
3 4 /tmp/safe-3.ppm
4 4 /tmp/safe-4.ppm
Each successful line is in the format "{page} {page_num} {ppm_filename}"
"""
try:
(page_str, num_pages_str, _) = line.decode().split()
num_pages = int(num_pages_str)
page = int(page_str)
except ValueError as e:
# Ignore all non-progress related output, since pdftoppm sends
# everything to stderr and thus, errors can't be distinguished
# easily. We rely instead on the exit code.
return
percentage_per_page = 45.0 / num_pages
self.percentage += percentage_per_page
self.update_progress(f"Converting page {page}/{num_pages} to pixels")
zero_padding = "0" * (len(num_pages_str) - len(page_str))
ppm_filename = f"{page_base}-{zero_padding}{page}.ppm"
rgb_filename = f"{page_base}-{page}.rgb"
width_filename = f"{page_base}-{page}.width"
height_filename = f"{page_base}-{page}.height"
filename_base = f"{page_base}-{page}"
with open(ppm_filename, "rb") as f:
# NOTE: PPM files have multiple ways of writing headers.
# For our specific case we parse it expecting the header format that ppmtopdf produces
# More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html
# Read the header
header = f.readline().decode().strip()
if header != "P6":
raise ValueError("Invalid PPM header")
# Save the width and height
dims = f.readline().decode().strip()
width, height = dims.split()
with open(width_filename, "w") as width_file:
width_file.write(width)
with open(height_filename, "w") as height_file:
height_file.write(height)
maxval = int(f.readline().decode().strip())
# Check that the depth is 8
if maxval != 255:
raise ValueError("Invalid PPM depth")
data = f.read()
# Save pixel data
with open(rgb_filename, "wb") as f:
f.write(data)
# Delete the ppm file
os.remove(ppm_filename)
page_base = "/tmp/page"
await run_command(
[
"pdftoppm",
pdf_filename,
page_base,
"-progress",
],
error_message="Conversion from PDF to PPM failed",
timeout_message=(
f"Error converting from PDF to PPM, pdftoppm timed out after {timeout}"
" seconds"
),
stderr_callback=pdftoppm_progress_callback,
timeout=timeout,
)
self.update_progress("Converted document to pixels")
# Move converted files into /tmp/dangerzone
for filename in (
glob.glob("/tmp/page-*.rgb")
+ glob.glob("/tmp/page-*.width")
+ glob.glob("/tmp/page-*.height")
):
shutil.move(filename, "/tmp/dangerzone")
async def install_libreoffice_ext(self, libreoffice_ext: str) -> None:
self.update_progress(f"Installing LibreOffice extension '{libreoffice_ext}'")
unzip_args = [
"unzip",
"-d",
f"/usr/lib/libreoffice/share/extensions/{libreoffice_ext}/",
f"/libreoffice_ext/{libreoffice_ext}",
]
await run_command(
unzip_args,
error_message="LibreOffice extension installation failed (unzipping)",
timeout_message="unzipping LibreOffice extension timed out 5 seconds",
timeout=5,
)
async def main() -> int:
converter = DocumentToPixels()
try:
await converter.convert()
except (RuntimeError, TimeoutError, ValueError) as e:
converter.update_progress(str(e), error=True)
return 1
else:
return 0 # Success!
if __name__ == "__main__":
sys.exit(asyncio.run(main()))