#!/usr/bin/env python3
"""
Here are the steps, with progress bar percentages for each step:

document_to_pixels:
- 0%-3%: Convert document into a PDF (skipped if the input file is a PDF)
- 3%-5%: Split PDF into individual pages, and count those pages
- 5%-50%: Convert each page into pixels (each page takes 45/n%, where n is the number of pages)

pixels_to_pdf:
- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages)
- 95%-97%: Merge the per-page PDFs into a single PDF
- 97%-100%: Compress the final PDF
"""

import glob
import json
import os
import shutil
import subprocess
import sys
from typing import Dict, Optional

import magic
from PIL import Image

# Default timeout, in seconds, for any single subprocess. run_command() uses
# it whenever the caller does not pass an explicit `timeout`.
# FIXME https://github.com/freedomofpress/dangerzone/issues/146
# FIXME https://github.com/freedomofpress/dangerzone/issues/149
DEFAULT_TIMEOUT: float = 60


def run_command(
    args, *, error_message: str, timeout_message: str, timeout: float = DEFAULT_TIMEOUT
) -> subprocess.CompletedProcess:
    """
    Execute an external command silently and hand back its completed process.

    Both stdout and stderr of the child process are discarded.

    :param args: the command line, passed unchanged to ``subprocess.run``
    :param error_message: message of the RuntimeError raised on a non-zero exit
    :param timeout_message: message of the TimeoutError raised on a timeout
    :param timeout: number of seconds the command is allowed to run
    :returns: the ``CompletedProcess`` describing the successful run
    :raises RuntimeError: if the process returns a non-zero exit status
    :raises TimeoutError: if the process times out
    """
    silenced = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}
    try:
        completed = subprocess.run(args, timeout=timeout, check=True, **silenced)
    except subprocess.TimeoutExpired as expired:
        raise TimeoutError(timeout_message) from expired
    except subprocess.CalledProcessError as failed:
        raise RuntimeError(error_message) from failed
    return completed


def output(error: bool, text: str, percentage: float) -> None:
    """
    Print one progress update as a single line of JSON on stdout.

    The line is a JSON object with the keys ``error``, ``text`` and
    ``percentage``. stdout is flushed after the line is written.

    This is a module-level function, so it takes no ``self``: every call site
    in this file passes exactly (error, text, percentage).

    :param error: whether this update reports an error
    :param text: human-readable status message
    :param percentage: overall progress; truncated to an integer in the output
    """
    update = {"error": error, "text": text, "percentage": int(percentage)}
    print(json.dumps(update), flush=True)


def document_to_pixels() -> None:
    """
    Convert the document at /tmp/input_file into per-page RGB pixel data.

    Steps (progress is reported through output() along the way):

    1. Detect the MIME type of /tmp/input_file and, unless it is already a
       PDF, convert it to /tmp/input_file.pdf with LibreOffice or
       GraphicsMagick.
    2. Split the PDF into single-page PDFs with pdftk.
    3. Render each page to PNG, record its width and height, and convert the
       PNG into raw 8-bit RGB data.
    4. Move every page-N.rgb, page-N.width and page-N.height file from /tmp
       into /dangerzone.

    Exceptions raised by file handling (open, os.remove, shutil.move, ...)
    are not caught here and propagate to the caller.

    :raises ValueError: if the MIME type is not supported, or the conversion
        table holds an unknown conversion type
    :raises RuntimeError: if an external command fails, or if splitting the
        document yields no pages
    :raises TimeoutError: if an external command times out
    """
    percentage: float = 0.0

    # Maps a supported MIME type to the way it is turned into a PDF.
    # "type" is None when the input already is a PDF.
    conversions: Dict[str, Dict[str, Optional[str]]] = {
        # .pdf
        "application/pdf": {"type": None},
        # .docx
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .doc
        "application/msword": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .docm
        "application/vnd.ms-word.document.macroEnabled.12": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .xlsx
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .xls
        "application/vnd.ms-excel": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .pptx
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .ppt
        "application/vnd.ms-powerpoint": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .odt
        "application/vnd.oasis.opendocument.text": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .odg
        "application/vnd.oasis.opendocument.graphics": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .odp
        "application/vnd.oasis.opendocument.presentation": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .ods
        "application/vnd.oasis.opendocument.spreadsheet": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .jpg
        "image/jpeg": {"type": "convert"},
        # .gif
        "image/gif": {"type": "convert"},
        # .png
        "image/png": {"type": "convert"},
        # .tif
        "image/tiff": {"type": "convert"},
        "image/x-tiff": {"type": "convert"},
    }

    # Detect MIME type
    mime = magic.Magic(mime=True)
    mime_type = mime.from_file("/tmp/input_file")

    # Validate MIME type
    if mime_type not in conversions:
        raise ValueError(f"Document format {mime_type} is not supported")

    # Convert input document to PDF
    conversion = conversions[mime_type]
    if conversion["type"] is None:
        # Already a PDF: use the input file as is
        pdf_filename = "/tmp/input_file"
    elif conversion["type"] == "libreoffice":
        output(False, "Converting to PDF using LibreOffice", percentage)
        args = [
            "libreoffice",
            "--headless",
            "--convert-to",
            f"pdf:{conversion['libreoffice_output_filter']}",
            "--outdir",
            "/tmp",
            "/tmp/input_file",
        ]
        run_command(
            args,
            error_message="Conversion to PDF with LibreOffice failed",
            timeout_message=f"Error converting document to PDF, LibreOffice timed out after {DEFAULT_TIMEOUT} seconds",
        )
        pdf_filename = "/tmp/input_file.pdf"
    elif conversion["type"] == "convert":
        output(False, "Converting to PDF using GraphicsMagick", percentage)
        args = [
            "gm",
            "convert",
            "/tmp/input_file",
            "/tmp/input_file.pdf",
        ]
        run_command(
            args,
            error_message="Conversion to PDF with GraphicsMagick failed",
            timeout_message=f"Error converting document to PDF, GraphicsMagick timed out after {DEFAULT_TIMEOUT} seconds",
        )
        pdf_filename = "/tmp/input_file.pdf"
    else:
        raise ValueError(
            f"Invalid conversion type {conversion['type']} for MIME type {mime_type}"
        )
    percentage += 3

    # Separate PDF into pages
    output(
        False,
        "Separating document into pages",
        percentage,
    )
    args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"]
    run_command(
        args,
        error_message="Separating document into pages failed",
        timeout_message=f"Error separating document into pages, pdftk timed out after {DEFAULT_TIMEOUT} seconds",
    )

    page_filenames = glob.glob("/tmp/page-*.pdf")
    if not page_filenames:
        # Fail with a clear message instead of dividing by zero below
        raise RuntimeError("Separating document into pages produced no pages")
    num_pages = len(page_filenames)

    percentage += 2

    # Convert to RGB pixel data
    percentage_per_page = 45.0 / num_pages
    for page in range(1, num_pages + 1):
        pdf_filename = f"/tmp/page-{page}.pdf"
        png_filename = f"/tmp/page-{page}.png"
        rgb_filename = f"/tmp/page-{page}.rgb"
        width_filename = f"/tmp/page-{page}.width"
        height_filename = f"/tmp/page-{page}.height"
        filename_base = f"/tmp/page-{page}"

        output(
            False,
            f"Converting page {page}/{num_pages} to pixels",
            percentage,
        )

        # Convert to png
        run_command(
            ["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
            error_message="Conversion from PDF to PNG failed",
            timeout_message=f"Error converting from PDF to PNG, pdftocairo timed out after {DEFAULT_TIMEOUT} seconds",
        )

        # Save the width and height next to the pixel data
        with Image.open(png_filename, "r") as im:
            width, height = im.size
        with open(width_filename, "w") as f:
            f.write(str(width))
        with open(height_filename, "w") as f:
            f.write(str(height))

        # Convert to RGB pixels
        run_command(
            [
                "gm",
                "convert",
                png_filename,
                "-depth",
                "8",
                f"rgb:{rgb_filename}",
            ],
            error_message="Conversion from PNG to RGB failed",
            timeout_message=f"Error converting from PNG to pixels, convert timed out after {DEFAULT_TIMEOUT} seconds",
        )

        # Delete the png
        os.remove(png_filename)
        percentage += percentage_per_page

    output(
        False,
        "Converted document to pixels",
        percentage,
    )

    # Move converted files into /dangerzone
    for filename in (
        glob.glob("/tmp/page-*.rgb")
        + glob.glob("/tmp/page-*.width")
        + glob.glob("/tmp/page-*.height")
    ):
        shutil.move(filename, "/dangerzone")


def pixels_to_pdf() -> None:
    """
    Rebuild a PDF from the per-page RGB pixel data found in /dangerzone.

    For every /dangerzone/page-N.rgb (with its .width and .height files) a
    single-page PDF is written to /tmp/page-N.pdf. When the environment
    variable OCR is "1", the page goes through PNG and tesseract (language
    taken from OCR_LANGUAGE) to produce a searchable PDF. Otherwise
    GraphicsMagick converts the pixels to PDF directly.

    The pages are then merged with pdfunite and compressed with ps2pdf, and
    both safe-output.pdf and safe-output-compressed.pdf are moved into
    /safezone. Progress is reported through output().

    :raises RuntimeError: if there are no pages to convert, or an external
        command fails
    :raises ValueError: if OCR is enabled but OCR_LANGUAGE is not set
    :raises TimeoutError: if an external command times out
    """
    percentage = 50.0

    num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
    if num_pages == 0:
        # Fail with a clear message instead of dividing by zero below
        raise RuntimeError("No pages of pixel data found in /dangerzone")

    # Read the OCR settings once; they do not change between pages
    ocr_enabled = os.environ.get("OCR") == "1"
    ocr_language = os.environ.get("OCR_LANGUAGE")
    if ocr_enabled and not ocr_language:
        # Otherwise None would end up in tesseract's argument list
        raise ValueError("OCR is enabled but OCR_LANGUAGE is not set")

    # Convert RGB files to PDF files
    percentage_per_page = 45.0 / num_pages
    for page in range(1, num_pages + 1):
        filename_base = f"/dangerzone/page-{page}"
        rgb_filename = f"{filename_base}.rgb"
        width_filename = f"{filename_base}.width"
        height_filename = f"{filename_base}.height"
        png_filename = f"/tmp/page-{page}.png"
        ocr_filename = f"/tmp/page-{page}"
        pdf_filename = f"/tmp/page-{page}.pdf"

        # Raw RGB data carries no dimensions, so read them from the side files
        with open(width_filename) as f:
            width = f.read().strip()
        with open(height_filename) as f:
            height = f.read().strip()

        if ocr_enabled:  # OCR the document
            output(
                False,
                f"Converting page {page}/{num_pages} from pixels to searchable PDF",
                percentage,
            )
            run_command(
                [
                    "gm",
                    "convert",
                    "-size",
                    f"{width}x{height}",
                    "-depth",
                    "8",
                    f"rgb:{rgb_filename}",
                    f"png:{png_filename}",
                ],
                error_message=f"Page {page}/{num_pages} conversion to PNG failed",
                timeout_message=f"Error converting pixels to PNG, convert timed out after {DEFAULT_TIMEOUT} seconds",
            )
            # tesseract is given the output base name without an extension
            run_command(
                [
                    "tesseract",
                    png_filename,
                    ocr_filename,
                    "-l",
                    ocr_language,
                    "--dpi",
                    "70",
                    "pdf",
                ],
                error_message=f"Page {page}/{num_pages} OCR failed",
                timeout_message=f"Error converting PNG to searchable PDF, tesseract timed out after {DEFAULT_TIMEOUT} seconds",
            )

        else:  # Don't OCR
            output(
                False,
                f"Converting page {page}/{num_pages} from pixels to PDF",
                percentage,
            )
            run_command(
                [
                    "gm",
                    "convert",
                    "-size",
                    f"{width}x{height}",
                    "-depth",
                    "8",
                    f"rgb:{rgb_filename}",
                    f"pdf:{pdf_filename}",
                ],
                error_message=f"Page {page}/{num_pages} conversion to PDF failed",
                timeout_message=f"Error converting RGB to PDF, convert timed out after {DEFAULT_TIMEOUT} seconds",
            )

        percentage += percentage_per_page

    # Merge pages into a single PDF
    output(
        False,
        f"Merging {num_pages} pages into a single PDF",
        percentage,
    )
    args = (
        ["pdfunite"]
        + [f"/tmp/page-{page}.pdf" for page in range(1, num_pages + 1)]
        + ["/tmp/safe-output.pdf"]
    )
    run_command(
        args,
        error_message="Merging pages into a single PDF failed",
        timeout_message=f"Error merging pages into a single PDF, pdfunite timed out after {DEFAULT_TIMEOUT} seconds",
    )

    percentage += 2

    # Compress
    output(
        False,
        "Compressing PDF",
        percentage,
    )
    # The compression timeout scales with the page count: 3 seconds per page
    compress_timeout = num_pages * 3
    run_command(
        ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
        timeout_message=f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds",
        error_message="Compressing PDF failed",
        timeout=compress_timeout,
    )

    percentage = 100.0
    output(False, "Safe PDF created", percentage)

    # Move converted files into /safezone
    shutil.move("/tmp/safe-output.pdf", "/safezone")
    shutil.move("/tmp/safe-output-compressed.pdf", "/safezone")


def main() -> int:
    """
    Command-line entry point: run the conversion step named by the argument.

    Exactly one argument is expected, either "document-to-pixels" or
    "pixels-to-pdf".

    Only ``Exception`` subclasses are turned into a return code, so
    KeyboardInterrupt and SystemExit still propagate. The reason for a
    failure is written to stderr instead of being silently dropped.

    :returns: 0 if the step succeeded, 1 if it raised an exception, -1 if the
        argument count is wrong or the step name is unknown
    """
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} [document-to-pixels]|[pixels-to-pdf]")
        return -1

    command = sys.argv[1]
    try:
        if command == "document-to-pixels":
            document_to_pixels()
        elif command == "pixels-to-pdf":
            pixels_to_pdf()
        else:
            return -1
    except Exception as exc:
        print(f"{command} failed: {type(exc).__name__}: {exc}", file=sys.stderr)
        return 1
    return 0


# When run as a script, use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())