From 814d533c3b1cc5896db1badd9ac47dc641db3fda Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 20 Jun 2023 19:33:48 +0300 Subject: [PATCH] Restructure container code The `container/` directory no longer makes sense as a name, since the "document to pixels" part will run in Qubes OS in its own virtual machine. To adapt to this, this PR does the following: - Moves all the files in `container` to `dangerzone/conversion` - Splits the old `container/dangerzone.py` into its two components `dangerzone/conversion/{doc_to_pixels,pixels_to_pdf}.py` with a `common.py` file for shared functions - Moves the Dockerfile to the project root and adapts it to the new container code location - Updates the CircleCI config to properly cache Docker images. - Updates our install scripts to properly build Docker images. - Adds the new conversion module to the container image, so that it can be imported as a package. - Adapts the container isolation provider to use the new way of calling the code. NOTE: We have made zero changes to the conversion code in this commit, except for necessary imports in order to factor out some common parts. Any changes necessary for Qubes integration follow in the subsequent commits. 
--- .circleci/config.yml | 37 ++- .github/workflows/scan.yml | 2 +- container/Dockerfile => Dockerfile | 7 +- Makefile | 5 +- dangerzone/conversion/__init__.py | 0 dangerzone/conversion/common.py | 134 +++++++++ .../conversion/doc_to_pixels.py | 275 +----------------- dangerzone/conversion/pixels_to_pdf.py | 166 +++++++++++ dangerzone/isolation_provider/container.py | 8 +- install/linux/build-image.sh | 2 +- install/macos/build-image.sh | 2 +- install/windows/build-image.py | 4 +- setup.py | 7 +- 13 files changed, 360 insertions(+), 289 deletions(-) rename container/Dockerfile => Dockerfile (90%) create mode 100644 dangerzone/conversion/__init__.py create mode 100644 dangerzone/conversion/common.py rename container/dangerzone.py => dangerzone/conversion/doc_to_pixels.py (50%) create mode 100644 dangerzone/conversion/pixels_to_pdf.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 208e49d..0215cfc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -42,8 +42,16 @@ aliases: ./install/linux/build-rpm.py ls -lh dist/ + - &calculate-cache-key + name: Caculating container cache key + command: | + mkdir -p /caches/ + cd dangerzone/conversion/ + cat common.py doc_to_pixels.py pixels_to_pdf.py | sha1sum | cut -d' ' -f1 > /caches/cache-id.txt + cd ../../ + - &restore-cache - key: v1-{{ checksum "container/Dockerfile" }}-{{ checksum "container/dangerzone.py" }} + key: v1-{{ checksum "Dockerfile" }}-{{ checksum "/caches/cache-id.txt" }} paths: - /caches/container.tar.gz - /caches/image-id.txt @@ -85,9 +93,8 @@ jobs: - image: docker:dind steps: - checkout - - restore_cache: - keys: - - v1-{{ checksum "container/Dockerfile" }}-{{ checksum "container/dangerzone.py" }} + - run: *calculate-cache-key + - restore_cache: *restore-cache - setup_remote_docker - run: name: Build Dangerzone image @@ -95,7 +102,9 @@ jobs: if [ -f "/caches/container.tar.gz" ]; then echo "Already cached, skipping" else - docker build --cache-from=dangerzone.rocks/dangerzone --tag 
dangerzone.rocks/dangerzone container + docker build dangerzone/ -f Dockerfile \ + --cache-from=dangerzone.rocks/dangerzone \ + --tag dangerzone.rocks/dangerzone fi - run: name: Save Dangerzone image and image-id.txt to cache @@ -108,8 +117,9 @@ jobs: gzip -f /caches/container.tar docker image ls dangerzone.rocks/dangerzone | grep "dangerzone.rocks/dangerzone" | tr -s ' ' | cut -d' ' -f3 > /caches/image-id.txt fi + - run: *calculate-cache-key - save_cache: - key: v1-{{ checksum "container/Dockerfile" }}-{{ checksum "container/dangerzone.py" }} + key: v1-{{ checksum "Dockerfile" }}-{{ checksum "/caches/cache-id.txt" }} paths: - /caches/container.tar.gz - /caches/image-id.txt @@ -136,6 +146,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: @@ -155,6 +166,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -181,6 +193,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -207,6 +220,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -233,6 +247,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -259,6 +274,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -285,6 +301,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -328,6 +345,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER 
/caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -365,6 +383,7 @@ jobs: steps: - run: *install-dependencies-deb - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -376,6 +395,7 @@ jobs: steps: - run: *install-dependencies-deb - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -388,6 +408,7 @@ jobs: - run: *install-dependencies-deb - run: *install-python-all - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -399,6 +420,7 @@ jobs: steps: - run: *install-dependencies-deb - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -410,6 +432,7 @@ jobs: steps: - run: *install-dependencies-deb - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -421,6 +444,7 @@ jobs: steps: - run: *install-dependencies-rpm - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-rpm @@ -432,6 +456,7 @@ jobs: steps: - run: *install-dependencies-rpm - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-rpm diff --git a/.github/workflows/scan.yml b/.github/workflows/scan.yml index 9922921..4c885bf 100644 --- a/.github/workflows/scan.yml +++ b/.github/workflows/scan.yml @@ -13,7 +13,7 @@ jobs: - name: Checkout uses: actions/checkout@v3 - name: Build container image - run: docker build container --tag dangerzone.rocks/dangerzone:latest + run: docker build dangerzone/ -f Dockerfile --tag dangerzone.rocks/dangerzone:latest # NOTE: Scan first without failing, else we won't be able to read the scan # report. 
- name: Scan container image (no fail) diff --git a/container/Dockerfile b/Dockerfile similarity index 90% rename from container/Dockerfile rename to Dockerfile index 14e05c6..77bcbce 100644 --- a/container/Dockerfile +++ b/Dockerfile @@ -33,8 +33,11 @@ RUN mkdir tessdata && cd tessdata \ && find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata \; \ && cd .. && rm -r tessdata -COPY dangerzone.py /usr/local/bin/ -RUN chmod +x /usr/local/bin/dangerzone.py +ENV PYTHONPATH=/opt/dangerzone + +RUN mkdir -p /opt/dangerzone/dangerzone +RUN touch /opt/dangerzone/dangerzone/__init__.py +COPY conversion /opt/dangerzone/dangerzone/conversion # Add the unprivileged user RUN adduser -s /bin/sh -D dangerzone diff --git a/Makefile b/Makefile index 05c64e5..900d5a8 100644 --- a/Makefile +++ b/Makefile @@ -24,13 +24,10 @@ MYPY_ARGS := --ignore-missing-imports \ mypy-host: mypy $(MYPY_ARGS) dangerzone -mypy-container: - mypy $(MYPY_ARGS) container - mypy-tests: mypy $(MYPY_ARGS) tests -mypy: mypy-host mypy-container mypy-tests ## check type hints with mypy +mypy: mypy-host mypy-tests ## check type hints with mypy .PHONY: lint lint: lint-black lint-isort mypy ## check the code with various linters diff --git a/dangerzone/conversion/__init__.py b/dangerzone/conversion/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py new file mode 100644 index 0000000..45629e1 --- /dev/null +++ b/dangerzone/conversion/common.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +import asyncio +import glob +import json +import os +import re +import shutil +import subprocess +import sys +import time +from abc import abstractmethod +from typing import Callable, Dict, List, Optional, Tuple, Union + +TIMEOUT_PER_PAGE: float = 30 # (seconds) +TIMEOUT_PER_MB: float = 30 # (seconds) +TIMEOUT_MIN: float = 60 # (seconds) + + +async def read_stream( + sr: asyncio.StreamReader, callback: Optional[Callable] = None +) 
-> bytes: + """Consume a byte stream line-by-line. + + Read all lines in a stream until EOF. If a user has passed a callback, call it for + each line. + + Note that the lines are in bytes, since we can't assume that all command output will + be UTF-8 encoded. Higher level commands are advised to decode the output to Unicode, + if they know its encoding. + """ + buf = b"" + while True: + line = await sr.readline() + if sr.at_eof(): + break + if callback is not None: + callback(line) + # TODO: This would be a good place to log the received line, mostly for debug + # logging. + buf += line + return buf + + +async def run_command( + args: List[str], + *, + error_message: str, + timeout_message: str, + timeout: Optional[float], + stdout_callback: Optional[Callable] = None, + stderr_callback: Optional[Callable] = None, +) -> Tuple[bytes, bytes]: + """Run a command and get its output. + + Run a command using asyncio.subprocess, consume its standard streams, and return its + output in bytes. + + :raises RuntimeError: if the process returns a non-zero exit status + :raises TimeoutError: if the process times out + """ + # Start the provided command, and return a handle. The command will run in the + # background. + proc = await asyncio.subprocess.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + assert proc.stdout is not None + assert proc.stderr is not None + + # Create asynchronous tasks that will consume the standard streams of the command, + # and call callbacks if necessary. + stdout_task = asyncio.create_task(read_stream(proc.stdout, stdout_callback)) + stderr_task = asyncio.create_task(read_stream(proc.stderr, stderr_callback)) + + # Wait until the command has finished, for a specific timeout. Then, verify that the + # command has completed successfully. In any other case, raise an exception. 
+ try: + ret = await asyncio.wait_for(proc.wait(), timeout=timeout) + except asyncio.exceptions.TimeoutError: + raise TimeoutError(timeout_message) + if ret != 0: + raise RuntimeError(error_message) + + # Wait until the tasks that consume the command's standard streams have exited as + # well, and return their output. + stdout = await stdout_task + stderr = await stderr_task + return (stdout, stderr) + + +class DangerzoneConverter: + def __init__(self) -> None: + self.percentage: float = 0.0 + + def calculate_timeout( + self, size: float, pages: Optional[float] = None + ) -> Optional[float]: + """Calculate the timeout for a command. + + The timeout calculation takes two factors in mind: + + 1. The size (in MiBs) of the dataset (document, multiple pages). + 2. The number of pages in the dataset. + + It then calculates proportional timeout values based on the above, and keeps the + large one. This way, we can handle several corner cases: + + * Documents with lots of pages, but small file size. + * Single images with large file size. + """ + if not int(os.environ.get("ENABLE_TIMEOUTS", 1)): + return None + + # Do not have timeouts lower than 10 seconds, if the file size is small, since + # we need to take into account the program's startup time as well. 
+ timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN) + if pages: + timeout = max(timeout, TIMEOUT_PER_PAGE * pages) + return timeout + + @abstractmethod + async def convert(self) -> None: + pass + + def update_progress(self, text: str, *, error: bool = False) -> None: + print( + json.dumps( + {"error": error, "text": text, "percentage": int(self.percentage)} + ) + ) + sys.stdout.flush() diff --git a/container/dangerzone.py b/dangerzone/conversion/doc_to_pixels.py similarity index 50% rename from container/dangerzone.py rename to dangerzone/conversion/doc_to_pixels.py index 360552e..23db83a 100644 --- a/container/dangerzone.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -1,140 +1,27 @@ #!/usr/bin/env python3 """ -Here are the steps, with progress bar percentages for each step: +Here are the steps, with progress bar percentages: -document_to_pixels - 0%-3%: Convert document into a PDF (skipped if the input file is a PDF) - 3%-5%: Split PDF into individual pages, and count those pages - 5%-50%: Convert each page into pixels (each page takes 45/n%, where n is the number of pages) - -pixels_to_pdf: -- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages) -- 95%-100%: Compress the final PDF """ import asyncio import glob -import json import os import re import shutil -import subprocess import sys -import time -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Dict, Optional import magic -TIMEOUT_PER_PAGE: float = 30 # (seconds) -TIMEOUT_PER_MB: float = 30 # (seconds) -TIMEOUT_MIN: float = 60 # (seconds) +from .common import DangerzoneConverter, run_command -async def read_stream( - sr: asyncio.StreamReader, callback: Optional[Callable] = None -) -> bytes: - """Consume a byte stream line-by-line. - - Read all lines in a stream until EOF. If a user has passed a callback, call it for - each line. 
- - Note that the lines are in bytes, since we can't assume that all command output will - be UTF-8 encoded. Higher level commands are advised to decode the output to Unicode, - if they know its encoding. - """ - buf = b"" - while True: - line = await sr.readline() - if sr.at_eof(): - break - if callback is not None: - callback(line) - # TODO: This would be a good place to log the received line, mostly for debug - # logging. - buf += line - return buf - - -async def run_command( - args: List[str], - *, - error_message: str, - timeout_message: str, - timeout: Optional[float], - stdout_callback: Optional[Callable] = None, - stderr_callback: Optional[Callable] = None, -) -> Tuple[bytes, bytes]: - """Run a command and get its output. - - Run a command using asyncio.subprocess, consume its standard streams, and return its - output in bytes. - - :raises RuntimeError: if the process returns a non-zero exit status - :raises TimeoutError: if the process times out - """ - # Start the provided command, and return a handle. The command will run in the - # background. - proc = await asyncio.subprocess.create_subprocess_exec( - *args, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - - assert proc.stdout is not None - assert proc.stderr is not None - - # Create asynchronous tasks that will consume the standard streams of the command, - # and call callbacks if necessary. - stdout_task = asyncio.create_task(read_stream(proc.stdout, stdout_callback)) - stderr_task = asyncio.create_task(read_stream(proc.stderr, stderr_callback)) - - # Wait until the command has finished, for a specific timeout. Then, verify that the - # command has completed successfully. In any other case, raise an exception. 
- try: - ret = await asyncio.wait_for(proc.wait(), timeout=timeout) - except asyncio.exceptions.TimeoutError: - raise TimeoutError(timeout_message) - if ret != 0: - raise RuntimeError(error_message) - - # Wait until the tasks that consume the command's standard streams have exited as - # well, and return their output. - stdout = await stdout_task - stderr = await stderr_task - return (stdout, stderr) - - -class DangerzoneConverter: - def __init__(self) -> None: - self.percentage: float = 0.0 - - def calculate_timeout( - self, size: float, pages: Optional[float] = None - ) -> Optional[float]: - """Calculate the timeout for a command. - - The timeout calculation takes two factors in mind: - - 1. The size (in MiBs) of the dataset (document, multiple pages). - 2. The number of pages in the dataset. - - It then calculates proportional timeout values based on the above, and keeps the - large one. This way, we can handle several corner cases: - - * Documents with lots of pages, but small file size. - * Single images with large file size. - """ - if not int(os.environ.get("ENABLE_TIMEOUTS", 1)): - return None - - # Do not have timeouts lower than 10 seconds, if the file size is small, since - # we need to take into account the program's startup time as well. 
- timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN) - if pages: - timeout = max(timeout, TIMEOUT_PER_PAGE * pages) - return timeout - - async def document_to_pixels(self) -> None: +class DocumentToPixels(DangerzoneConverter): + async def convert(self) -> None: conversions: Dict[str, Dict[str, Optional[str]]] = { # .pdf "application/pdf": {"type": None}, @@ -393,160 +280,12 @@ class DangerzoneConverter: ): shutil.move(filename, "/dangerzone") - async def pixels_to_pdf(self) -> None: - self.percentage = 50.0 - - num_pages = len(glob.glob("/dangerzone/page-*.rgb")) - total_size = 0.0 - - # Convert RGB files to PDF files - percentage_per_page = 45.0 / num_pages - for page in range(1, num_pages + 1): - filename_base = f"/dangerzone/page-{page}" - rgb_filename = f"{filename_base}.rgb" - width_filename = f"{filename_base}.width" - height_filename = f"{filename_base}.height" - png_filename = f"/tmp/page-{page}.png" - ocr_filename = f"/tmp/page-{page}" - pdf_filename = f"/tmp/page-{page}.pdf" - - with open(width_filename) as f: - width = f.read().strip() - with open(height_filename) as f: - height = f.read().strip() - - # The first few operations happen on a per-page basis. 
- page_size = os.path.getsize(filename_base + ".rgb") / 1024**2 - total_size += page_size - timeout = self.calculate_timeout(page_size, 1) - - if os.environ.get("OCR") == "1": # OCR the document - self.update_progress( - f"Converting page {page}/{num_pages} from pixels to searchable PDF" - ) - await run_command( - [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"png:{png_filename}", - ], - error_message=f"Page {page}/{num_pages} conversion to PNG failed", - timeout_message=( - "Error converting pixels to PNG, convert timed out after" - f" {timeout} seconds" - ), - timeout=timeout, - ) - await run_command( - [ - "tesseract", - png_filename, - ocr_filename, - "-l", - os.environ.get("OCR_LANGUAGE"), # type: ignore - "--dpi", - "70", - "pdf", - ], - error_message=f"Page {page}/{num_pages} OCR failed", - timeout_message=( - "Error converting PNG to searchable PDF, tesseract timed out" - f" after {timeout} seconds" - ), - timeout=timeout, - ) - - else: # Don't OCR - self.update_progress( - f"Converting page {page}/{num_pages} from pixels to PDF" - ) - await run_command( - [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"pdf:{pdf_filename}", - ], - error_message=f"Page {page}/{num_pages} conversion to PDF failed", - timeout_message=( - "Error converting RGB to PDF, convert timed out after" - f" {timeout} seconds" - ), - timeout=timeout, - ) - - self.percentage += percentage_per_page - - # Next operations apply to the all the pages, so we need to recalculate the - # timeout. 
- timeout = self.calculate_timeout(total_size, num_pages) - - # Merge pages into a single PDF - self.update_progress(f"Merging {num_pages} pages into a single PDF") - args = ["pdfunite"] - for page in range(1, num_pages + 1): - args.append(f"/tmp/page-{page}.pdf") - args.append(f"/tmp/safe-output.pdf") - await run_command( - args, - error_message="Merging pages into a single PDF failed", - timeout_message=( - "Error merging pages into a single PDF, pdfunite timed out after" - f" {timeout} seconds" - ), - timeout=timeout, - ) - - self.percentage += 2 - - # Compress - self.update_progress("Compressing PDF") - await run_command( - ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"], - error_message="Compressing PDF failed", - timeout_message=( - f"Error compressing PDF, ps2pdf timed out after {timeout} seconds" - ), - timeout=timeout, - ) - - self.percentage = 100.0 - self.update_progress("Safe PDF created") - - # Move converted files into /safezone - shutil.move("/tmp/safe-output.pdf", "/safezone") - shutil.move("/tmp/safe-output-compressed.pdf", "/safezone") - - def update_progress(self, text: str, *, error: bool = False) -> None: - print( - json.dumps( - {"error": error, "text": text, "percentage": int(self.percentage)} - ) - ) - sys.stdout.flush() - async def main() -> int: - if len(sys.argv) != 2: - print(f"Usage: {sys.argv[0]} [document-to-pixels]|[pixels-to-pdf]") - return -1 - - converter = DangerzoneConverter() + converter = DocumentToPixels() try: - if sys.argv[1] == "document-to-pixels": - await converter.document_to_pixels() - elif sys.argv[1] == "pixels-to-pdf": - await converter.pixels_to_pdf() + await converter.convert() except (RuntimeError, TimeoutError, ValueError) as e: converter.update_progress(str(e), error=True) return 1 diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py new file mode 100644 index 0000000..2e97de5 --- /dev/null +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -0,0 +1,166 
@@ +#!/usr/bin/env python3 +""" +Here are the steps, with progress bar percentages: + +- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages) +- 95%-100%: Compress the final PDF +""" +import asyncio +import glob +import json +import os +import shutil +import sys + +from .common import DangerzoneConverter, run_command + + +class PixelsToPDF(DangerzoneConverter): + async def convert(self) -> None: + self.percentage = 50.0 + + num_pages = len(glob.glob("/tmp/dangerzone/page-*.rgb")) + total_size = 0.0 + + # Convert RGB files to PDF files + percentage_per_page = 45.0 / num_pages + for page in range(1, num_pages + 1): + filename_base = f"/tmp/dangerzone/page-{page}" + rgb_filename = f"{filename_base}.rgb" + width_filename = f"{filename_base}.width" + height_filename = f"{filename_base}.height" + png_filename = f"/tmp/page-{page}.png" + ocr_filename = f"/tmp/page-{page}" + pdf_filename = f"/tmp/page-{page}.pdf" + + with open(width_filename) as f: + width = f.read().strip() + with open(height_filename) as f: + height = f.read().strip() + + # The first few operations happen on a per-page basis. 
+ page_size = os.path.getsize(filename_base + ".rgb") / 1024**2 + total_size += page_size + timeout = self.calculate_timeout(page_size, 1) + + if os.environ.get("OCR") == "1": # OCR the document + self.update_progress( + f"Converting page {page}/{num_pages} from pixels to searchable PDF" + ) + await run_command( + [ + "gm", + "convert", + "-size", + f"{width}x{height}", + "-depth", + "8", + f"rgb:{rgb_filename}", + f"png:{png_filename}", + ], + error_message=f"Page {page}/{num_pages} conversion to PNG failed", + timeout_message=( + "Error converting pixels to PNG, convert timed out after" + f" {timeout} seconds" + ), + timeout=timeout, + ) + await run_command( + [ + "tesseract", + png_filename, + ocr_filename, + "-l", + os.environ.get("OCR_LANGUAGE"), # type: ignore + "--dpi", + "70", + "pdf", + ], + error_message=f"Page {page}/{num_pages} OCR failed", + timeout_message=( + "Error converting PNG to searchable PDF, tesseract timed out" + f" after {timeout} seconds" + ), + timeout=timeout, + ) + + else: # Don't OCR + self.update_progress( + f"Converting page {page}/{num_pages} from pixels to PDF" + ) + await run_command( + [ + "gm", + "convert", + "-size", + f"{width}x{height}", + "-depth", + "8", + f"rgb:{rgb_filename}", + f"pdf:{pdf_filename}", + ], + error_message=f"Page {page}/{num_pages} conversion to PDF failed", + timeout_message=( + "Error converting RGB to PDF, convert timed out after" + f" {timeout} seconds" + ), + timeout=timeout, + ) + + self.percentage += percentage_per_page + + # Next operations apply to the all the pages, so we need to recalculate the + # timeout. 
+ timeout = self.calculate_timeout(total_size, num_pages) + + # Merge pages into a single PDF + self.update_progress(f"Merging {num_pages} pages into a single PDF") + args = ["pdfunite"] + for page in range(1, num_pages + 1): + args.append(f"/tmp/page-{page}.pdf") + args.append(f"/tmp/safe-output.pdf") + await run_command( + args, + error_message="Merging pages into a single PDF failed", + timeout_message=( + "Error merging pages into a single PDF, pdfunite timed out after" + f" {timeout} seconds" + ), + timeout=timeout, + ) + + self.percentage += 2 + + # Compress + self.update_progress("Compressing PDF") + await run_command( + ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"], + error_message="Compressing PDF failed", + timeout_message=( + f"Error compressing PDF, ps2pdf timed out after {timeout} seconds" + ), + timeout=timeout, + ) + + self.percentage = 100.0 + self.update_progress("Safe PDF created") + + # Move converted files into /safezone + shutil.move("/tmp/safe-output.pdf", "/safezone") + shutil.move("/tmp/safe-output-compressed.pdf", "/safezone") + + +async def main() -> int: + converter = PixelsToPDF() + + try: + await converter.convert() + except (RuntimeError, TimeoutError, ValueError) as e: + converter.update_progress(str(e), error=True) + return 1 + else: + return 0 # Success! 
+ + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 4113317..6b71b63 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -262,8 +262,8 @@ class Container(IsolationProvider): # Convert document to pixels command = [ "/usr/bin/python3", - "/usr/local/bin/dangerzone.py", - "document-to-pixels", + "-m", + "dangerzone.conversion.doc_to_pixels", ] extra_args = [ "-v", @@ -282,8 +282,8 @@ class Container(IsolationProvider): # Convert pixels to safe PDF command = [ "/usr/bin/python3", - "/usr/local/bin/dangerzone.py", - "pixels-to-pdf", + "-m", + "dangerzone.conversion.pixels_to_pdf", ] extra_args = [ "-v", diff --git a/install/linux/build-image.sh b/install/linux/build-image.sh index fc662ec..ad573c7 100755 --- a/install/linux/build-image.sh +++ b/install/linux/build-image.sh @@ -5,7 +5,7 @@ set -e TAG=dangerzone.rocks/dangerzone:latest echo "Building container image" -podman build container --tag $TAG +podman build dangerzone/ -f Dockerfile --tag $TAG echo "Saving and compressing container image" podman save $TAG | gzip > share/container.tar.gz diff --git a/install/macos/build-image.sh b/install/macos/build-image.sh index ab4fd97..eafb1c6 100755 --- a/install/macos/build-image.sh +++ b/install/macos/build-image.sh @@ -5,7 +5,7 @@ set -e TAG=dangerzone.rocks/dangerzone:latest echo "Building container image" -docker build container --tag $TAG +docker build dangerzone/ -f Dockerfile --tag $TAG echo "Saving and compressing container image" docker save $TAG | gzip > share/container.tar.gz diff --git a/install/windows/build-image.py b/install/windows/build-image.py index cf1d578..23fd2d5 100644 --- a/install/windows/build-image.py +++ b/install/windows/build-image.py @@ -9,7 +9,9 @@ def main(): [ "docker", "build", - "container", + "dangerzone/", + "-f", + "Dockerfile", "--tag", 
"dangerzone.rocks/dangerzone:latest", ] diff --git a/setup.py b/setup.py index 33f5e07..98656fd 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,12 @@ dangerous PDFs, office documents, or images and converts them to safe PDFs. \ It uses container technology to convert the documents within a secure sandbox.\ """, url="https://github.com/freedomofpress/dangerzone", - packages=["dangerzone", "dangerzone.gui", "dangerzone.isolation_provider"], + packages=[ + "dangerzone", + "dangerzone.conversion", + "dangerzone.gui", + "dangerzone.isolation_provider", + ], data_files=[ ( "share/applications",