Restructure container code

The `container/` directory name no longer fits its contents, since the
"document to pixels" part will run in its own virtual machine on Qubes
OS, rather than in a container.

To reflect this, the PR does the following:
- Moves all the files in `container/` to `dangerzone/conversion/`.
- Splits the old `container/dangerzone.py` into its two components,
  `dangerzone/conversion/{doc_to_pixels,pixels_to_pdf}.py`, with a
  `common.py` file for shared functions.
- Moves the Dockerfile to the project root and adapts it to the new
  location of the container code.
- Updates the CircleCI config to properly cache Docker images.
- Updates our install scripts to properly build Docker images.
- Adds the new conversion module to the container image, so that it can
  be imported as a package.
- Adapts the container isolation provider to invoke the conversion code
  via its new module path.

NOTE: This commit makes zero changes to the conversion code itself,
except for the imports needed to factor out the common parts. Any
changes required for Qubes integration follow in subsequent commits.
Authored by deeplow on 2023-06-20 19:33:48 +03:00; committed by Alex Pyrgiotis.
Parent: 9a45bc12c5 | Commit: 814d533c3b
13 changed files with 360 additions and 289 deletions
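
In practice, the restructure means the conversion code is now an ordinary
Python package with two module entrypoints. A minimal, illustrative sketch
(not part of the diff; the class and module names come from the files below):

```python
# Illustrative sketch: the two halves of the old container/dangerzone.py are
# now separate modules sharing a common base class.
from dangerzone.conversion.doc_to_pixels import DocumentToPixels
from dangerzone.conversion.pixels_to_pdf import PixelsToPDF

# Each half can also be started as a module entrypoint:
#   /usr/bin/python3 -m dangerzone.conversion.doc_to_pixels
#   /usr/bin/python3 -m dangerzone.conversion.pixels_to_pdf
```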

File: .circleci/config.yml

@@ -42,8 +42,16 @@ aliases:
       ./install/linux/build-rpm.py
       ls -lh dist/
 
+  - &calculate-cache-key
+    name: Calculating container cache key
+    command: |
+      mkdir -p /caches/
+      cd dangerzone/conversion/
+      cat common.py doc_to_pixels.py pixels_to_pdf.py | sha1sum | cut -d' ' -f1 > /caches/cache-id.txt
+      cd ../../
+
   - &restore-cache
-    key: v1-{{ checksum "container/Dockerfile" }}-{{ checksum "container/dangerzone.py" }}
+    key: v1-{{ checksum "Dockerfile" }}-{{ checksum "/caches/cache-id.txt" }}
     paths:
       - /caches/container.tar.gz
       - /caches/image-id.txt
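
The cache key now hashes the three conversion sources instead of the old
`container/dangerzone.py`. A rough Python equivalent of the
`cat | sha1sum | cut` pipeline above (illustrative only; CI uses the shell
pipeline):

```python
# Rough Python equivalent of the cache-id pipeline (illustrative only).
import hashlib
from pathlib import Path

digest = hashlib.sha1()
for name in ("common.py", "doc_to_pixels.py", "pixels_to_pdf.py"):
    # sha1sum over the concatenated file contents, like `cat ... | sha1sum`
    digest.update(Path("dangerzone/conversion", name).read_bytes())
print(digest.hexdigest())  # the value written to /caches/cache-id.txt
```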
@@ -85,9 +93,8 @@ jobs:
       - image: docker:dind
     steps:
       - checkout
-      - restore_cache:
-          keys:
-            - v1-{{ checksum "container/Dockerfile" }}-{{ checksum "container/dangerzone.py" }}
+      - run: *calculate-cache-key
+      - restore_cache: *restore-cache
       - setup_remote_docker
       - run:
           name: Build Dangerzone image
@@ -95,7 +102,9 @@
             if [ -f "/caches/container.tar.gz" ]; then
               echo "Already cached, skipping"
             else
-              docker build --cache-from=dangerzone.rocks/dangerzone --tag dangerzone.rocks/dangerzone container
+              docker build dangerzone/ -f Dockerfile \
+                --cache-from=dangerzone.rocks/dangerzone \
+                --tag dangerzone.rocks/dangerzone
             fi
       - run:
           name: Save Dangerzone image and image-id.txt to cache
@@ -108,8 +117,9 @@
             gzip -f /caches/container.tar
             docker image ls dangerzone.rocks/dangerzone | grep "dangerzone.rocks/dangerzone" | tr -s ' ' | cut -d' ' -f3 > /caches/image-id.txt
           fi
+      - run: *calculate-cache-key
       - save_cache:
-          key: v1-{{ checksum "container/Dockerfile" }}-{{ checksum "container/dangerzone.py" }}
+          key: v1-{{ checksum "Dockerfile" }}-{{ checksum "/caches/cache-id.txt" }}
           paths:
             - /caches/container.tar.gz
             - /caches/image-id.txt
@@ -136,6 +146,7 @@ jobs:
           command: |
             sudo mkdir -p /caches
             sudo chown -R $USER:$USER /caches
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
       - run:
@@ -155,6 +166,7 @@
           command: |
             sudo mkdir -p /caches
             sudo chown -R $USER:$USER /caches
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
@@ -181,6 +193,7 @@ jobs:
           command: |
             sudo mkdir -p /caches
             sudo chown -R $USER:$USER /caches
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
@@ -207,6 +220,7 @@ jobs:
           command: |
             sudo mkdir -p /caches
             sudo chown -R $USER:$USER /caches
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
@@ -233,6 +247,7 @@ jobs:
           command: |
             sudo mkdir -p /caches
             sudo chown -R $USER:$USER /caches
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
@@ -259,6 +274,7 @@ jobs:
           command: |
             sudo mkdir -p /caches
             sudo chown -R $USER:$USER /caches
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
@@ -285,6 +301,7 @@ jobs:
           command: |
             sudo mkdir -p /caches
             sudo chown -R $USER:$USER /caches
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
@@ -328,6 +345,7 @@ jobs:
           command: |
             sudo mkdir -p /caches
             sudo chown -R $USER:$USER /caches
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
@@ -365,6 +383,7 @@ jobs:
     steps:
       - run: *install-dependencies-deb
       - checkout
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
       - run: *build-deb
@@ -376,6 +395,7 @@ jobs:
     steps:
       - run: *install-dependencies-deb
       - checkout
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
       - run: *build-deb
@@ -388,6 +408,7 @@
       - run: *install-dependencies-deb
       - run: *install-python-all
       - checkout
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
       - run: *build-deb
@@ -399,6 +420,7 @@
     steps:
       - run: *install-dependencies-deb
       - checkout
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
       - run: *build-deb
@@ -410,6 +432,7 @@
     steps:
       - run: *install-dependencies-deb
       - checkout
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
       - run: *build-deb
@@ -421,6 +444,7 @@
     steps:
       - run: *install-dependencies-rpm
       - checkout
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
       - run: *build-rpm
@@ -432,6 +456,7 @@
     steps:
       - run: *install-dependencies-rpm
       - checkout
+      - run: *calculate-cache-key
       - restore_cache: *restore-cache
       - run: *copy-image
       - run: *build-rpm

File: GitHub Actions workflow (container scan)

@@ -13,7 +13,7 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
       - name: Build container image
-        run: docker build container --tag dangerzone.rocks/dangerzone:latest
+        run: docker build dangerzone/ -f Dockerfile --tag dangerzone.rocks/dangerzone:latest
       # NOTE: Scan first without failing, else we won't be able to read the scan
       # report.
       - name: Scan container image (no fail)

File: Dockerfile (moved from container/Dockerfile)

@@ -33,8 +33,11 @@ RUN mkdir tessdata && cd tessdata \
     && find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata \; \
     && cd .. && rm -r tessdata
 
-COPY dangerzone.py /usr/local/bin/
-RUN chmod +x /usr/local/bin/dangerzone.py
+ENV PYTHONPATH=/opt/dangerzone
+RUN mkdir -p /opt/dangerzone/dangerzone
+RUN touch /opt/dangerzone/dangerzone/__init__.py
+COPY conversion /opt/dangerzone/dangerzone/conversion
 
 # Add the unprivileged user
 RUN adduser -s /bin/sh -D dangerzone
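
With `PYTHONPATH=/opt/dangerzone` and the `__init__.py` files in place, the
conversion code resolves as an ordinary package inside the image. A quick
illustrative check (not part of the commit):

```python
# Illustrative check: inside the container, PYTHONPATH=/opt/dangerzone makes
# the conversion code importable as a regular package.
import importlib

mod = importlib.import_module("dangerzone.conversion.doc_to_pixels")
# Expected location under the image layout created by the Dockerfile above:
#   /opt/dangerzone/dangerzone/conversion/doc_to_pixels.py
print(mod.__file__)
```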

File: Makefile

@@ -24,13 +24,10 @@ MYPY_ARGS := --ignore-missing-imports \
 mypy-host:
 	mypy $(MYPY_ARGS) dangerzone
 
-mypy-container:
-	mypy $(MYPY_ARGS) container
-
 mypy-tests:
 	mypy $(MYPY_ARGS) tests
 
-mypy: mypy-host mypy-container mypy-tests ## check type hints with mypy
+mypy: mypy-host mypy-tests ## check type hints with mypy
 
 .PHONY: lint
 lint: lint-black lint-isort mypy ## check the code with various linters

File: dangerzone/conversion/__init__.py (new, empty file)

File: dangerzone/conversion/common.py (new file)

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
import asyncio
import glob
import json
import os
import re
import shutil
import subprocess
import sys
import time
from abc import abstractmethod
from typing import Callable, Dict, List, Optional, Tuple, Union

TIMEOUT_PER_PAGE: float = 30  # (seconds)
TIMEOUT_PER_MB: float = 30  # (seconds)
TIMEOUT_MIN: float = 60  # (seconds)


async def read_stream(
    sr: asyncio.StreamReader, callback: Optional[Callable] = None
) -> bytes:
    """Consume a byte stream line-by-line.

    Read all lines in a stream until EOF. If a user has passed a callback, call it for
    each line.

    Note that the lines are in bytes, since we can't assume that all command output will
    be UTF-8 encoded. Higher level commands are advised to decode the output to Unicode,
    if they know its encoding.
    """
    buf = b""
    while True:
        line = await sr.readline()
        if sr.at_eof():
            break
        if callback is not None:
            callback(line)
        # TODO: This would be a good place to log the received line, mostly for debug
        # logging.
        buf += line
    return buf


async def run_command(
    args: List[str],
    *,
    error_message: str,
    timeout_message: str,
    timeout: Optional[float],
    stdout_callback: Optional[Callable] = None,
    stderr_callback: Optional[Callable] = None,
) -> Tuple[bytes, bytes]:
    """Run a command and get its output.

    Run a command using asyncio.subprocess, consume its standard streams, and return its
    output in bytes.

    :raises RuntimeError: if the process returns a non-zero exit status
    :raises TimeoutError: if the process times out
    """
    # Start the provided command, and return a handle. The command will run in the
    # background.
    proc = await asyncio.subprocess.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    assert proc.stdout is not None
    assert proc.stderr is not None

    # Create asynchronous tasks that will consume the standard streams of the command,
    # and call callbacks if necessary.
    stdout_task = asyncio.create_task(read_stream(proc.stdout, stdout_callback))
    stderr_task = asyncio.create_task(read_stream(proc.stderr, stderr_callback))

    # Wait until the command has finished, for a specific timeout. Then, verify that the
    # command has completed successfully. In any other case, raise an exception.
    try:
        ret = await asyncio.wait_for(proc.wait(), timeout=timeout)
    except asyncio.exceptions.TimeoutError:
        raise TimeoutError(timeout_message)

    if ret != 0:
        raise RuntimeError(error_message)

    # Wait until the tasks that consume the command's standard streams have exited as
    # well, and return their output.
    stdout = await stdout_task
    stderr = await stderr_task
    return (stdout, stderr)


class DangerzoneConverter:
    def __init__(self) -> None:
        self.percentage: float = 0.0

    def calculate_timeout(
        self, size: float, pages: Optional[float] = None
    ) -> Optional[float]:
        """Calculate the timeout for a command.

        The timeout calculation takes two factors in mind:

        1. The size (in MiBs) of the dataset (document, multiple pages).
        2. The number of pages in the dataset.

        It then calculates proportional timeout values based on the above, and keeps the
        large one. This way, we can handle several corner cases:

        * Documents with lots of pages, but small file size.
        * Single images with large file size.
        """
        if not int(os.environ.get("ENABLE_TIMEOUTS", 1)):
            return None

        # Do not have timeouts lower than 10 seconds, if the file size is small, since
        # we need to take into account the program's startup time as well.
        timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN)
        if pages:
            timeout = max(timeout, TIMEOUT_PER_PAGE * pages)
        return timeout

    @abstractmethod
    async def convert(self) -> None:
        pass

    def update_progress(self, text: str, *, error: bool = False) -> None:
        print(
            json.dumps(
                {"error": error, "text": text, "percentage": int(self.percentage)}
            )
        )
        sys.stdout.flush()
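
To make the shared API concrete, here is a minimal, illustrative caller of
`run_command` (not part of the commit; the signature matches the listing
above):

```python
# Minimal illustrative caller of run_command (not part of the commit).
import asyncio

from dangerzone.conversion.common import run_command


async def demo() -> None:
    stdout, _stderr = await run_command(
        ["echo", "hello"],
        error_message="echo returned a non-zero exit status",
        timeout_message="echo timed out after 5 seconds",
        timeout=5.0,
    )
    print(stdout.decode())  # output is bytes; decode if the encoding is known


asyncio.run(demo())
```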

File: dangerzone/conversion/doc_to_pixels.py (was container/dangerzone.py)

@@ -1,140 +1,27 @@
 #!/usr/bin/env python3
 """
-Here are the steps, with progress bar percentages for each step:
+Here are the steps, with progress bar percentages:
 
-document_to_pixels
 - 0%-3%: Convert document into a PDF (skipped if the input file is a PDF)
 - 3%-5%: Split PDF into individual pages, and count those pages
 - 5%-50%: Convert each page into pixels (each page takes 45/n%, where n is the number of pages)
-
-pixels_to_pdf:
-- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages)
-- 95%-100%: Compress the final PDF
 """
 
 import asyncio
 import glob
-import json
 import os
 import re
 import shutil
-import subprocess
 import sys
-import time
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Dict, Optional
 
 import magic
 
-TIMEOUT_PER_PAGE: float = 30  # (seconds)
-TIMEOUT_PER_MB: float = 30  # (seconds)
-TIMEOUT_MIN: float = 60  # (seconds)
+from .common import DangerzoneConverter, run_command
 
 
-async def read_stream(
-    sr: asyncio.StreamReader, callback: Optional[Callable] = None
-) -> bytes:
-    """Consume a byte stream line-by-line.
-
-    Read all lines in a stream until EOF. If a user has passed a callback, call it for
-    each line.
-
-    Note that the lines are in bytes, since we can't assume that all command output will
-    be UTF-8 encoded. Higher level commands are advised to decode the output to Unicode,
-    if they know its encoding.
-    """
-    buf = b""
-    while True:
-        line = await sr.readline()
-        if sr.at_eof():
-            break
-        if callback is not None:
-            callback(line)
-        # TODO: This would be a good place to log the received line, mostly for debug
-        # logging.
-        buf += line
-    return buf
-
-
-async def run_command(
-    args: List[str],
-    *,
-    error_message: str,
-    timeout_message: str,
-    timeout: Optional[float],
-    stdout_callback: Optional[Callable] = None,
-    stderr_callback: Optional[Callable] = None,
-) -> Tuple[bytes, bytes]:
-    """Run a command and get its output.
-
-    Run a command using asyncio.subprocess, consume its standard streams, and return its
-    output in bytes.
-
-    :raises RuntimeError: if the process returns a non-zero exit status
-    :raises TimeoutError: if the process times out
-    """
-    # Start the provided command, and return a handle. The command will run in the
-    # background.
-    proc = await asyncio.subprocess.create_subprocess_exec(
-        *args,
-        stdout=asyncio.subprocess.PIPE,
-        stderr=asyncio.subprocess.PIPE,
-    )
-
-    assert proc.stdout is not None
-    assert proc.stderr is not None
-
-    # Create asynchronous tasks that will consume the standard streams of the command,
-    # and call callbacks if necessary.
-    stdout_task = asyncio.create_task(read_stream(proc.stdout, stdout_callback))
-    stderr_task = asyncio.create_task(read_stream(proc.stderr, stderr_callback))
-
-    # Wait until the command has finished, for a specific timeout. Then, verify that the
-    # command has completed successfully. In any other case, raise an exception.
-    try:
-        ret = await asyncio.wait_for(proc.wait(), timeout=timeout)
-    except asyncio.exceptions.TimeoutError:
-        raise TimeoutError(timeout_message)
-
-    if ret != 0:
-        raise RuntimeError(error_message)
-
-    # Wait until the tasks that consume the command's standard streams have exited as
-    # well, and return their output.
-    stdout = await stdout_task
-    stderr = await stderr_task
-    return (stdout, stderr)
-
-
-class DangerzoneConverter:
-    def __init__(self) -> None:
-        self.percentage: float = 0.0
-
-    def calculate_timeout(
-        self, size: float, pages: Optional[float] = None
-    ) -> Optional[float]:
-        """Calculate the timeout for a command.
-
-        The timeout calculation takes two factors in mind:
-
-        1. The size (in MiBs) of the dataset (document, multiple pages).
-        2. The number of pages in the dataset.
-
-        It then calculates proportional timeout values based on the above, and keeps the
-        large one. This way, we can handle several corner cases:
-
-        * Documents with lots of pages, but small file size.
-        * Single images with large file size.
-        """
-        if not int(os.environ.get("ENABLE_TIMEOUTS", 1)):
-            return None
-
-        # Do not have timeouts lower than 10 seconds, if the file size is small, since
-        # we need to take into account the program's startup time as well.
-        timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN)
-        if pages:
-            timeout = max(timeout, TIMEOUT_PER_PAGE * pages)
-        return timeout
-
-    async def document_to_pixels(self) -> None:
+class DocumentToPixels(DangerzoneConverter):
+    async def convert(self) -> None:
         conversions: Dict[str, Dict[str, Optional[str]]] = {
             # .pdf
             "application/pdf": {"type": None},
@@ -393,160 +280,12 @@ class DangerzoneConverter:
         ):
             shutil.move(filename, "/dangerzone")
 
-    async def pixels_to_pdf(self) -> None:
-        self.percentage = 50.0
-        num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
-        total_size = 0.0
-
-        # Convert RGB files to PDF files
-        percentage_per_page = 45.0 / num_pages
-        for page in range(1, num_pages + 1):
-            filename_base = f"/dangerzone/page-{page}"
-            rgb_filename = f"{filename_base}.rgb"
-            width_filename = f"{filename_base}.width"
-            height_filename = f"{filename_base}.height"
-            png_filename = f"/tmp/page-{page}.png"
-            ocr_filename = f"/tmp/page-{page}"
-            pdf_filename = f"/tmp/page-{page}.pdf"
-
-            with open(width_filename) as f:
-                width = f.read().strip()
-            with open(height_filename) as f:
-                height = f.read().strip()
-
-            # The first few operations happen on a per-page basis.
-            page_size = os.path.getsize(filename_base + ".rgb") / 1024**2
-            total_size += page_size
-            timeout = self.calculate_timeout(page_size, 1)
-
-            if os.environ.get("OCR") == "1":  # OCR the document
-                self.update_progress(
-                    f"Converting page {page}/{num_pages} from pixels to searchable PDF"
-                )
-                await run_command(
-                    [
-                        "gm",
-                        "convert",
-                        "-size",
-                        f"{width}x{height}",
-                        "-depth",
-                        "8",
-                        f"rgb:{rgb_filename}",
-                        f"png:{png_filename}",
-                    ],
-                    error_message=f"Page {page}/{num_pages} conversion to PNG failed",
-                    timeout_message=(
-                        "Error converting pixels to PNG, convert timed out after"
-                        f" {timeout} seconds"
-                    ),
-                    timeout=timeout,
-                )
-                await run_command(
-                    [
-                        "tesseract",
-                        png_filename,
-                        ocr_filename,
-                        "-l",
-                        os.environ.get("OCR_LANGUAGE"),  # type: ignore
-                        "--dpi",
-                        "70",
-                        "pdf",
-                    ],
-                    error_message=f"Page {page}/{num_pages} OCR failed",
-                    timeout_message=(
-                        "Error converting PNG to searchable PDF, tesseract timed out"
-                        f" after {timeout} seconds"
-                    ),
-                    timeout=timeout,
-                )
-
-            else:  # Don't OCR
-                self.update_progress(
-                    f"Converting page {page}/{num_pages} from pixels to PDF"
-                )
-                await run_command(
-                    [
-                        "gm",
-                        "convert",
-                        "-size",
-                        f"{width}x{height}",
-                        "-depth",
-                        "8",
-                        f"rgb:{rgb_filename}",
-                        f"pdf:{pdf_filename}",
-                    ],
-                    error_message=f"Page {page}/{num_pages} conversion to PDF failed",
-                    timeout_message=(
-                        "Error converting RGB to PDF, convert timed out after"
-                        f" {timeout} seconds"
-                    ),
-                    timeout=timeout,
-                )
-
-            self.percentage += percentage_per_page
-
-        # Next operations apply to the all the pages, so we need to recalculate the
-        # timeout.
-        timeout = self.calculate_timeout(total_size, num_pages)
-
-        # Merge pages into a single PDF
-        self.update_progress(f"Merging {num_pages} pages into a single PDF")
-        args = ["pdfunite"]
-        for page in range(1, num_pages + 1):
-            args.append(f"/tmp/page-{page}.pdf")
-        args.append(f"/tmp/safe-output.pdf")
-        await run_command(
-            args,
-            error_message="Merging pages into a single PDF failed",
-            timeout_message=(
-                "Error merging pages into a single PDF, pdfunite timed out after"
-                f" {timeout} seconds"
-            ),
-            timeout=timeout,
-        )
-        self.percentage += 2
-
-        # Compress
-        self.update_progress("Compressing PDF")
-        await run_command(
-            ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
-            error_message="Compressing PDF failed",
-            timeout_message=(
-                f"Error compressing PDF, ps2pdf timed out after {timeout} seconds"
-            ),
-            timeout=timeout,
-        )
-
-        self.percentage = 100.0
-        self.update_progress("Safe PDF created")
-
-        # Move converted files into /safezone
-        shutil.move("/tmp/safe-output.pdf", "/safezone")
-        shutil.move("/tmp/safe-output-compressed.pdf", "/safezone")
-
-    def update_progress(self, text: str, *, error: bool = False) -> None:
-        print(
-            json.dumps(
-                {"error": error, "text": text, "percentage": int(self.percentage)}
-            )
-        )
-        sys.stdout.flush()
-
 
 async def main() -> int:
-    if len(sys.argv) != 2:
-        print(f"Usage: {sys.argv[0]} [document-to-pixels]|[pixels-to-pdf]")
-        return -1
-
-    converter = DangerzoneConverter()
+    converter = DocumentToPixels()
 
     try:
-        if sys.argv[1] == "document-to-pixels":
-            await converter.document_to_pixels()
-        elif sys.argv[1] == "pixels-to-pdf":
-            await converter.pixels_to_pdf()
+        await converter.convert()
     except (RuntimeError, TimeoutError, ValueError) as e:
         converter.update_progress(str(e), error=True)
         return 1
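
With the CLI dispatch gone, the refactored entrypoint can also be driven
directly; an illustrative quick check (not part of the diff):

```python
# Illustrative only: drive the converter directly; it still expects the
# container-side input files (e.g. under /tmp/input_file) to exist.
import asyncio

from dangerzone.conversion.doc_to_pixels import DocumentToPixels

asyncio.run(DocumentToPixels().convert())
```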

File: dangerzone/conversion/pixels_to_pdf.py (new file)

@@ -0,0 +1,166 @@
#!/usr/bin/env python3
"""
Here are the steps, with progress bar percentages:

- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages)
- 95%-100%: Compress the final PDF
"""

import asyncio
import glob
import json
import os
import shutil
import sys

from .common import DangerzoneConverter, run_command


class PixelsToPDF(DangerzoneConverter):
    async def convert(self) -> None:
        self.percentage = 50.0
        num_pages = len(glob.glob("/tmp/dangerzone/page-*.rgb"))
        total_size = 0.0

        # Convert RGB files to PDF files
        percentage_per_page = 45.0 / num_pages
        for page in range(1, num_pages + 1):
            filename_base = f"/tmp/dangerzone/page-{page}"
            rgb_filename = f"{filename_base}.rgb"
            width_filename = f"{filename_base}.width"
            height_filename = f"{filename_base}.height"
            png_filename = f"/tmp/page-{page}.png"
            ocr_filename = f"/tmp/page-{page}"
            pdf_filename = f"/tmp/page-{page}.pdf"

            with open(width_filename) as f:
                width = f.read().strip()
            with open(height_filename) as f:
                height = f.read().strip()

            # The first few operations happen on a per-page basis.
            page_size = os.path.getsize(filename_base + ".rgb") / 1024**2
            total_size += page_size
            timeout = self.calculate_timeout(page_size, 1)

            if os.environ.get("OCR") == "1":  # OCR the document
                self.update_progress(
                    f"Converting page {page}/{num_pages} from pixels to searchable PDF"
                )
                await run_command(
                    [
                        "gm",
                        "convert",
                        "-size",
                        f"{width}x{height}",
                        "-depth",
                        "8",
                        f"rgb:{rgb_filename}",
                        f"png:{png_filename}",
                    ],
                    error_message=f"Page {page}/{num_pages} conversion to PNG failed",
                    timeout_message=(
                        "Error converting pixels to PNG, convert timed out after"
                        f" {timeout} seconds"
                    ),
                    timeout=timeout,
                )
                await run_command(
                    [
                        "tesseract",
                        png_filename,
                        ocr_filename,
                        "-l",
                        os.environ.get("OCR_LANGUAGE"),  # type: ignore
                        "--dpi",
                        "70",
                        "pdf",
                    ],
                    error_message=f"Page {page}/{num_pages} OCR failed",
                    timeout_message=(
                        "Error converting PNG to searchable PDF, tesseract timed out"
                        f" after {timeout} seconds"
                    ),
                    timeout=timeout,
                )

            else:  # Don't OCR
                self.update_progress(
                    f"Converting page {page}/{num_pages} from pixels to PDF"
                )
                await run_command(
                    [
                        "gm",
                        "convert",
                        "-size",
                        f"{width}x{height}",
                        "-depth",
                        "8",
                        f"rgb:{rgb_filename}",
                        f"pdf:{pdf_filename}",
                    ],
                    error_message=f"Page {page}/{num_pages} conversion to PDF failed",
                    timeout_message=(
                        "Error converting RGB to PDF, convert timed out after"
                        f" {timeout} seconds"
                    ),
                    timeout=timeout,
                )

            self.percentage += percentage_per_page

        # Next operations apply to the all the pages, so we need to recalculate the
        # timeout.
        timeout = self.calculate_timeout(total_size, num_pages)

        # Merge pages into a single PDF
        self.update_progress(f"Merging {num_pages} pages into a single PDF")
        args = ["pdfunite"]
        for page in range(1, num_pages + 1):
            args.append(f"/tmp/page-{page}.pdf")
        args.append(f"/tmp/safe-output.pdf")
        await run_command(
            args,
            error_message="Merging pages into a single PDF failed",
            timeout_message=(
                "Error merging pages into a single PDF, pdfunite timed out after"
                f" {timeout} seconds"
            ),
            timeout=timeout,
        )
        self.percentage += 2

        # Compress
        self.update_progress("Compressing PDF")
        await run_command(
            ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
            error_message="Compressing PDF failed",
            timeout_message=(
                f"Error compressing PDF, ps2pdf timed out after {timeout} seconds"
            ),
            timeout=timeout,
        )

        self.percentage = 100.0
        self.update_progress("Safe PDF created")

        # Move converted files into /safezone
        shutil.move("/tmp/safe-output.pdf", "/safezone")
        shutil.move("/tmp/safe-output-compressed.pdf", "/safezone")


async def main() -> int:
    converter = PixelsToPDF()

    try:
        await converter.convert()
    except (RuntimeError, TimeoutError, ValueError) as e:
        converter.update_progress(str(e), error=True)
        return 1
    else:
        return 0  # Success!


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
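
As a worked example of the timeout heuristic this class inherits from
`common.py` (constants copied from the listing above; illustrative only):

```python
# Worked example of calculate_timeout from common.py (illustrative only).
TIMEOUT_PER_PAGE = 30.0  # seconds
TIMEOUT_PER_MB = 30.0    # seconds
TIMEOUT_MIN = 60.0       # seconds

size_mib, pages = 2.0, 100
timeout = max(TIMEOUT_PER_MB * size_mib, TIMEOUT_MIN)  # -> 60.0
timeout = max(timeout, TIMEOUT_PER_PAGE * pages)       # -> 3000.0
print(timeout)  # the page count dominates for small, many-page documents
```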

File: dangerzone/isolation_provider/container.py

@@ -262,8 +262,8 @@ class Container(IsolationProvider):
         # Convert document to pixels
         command = [
             "/usr/bin/python3",
-            "/usr/local/bin/dangerzone.py",
-            "document-to-pixels",
+            "-m",
+            "dangerzone.conversion.doc_to_pixels",
         ]
         extra_args = [
             "-v",
@@ -282,8 +282,8 @@ class Container(IsolationProvider):
         # Convert pixels to safe PDF
         command = [
             "/usr/bin/python3",
-            "/usr/local/bin/dangerzone.py",
-            "pixels-to-pdf",
+            "-m",
+            "dangerzone.conversion.pixels_to_pdf",
         ]
         extra_args = [
             "-v",

File: install script (podman)

@@ -5,7 +5,7 @@ set -e
 TAG=dangerzone.rocks/dangerzone:latest
 
 echo "Building container image"
-podman build container --tag $TAG
+podman build dangerzone/ -f Dockerfile --tag $TAG
 
 echo "Saving and compressing container image"
 podman save $TAG | gzip > share/container.tar.gz

File: install script (docker)

@@ -5,7 +5,7 @@ set -e
 TAG=dangerzone.rocks/dangerzone:latest
 
 echo "Building container image"
-docker build container --tag $TAG
+docker build dangerzone/ -f Dockerfile --tag $TAG
 
 echo "Saving and compressing container image"
 docker save $TAG | gzip > share/container.tar.gz

File: install script (Python wrapper around docker build)

@@ -9,7 +9,9 @@ def main():
         [
             "docker",
             "build",
-            "container",
+            "dangerzone/",
+            "-f",
+            "Dockerfile",
             "--tag",
             "dangerzone.rocks/dangerzone:latest",
         ]

File: setup.py

@@ -29,7 +29,12 @@ dangerous PDFs, office documents, or images and converts them to safe PDFs. \
 It uses container technology to convert the documents within a secure sandbox.\
 """,
     url="https://github.com/freedomofpress/dangerzone",
-    packages=["dangerzone", "dangerzone.gui", "dangerzone.isolation_provider"],
+    packages=[
+        "dangerzone",
+        "dangerzone.conversion",
+        "dangerzone.gui",
+        "dangerzone.isolation_provider",
+    ],
     data_files=[
         (
             "share/applications",