Remove PDFtk dependency (replace w/ pdftoppm)

PDFtk isn't actually needed. It was being used for breaking a PDF
into pages, but this is something that can be replaced by the already
present 'pdftoppm'. Furthermore, by removing this dependency we contribute to
reproducible builds and overall supply chain security, because it was
obtained from GitLab with no signature verification or version pinning.

Replacing it with 'pdftoppm' also lets us take a shortcut:
 - before: PDF -> PDF pages -> PNG images -> RGB images
 - after:  PDF -> PPM images -> RGB images

And this last conversion step is trivial since the RGB format we were
using is just a PPM file without the metadata in its header.
This commit is contained in:
deeplow 2022-10-21 15:55:46 +01:00
parent 08937239a5
commit d28aa5a25b
No known key found for this signature in database
GPG key ID: 577982871529A52A
2 changed files with 124 additions and 76 deletions

View file

@ -10,7 +10,6 @@ RUN apk -U upgrade && \
poppler-utils \ poppler-utils \
python3 \ python3 \
py3-magic \ py3-magic \
py3-pillow \
sudo \ sudo \
tesseract-ocr \ tesseract-ocr \
tesseract-ocr-data-afr \ tesseract-ocr-data-afr \
@ -78,15 +77,6 @@ RUN apk -U upgrade && \
tesseract-ocr-data-ukr \ tesseract-ocr-data-ukr \
tesseract-ocr-data-vie tesseract-ocr-data-vie
# Install pdftk
RUN \
wget https://gitlab.com/pdftk-java/pdftk/-/jobs/924565145/artifacts/raw/build/libs/pdftk-all.jar && \
mv pdftk-all.jar /usr/local/bin && \
chmod +x /usr/local/bin/pdftk-all.jar && \
echo '#!/bin/sh' > /usr/local/bin/pdftk && \
echo '/usr/bin/java -jar "/usr/local/bin/pdftk-all.jar" "$@"' >> /usr/local/bin/pdftk && \
chmod +x /usr/local/bin/pdftk
COPY dangerzone.py /usr/local/bin/ COPY dangerzone.py /usr/local/bin/
RUN chmod +x /usr/local/bin/dangerzone.py RUN chmod +x /usr/local/bin/dangerzone.py

View file

@ -15,13 +15,14 @@ pixels_to_pdf:
import glob import glob
import json import json
import os import os
import re
import shutil import shutil
import subprocess import subprocess
import sys import sys
from typing import Dict, List, Optional import time
from typing import Callable, Dict, List, Optional
import magic import magic
from PIL import Image
# timeout in seconds for any single subprocess # timeout in seconds for any single subprocess
DEFAULT_TIMEOUT: float = 120 DEFAULT_TIMEOUT: float = 120
@ -36,26 +37,64 @@ def run_command(
error_message: str, error_message: str,
timeout_message: str, timeout_message: str,
timeout: float = DEFAULT_TIMEOUT, timeout: float = DEFAULT_TIMEOUT,
) -> subprocess.CompletedProcess: stdout_callback: Callable = None,
stderr_callback: Callable = None,
) -> None:
""" """
Runs a command and returns the result. Runs a command and returns the result.
:raises RuntimeError: if the process returns a non-zero exit status :raises RuntimeError: if the process returns a non-zero exit status
:raises TimeoutError: if the process times out :raises TimeoutError: if the process times out
""" """
if stdout_callback is None and stderr_callback is None:
try: try:
return subprocess.run( subprocess.run(args, timeout=timeout, check=True)
args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=timeout,
check=True,
)
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
raise RuntimeError(error_message) from e raise RuntimeError(error_message) from e
except subprocess.TimeoutExpired as e: except subprocess.TimeoutExpired as e:
raise TimeoutError(timeout_message) from e raise TimeoutError(timeout_message) from e
else:
p = subprocess.Popen(
args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)
# Progress callback requires a manually implemented timeout
start_time = time.time()
# Make reading from stdout or stderr non-blocking
if p.stdout:
os.set_blocking(p.stdout.fileno(), False)
if p.stderr:
os.set_blocking(p.stderr.fileno(), False)
while True:
# Process has finished
if p.poll() is not None:
if p.returncode != 0:
raise RuntimeError(error_message)
break
# Check if timeout hasn't expired
if time.time() - start_time > timeout:
p.kill()
raise TimeoutError(timeout_message)
if p.stdout and stdout_callback is not None:
line = p.stdout.readline()
if len(line) > 0:
line = line.rstrip() # strip trailing "\n"
stdout_callback(line)
if p.stderr and stderr_callback is not None:
line = p.stderr.readline()
if len(line) > 0:
line = line.rstrip() # strip trailing "\n"
stderr_callback(line)
class DangerzoneConverter: class DangerzoneConverter:
def __init__(self) -> None: def __init__(self) -> None:
@ -181,66 +220,85 @@ class DangerzoneConverter:
) )
self.percentage += 3 self.percentage += 3
# Separate PDF into pages self.update_progress("Obtaining PDF metadata")
self.update_progress("Separating document into pages")
args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"]
run_command(
args,
error_message="Separating document into pages failed",
timeout_message=f"Error separating document into pages, pdftk timed out after {DEFAULT_TIMEOUT} seconds",
)
page_filenames = glob.glob("/tmp/page-*.pdf") def pdftoppm_progress_callback(line: str) -> None:
"""Function called for every line the 'pdftoppm' command outputs
self.percentage += 2 Sample pdftoppm output:
# Convert to RGB pixel data $ pdftoppm sample.pdf /tmp/safe -progress
percentage_per_page = 45.0 / len(page_filenames) 1 4 /tmp/safe-1.ppm
for page in range(1, len(page_filenames) + 1): 2 4 /tmp/safe-2.ppm
pdf_filename = f"/tmp/page-{page}.pdf" 3 4 /tmp/safe-3.ppm
png_filename = f"/tmp/page-{page}.png" 4 4 /tmp/safe-4.ppm
rgb_filename = f"/tmp/page-{page}.rgb"
width_filename = f"/tmp/page-{page}.width"
height_filename = f"/tmp/page-{page}.height"
filename_base = f"/tmp/page-{page}"
self.update_progress( Each successful line is in the format "{page} {page_num} {ppm_filename}"
f"Converting page {page}/{len(page_filenames)} to pixels" """
) try:
(page_str, num_pages_str, _) = line.split()
num_pages = int(num_pages_str)
page = int(page_str)
except ValueError as e:
raise RuntimeError("Conversion from PDF to PPM failed") from e
# Convert to png percentage_per_page = 45.0 / num_pages
run_command( self.percentage += percentage_per_page
["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base], self.update_progress(f"Converting page {page}/{num_pages} to pixels")
error_message="Conversion from PDF to PNG failed",
timeout_message=f"Error converting from PDF to PNG, pdftocairo timed out after {DEFAULT_TIMEOUT} seconds", zero_padding = "0" * (len(num_pages_str) - len(page_str))
) ppm_filename = f"{page_base}-{zero_padding}{page}.ppm"
rgb_filename = f"{page_base}-{page}.rgb"
width_filename = f"{page_base}-{page}.width"
height_filename = f"{page_base}-{page}.height"
filename_base = f"{page_base}-{page}"
with open(ppm_filename, "rb") as f:
# NOTE: PPM files have multiple ways of writing headers.
# For our specific case we parse it expecting the header format that pdftoppm produces
# More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html
# Read the header
header = f.readline().decode().strip()
if header != "P6":
raise ValueError("Invalid PPM header")
# Save the width and height # Save the width and height
with Image.open(png_filename, "r") as im: dims = f.readline().decode().strip()
width, height = im.size width, height = dims.split()
with open(width_filename, "w") as f: with open(width_filename, "w") as width_file:
f.write(str(width)) width_file.write(width)
with open(height_filename, "w") as f: with open(height_filename, "w") as height_file:
f.write(str(height)) height_file.write(height)
# Convert to RGB pixels maxval = int(f.readline().decode().strip())
# Check that the depth is 8
if maxval != 255:
raise ValueError("Invalid PPM depth")
data = f.read()
# Save pixel data
with open(rgb_filename, "wb") as f:
f.write(data)
# Delete the ppm file
os.remove(ppm_filename)
page_base = "/tmp/page"
# Convert to PPM, which is essentially an RGB format
run_command( run_command(
[ [
"gm", "pdftoppm",
"convert", pdf_filename,
png_filename, page_base,
"-depth", "-progress",
"8",
f"rgb:{rgb_filename}",
], ],
error_message="Conversion from PNG to RGB failed", error_message="Conversion from PDF to PPM failed",
timeout_message=f"Error converting from PNG to pixels, convert timed out after {DEFAULT_TIMEOUT} seconds", timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {DEFAULT_TIMEOUT} seconds",
stderr_callback=pdftoppm_progress_callback,
) )
# Delete the png
os.remove(png_filename)
self.percentage += percentage_per_page
self.update_progress("Converted document to pixels") self.update_progress("Converted document to pixels")
# Move converted files into /dangerzone # Move converted files into /dangerzone