mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Remove PDFtk dependency (replace w/ pdftoppm)
PDFtk isn't actually needed. It was being used to break a PDF into pages, but this is something that can be done by the already present 'pdftoppm'. Furthermore, by removing this dependency we contribute to reproducible builds and overall supply chain security, because it was obtained from GitLab with no signature verification or version pinning. The replacement 'pdftoppm' enabled us to take a shortcut: - before: PDF -> PDF pages -> PNG images -> RGB images; - after: PDF -> PPM images -> RGB images. This last conversion step is trivial, since the RGB format we were using is just a PPM file without the metadata in its header.
This commit is contained in:
parent
08937239a5
commit
d28aa5a25b
2 changed files with 124 additions and 76 deletions
|
@ -10,7 +10,6 @@ RUN apk -U upgrade && \
|
||||||
poppler-utils \
|
poppler-utils \
|
||||||
python3 \
|
python3 \
|
||||||
py3-magic \
|
py3-magic \
|
||||||
py3-pillow \
|
|
||||||
sudo \
|
sudo \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
tesseract-ocr-data-afr \
|
tesseract-ocr-data-afr \
|
||||||
|
@ -78,15 +77,6 @@ RUN apk -U upgrade && \
|
||||||
tesseract-ocr-data-ukr \
|
tesseract-ocr-data-ukr \
|
||||||
tesseract-ocr-data-vie
|
tesseract-ocr-data-vie
|
||||||
|
|
||||||
# Install pdftk
|
|
||||||
RUN \
|
|
||||||
wget https://gitlab.com/pdftk-java/pdftk/-/jobs/924565145/artifacts/raw/build/libs/pdftk-all.jar && \
|
|
||||||
mv pdftk-all.jar /usr/local/bin && \
|
|
||||||
chmod +x /usr/local/bin/pdftk-all.jar && \
|
|
||||||
echo '#!/bin/sh' > /usr/local/bin/pdftk && \
|
|
||||||
echo '/usr/bin/java -jar "/usr/local/bin/pdftk-all.jar" "$@"' >> /usr/local/bin/pdftk && \
|
|
||||||
chmod +x /usr/local/bin/pdftk
|
|
||||||
|
|
||||||
COPY dangerzone.py /usr/local/bin/
|
COPY dangerzone.py /usr/local/bin/
|
||||||
RUN chmod +x /usr/local/bin/dangerzone.py
|
RUN chmod +x /usr/local/bin/dangerzone.py
|
||||||
|
|
||||||
|
|
|
@ -15,13 +15,14 @@ pixels_to_pdf:
|
||||||
import glob
|
import glob
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
from typing import Dict, List, Optional
|
import time
|
||||||
|
from typing import Callable, Dict, List, Optional
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
# timeout in seconds for any single subprocess
|
# timeout in seconds for any single subprocess
|
||||||
DEFAULT_TIMEOUT: float = 120
|
DEFAULT_TIMEOUT: float = 120
|
||||||
|
@ -36,25 +37,63 @@ def run_command(
|
||||||
error_message: str,
|
error_message: str,
|
||||||
timeout_message: str,
|
timeout_message: str,
|
||||||
timeout: float = DEFAULT_TIMEOUT,
|
timeout: float = DEFAULT_TIMEOUT,
|
||||||
) -> subprocess.CompletedProcess:
|
stdout_callback: Callable = None,
|
||||||
|
stderr_callback: Callable = None,
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Runs a command and returns the result.
|
Runs a command and returns the result.
|
||||||
|
|
||||||
:raises RuntimeError: if the process returns a non-zero exit status
|
:raises RuntimeError: if the process returns a non-zero exit status
|
||||||
:raises TimeoutError: if the process times out
|
:raises TimeoutError: if the process times out
|
||||||
"""
|
"""
|
||||||
try:
|
if stdout_callback is None and stderr_callback is None:
|
||||||
return subprocess.run(
|
try:
|
||||||
|
subprocess.run(args, timeout=timeout, check=True)
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
raise RuntimeError(error_message) from e
|
||||||
|
except subprocess.TimeoutExpired as e:
|
||||||
|
raise TimeoutError(timeout_message) from e
|
||||||
|
|
||||||
|
else:
|
||||||
|
p = subprocess.Popen(
|
||||||
args,
|
args,
|
||||||
stdout=subprocess.DEVNULL,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.DEVNULL,
|
stderr=subprocess.PIPE,
|
||||||
timeout=timeout,
|
universal_newlines=True,
|
||||||
check=True,
|
|
||||||
)
|
)
|
||||||
except subprocess.CalledProcessError as e:
|
|
||||||
raise RuntimeError(error_message) from e
|
# Progress callback requires a manually implemented timeout
|
||||||
except subprocess.TimeoutExpired as e:
|
start_time = time.time()
|
||||||
raise TimeoutError(timeout_message) from e
|
|
||||||
|
# Make reading from stdout or stderr non-blocking
|
||||||
|
if p.stdout:
|
||||||
|
os.set_blocking(p.stdout.fileno(), False)
|
||||||
|
if p.stderr:
|
||||||
|
os.set_blocking(p.stderr.fileno(), False)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
# Process has finished
|
||||||
|
if p.poll() is not None:
|
||||||
|
if p.returncode != 0:
|
||||||
|
raise RuntimeError(error_message)
|
||||||
|
break
|
||||||
|
|
||||||
|
# Check whether the timeout has expired
|
||||||
|
if time.time() - start_time > timeout:
|
||||||
|
p.kill()
|
||||||
|
raise TimeoutError(timeout_message)
|
||||||
|
|
||||||
|
if p.stdout and stdout_callback is not None:
|
||||||
|
line = p.stdout.readline()
|
||||||
|
if len(line) > 0:
|
||||||
|
line = line.rstrip() # strip trailing "\n"
|
||||||
|
stdout_callback(line)
|
||||||
|
|
||||||
|
if p.stderr and stderr_callback is not None:
|
||||||
|
line = p.stderr.readline()
|
||||||
|
if len(line) > 0:
|
||||||
|
line = line.rstrip() # strip trailing "\n"
|
||||||
|
stderr_callback(line)
|
||||||
|
|
||||||
|
|
||||||
class DangerzoneConverter:
|
class DangerzoneConverter:
|
||||||
|
@ -181,65 +220,84 @@ class DangerzoneConverter:
|
||||||
)
|
)
|
||||||
self.percentage += 3
|
self.percentage += 3
|
||||||
|
|
||||||
# Separate PDF into pages
|
self.update_progress("Obtaining PDF metadata")
|
||||||
self.update_progress("Separating document into pages")
|
|
||||||
args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"]
|
|
||||||
run_command(
|
|
||||||
args,
|
|
||||||
error_message="Separating document into pages failed",
|
|
||||||
timeout_message=f"Error separating document into pages, pdftk timed out after {DEFAULT_TIMEOUT} seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
page_filenames = glob.glob("/tmp/page-*.pdf")
|
def pdftoppm_progress_callback(line: str) -> None:
|
||||||
|
"""Function called for every line the 'pdftoppm' command outputs
|
||||||
|
|
||||||
self.percentage += 2
|
Sample pdftoppm output:
|
||||||
|
|
||||||
# Convert to RGB pixel data
|
$ pdftoppm sample.pdf /tmp/safe -progress
|
||||||
percentage_per_page = 45.0 / len(page_filenames)
|
1 4 /tmp/safe-1.ppm
|
||||||
for page in range(1, len(page_filenames) + 1):
|
2 4 /tmp/safe-2.ppm
|
||||||
pdf_filename = f"/tmp/page-{page}.pdf"
|
3 4 /tmp/safe-3.ppm
|
||||||
png_filename = f"/tmp/page-{page}.png"
|
4 4 /tmp/safe-4.ppm
|
||||||
rgb_filename = f"/tmp/page-{page}.rgb"
|
|
||||||
width_filename = f"/tmp/page-{page}.width"
|
|
||||||
height_filename = f"/tmp/page-{page}.height"
|
|
||||||
filename_base = f"/tmp/page-{page}"
|
|
||||||
|
|
||||||
self.update_progress(
|
Each successful line is in the format "{page} {num_pages} {ppm_filename}"
|
||||||
f"Converting page {page}/{len(page_filenames)} to pixels"
|
"""
|
||||||
)
|
try:
|
||||||
|
(page_str, num_pages_str, _) = line.split()
|
||||||
|
num_pages = int(num_pages_str)
|
||||||
|
page = int(page_str)
|
||||||
|
except ValueError as e:
|
||||||
|
raise RuntimeError("Conversion from PDF to PPM failed") from e
|
||||||
|
|
||||||
# Convert to png
|
percentage_per_page = 45.0 / num_pages
|
||||||
run_command(
|
|
||||||
["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
|
|
||||||
error_message="Conversion from PDF to PNG failed",
|
|
||||||
timeout_message=f"Error converting from PDF to PNG, pdftocairo timed out after {DEFAULT_TIMEOUT} seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Save the width and height
|
|
||||||
with Image.open(png_filename, "r") as im:
|
|
||||||
width, height = im.size
|
|
||||||
with open(width_filename, "w") as f:
|
|
||||||
f.write(str(width))
|
|
||||||
with open(height_filename, "w") as f:
|
|
||||||
f.write(str(height))
|
|
||||||
|
|
||||||
# Convert to RGB pixels
|
|
||||||
run_command(
|
|
||||||
[
|
|
||||||
"gm",
|
|
||||||
"convert",
|
|
||||||
png_filename,
|
|
||||||
"-depth",
|
|
||||||
"8",
|
|
||||||
f"rgb:{rgb_filename}",
|
|
||||||
],
|
|
||||||
error_message="Conversion from PNG to RGB failed",
|
|
||||||
timeout_message=f"Error converting from PNG to pixels, convert timed out after {DEFAULT_TIMEOUT} seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Delete the png
|
|
||||||
os.remove(png_filename)
|
|
||||||
self.percentage += percentage_per_page
|
self.percentage += percentage_per_page
|
||||||
|
self.update_progress(f"Converting page {page}/{num_pages} to pixels")
|
||||||
|
|
||||||
|
zero_padding = "0" * (len(num_pages_str) - len(page_str))
|
||||||
|
ppm_filename = f"{page_base}-{zero_padding}{page}.ppm"
|
||||||
|
rgb_filename = f"{page_base}-{page}.rgb"
|
||||||
|
width_filename = f"{page_base}-{page}.width"
|
||||||
|
height_filename = f"{page_base}-{page}.height"
|
||||||
|
filename_base = f"{page_base}-{page}"
|
||||||
|
|
||||||
|
with open(ppm_filename, "rb") as f:
|
||||||
|
# NOTE: PPM files have multiple ways of writing headers.
|
||||||
|
# For our specific case we parse it expecting the header format that pdftoppm produces
|
||||||
|
# More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html
|
||||||
|
|
||||||
|
# Read the header
|
||||||
|
header = f.readline().decode().strip()
|
||||||
|
if header != "P6":
|
||||||
|
raise ValueError("Invalid PPM header")
|
||||||
|
|
||||||
|
# Save the width and height
|
||||||
|
dims = f.readline().decode().strip()
|
||||||
|
width, height = dims.split()
|
||||||
|
with open(width_filename, "w") as width_file:
|
||||||
|
width_file.write(width)
|
||||||
|
with open(height_filename, "w") as height_file:
|
||||||
|
height_file.write(height)
|
||||||
|
|
||||||
|
maxval = int(f.readline().decode().strip())
|
||||||
|
# Check that the depth is 8
|
||||||
|
if maxval != 255:
|
||||||
|
raise ValueError("Invalid PPM depth")
|
||||||
|
|
||||||
|
data = f.read()
|
||||||
|
|
||||||
|
# Save pixel data
|
||||||
|
with open(rgb_filename, "wb") as f:
|
||||||
|
f.write(data)
|
||||||
|
|
||||||
|
# Delete the ppm file
|
||||||
|
os.remove(ppm_filename)
|
||||||
|
|
||||||
|
page_base = "/tmp/page"
|
||||||
|
# Convert to PPM, which is essentially an RGB format
|
||||||
|
run_command(
|
||||||
|
[
|
||||||
|
"pdftoppm",
|
||||||
|
pdf_filename,
|
||||||
|
page_base,
|
||||||
|
"-progress",
|
||||||
|
],
|
||||||
|
error_message="Conversion from PDF to PPM failed",
|
||||||
|
timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {DEFAULT_TIMEOUT} seconds",
|
||||||
|
stderr_callback=pdftoppm_progress_callback,
|
||||||
|
)
|
||||||
|
|
||||||
self.update_progress("Converted document to pixels")
|
self.update_progress("Converted document to pixels")
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue