mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Replace pdftoppm logic with PyMuPDF (native python)
Use PyMuPDF (AGPL-licensed) within the container conversion to replace the pdf conversion to RGB. This massively simplifies the code since PyMuPDF is a native python library.
This commit is contained in:
parent
e923ac0788
commit
327ab8791f
2 changed files with 20 additions and 80 deletions
|
@ -7,7 +7,6 @@ ARG H2ORESTART_CHECKSUM=5db816a1e57b510456633f55e693cb5ef3675ef8b35df4f31c90ab9d
|
||||||
RUN apk --no-cache -U upgrade && \
|
RUN apk --no-cache -U upgrade && \
|
||||||
apk --no-cache add \
|
apk --no-cache add \
|
||||||
ghostscript \
|
ghostscript \
|
||||||
graphicsmagick \
|
|
||||||
libreoffice \
|
libreoffice \
|
||||||
openjdk8 \
|
openjdk8 \
|
||||||
poppler-utils \
|
poppler-utils \
|
||||||
|
@ -17,6 +16,10 @@ RUN apk --no-cache -U upgrade && \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
font-noto-cjk
|
font-noto-cjk
|
||||||
|
|
||||||
|
RUN apk --no-cache add --virtual .builddeps g++ gcc make python3-dev py3-pip \
|
||||||
|
&& pip install --upgrade PyMuPDF \
|
||||||
|
&& apk del .builddeps # FIXME freeze w/ hashes
|
||||||
|
|
||||||
# Download the trained models from the latest GitHub release of Tesseract, and
|
# Download the trained models from the latest GitHub release of Tesseract, and
|
||||||
# store them under /usr/share/tessdata. This is basically what distro packages
|
# store them under /usr/share/tessdata. This is basically what distro packages
|
||||||
# do under the hood.
|
# do under the hood.
|
||||||
|
|
|
@ -15,6 +15,7 @@ import shutil
|
||||||
import sys
|
import sys
|
||||||
from typing import Dict, List, Optional
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
import fitz
|
||||||
import magic
|
import magic
|
||||||
|
|
||||||
from . import errors
|
from . import errors
|
||||||
|
@ -267,87 +268,23 @@ class DocumentToPixels(DangerzoneConverter):
|
||||||
|
|
||||||
# Get a more precise timeout, based on the number of pages
|
# Get a more precise timeout, based on the number of pages
|
||||||
timeout = self.calculate_timeout(size, num_pages)
|
timeout = self.calculate_timeout(size, num_pages)
|
||||||
|
percentage_per_page = 45.0 / num_pages
|
||||||
async def pdftoppm_progress_callback(line: bytes) -> None:
|
|
||||||
"""Function called for every line the 'pdftoppm' command outputs
|
|
||||||
|
|
||||||
Sample pdftoppm output:
|
|
||||||
|
|
||||||
$ pdftoppm sample.pdf /tmp/safe -progress
|
|
||||||
1 4 /tmp/safe-1.ppm
|
|
||||||
2 4 /tmp/safe-2.ppm
|
|
||||||
3 4 /tmp/safe-3.ppm
|
|
||||||
4 4 /tmp/safe-4.ppm
|
|
||||||
|
|
||||||
Each successful line is in the format "{page} {page_num} {ppm_filename}"
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
(page_str, num_pages_str, _) = line.decode().split()
|
|
||||||
num_pages = int(num_pages_str)
|
|
||||||
page = int(page_str)
|
|
||||||
except ValueError as e:
|
|
||||||
# Ignore all non-progress related output, since pdftoppm sends
|
|
||||||
# everything to stderr and thus, errors can't be distinguished
|
|
||||||
# easily. We rely instead on the exit code.
|
|
||||||
return
|
|
||||||
|
|
||||||
percentage_per_page = 45.0 / num_pages
|
|
||||||
self.percentage += percentage_per_page
|
|
||||||
self.update_progress(f"Converting page {page}/{num_pages} to pixels")
|
|
||||||
|
|
||||||
zero_padding = "0" * (len(num_pages_str) - len(page_str))
|
|
||||||
ppm_filename = f"{page_base}-{zero_padding}{page}.ppm"
|
|
||||||
rgb_filename = f"{page_base}-{page}.rgb"
|
|
||||||
width_filename = f"{page_base}-{page}.width"
|
|
||||||
height_filename = f"{page_base}-{page}.height"
|
|
||||||
filename_base = f"{page_base}-{page}"
|
|
||||||
|
|
||||||
with open(ppm_filename, "rb") as f:
|
|
||||||
# NOTE: PPM files have multiple ways of writing headers.
|
|
||||||
# For our specific case we parse it expecting the header format that ppmtopdf produces
|
|
||||||
# More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html
|
|
||||||
|
|
||||||
# Read the header
|
|
||||||
header = f.readline().decode().strip()
|
|
||||||
if header != "P6":
|
|
||||||
raise errors.PDFtoPPMInvalidHeader()
|
|
||||||
|
|
||||||
# Save the width and height
|
|
||||||
dims = f.readline().decode().strip()
|
|
||||||
width, height = dims.split()
|
|
||||||
await self.write_page_width(int(width), width_filename)
|
|
||||||
await self.write_page_height(int(height), height_filename)
|
|
||||||
|
|
||||||
maxval = int(f.readline().decode().strip())
|
|
||||||
# Check that the depth is 8
|
|
||||||
if maxval != 255:
|
|
||||||
raise errors.PDFtoPPMInvalidDepth()
|
|
||||||
|
|
||||||
data = f.read()
|
|
||||||
|
|
||||||
# Save pixel data
|
|
||||||
await self.write_page_data(data, rgb_filename)
|
|
||||||
|
|
||||||
# Delete the ppm file
|
|
||||||
os.remove(ppm_filename)
|
|
||||||
|
|
||||||
page_base = "/tmp/page"
|
page_base = "/tmp/page"
|
||||||
|
doc = fitz.open(pdf_filename)
|
||||||
|
for page in doc:
|
||||||
|
# TODO check if page.number is doc-controlled
|
||||||
|
page_num = page.number + 1 # pages start in 1
|
||||||
|
rgb_filename = f"{page_base}-{page_num}.rgb"
|
||||||
|
width_filename = f"{page_base}-{page_num}.width"
|
||||||
|
height_filename = f"{page_base}-{page_num}.height"
|
||||||
|
|
||||||
await self.run_command(
|
self.percentage += percentage_per_page
|
||||||
[
|
self.update_progress(f"Converting page {page_num}/{num_pages} to pixels")
|
||||||
"pdftoppm",
|
pix = page.get_pixmap(dpi=150)
|
||||||
pdf_filename,
|
rgb_buf = pix.samples_mv
|
||||||
page_base,
|
await self.write_page_width(pix.width, width_filename)
|
||||||
"-progress",
|
await self.write_page_height(pix.height, height_filename)
|
||||||
],
|
await self.write_page_data(rgb_buf, rgb_filename)
|
||||||
error_message="Conversion from PDF to PPM failed",
|
|
||||||
timeout_message=(
|
|
||||||
f"Error converting from PDF to PPM, pdftoppm timed out after {timeout}"
|
|
||||||
" seconds"
|
|
||||||
),
|
|
||||||
stderr_callback=pdftoppm_progress_callback,
|
|
||||||
timeout=timeout,
|
|
||||||
)
|
|
||||||
|
|
||||||
final_files = (
|
final_files = (
|
||||||
glob.glob("/tmp/page-*.rgb")
|
glob.glob("/tmp/page-*.rgb")
|
||||||
|
|
Loading…
Reference in a new issue