Replace pdftoppm logic with PyMuPDF (native Python)

Use PyMuPDF (AGPL-licensed) inside the container to replace the
pdftoppm-based conversion of PDF pages to RGB. This massively simplifies
the code, since PyMuPDF is a native Python library.
This commit is contained in:
deeplow 2023-11-13 15:20:07 +00:00
parent e923ac0788
commit 327ab8791f
No known key found for this signature in database
GPG key ID: 577982871529A52A
2 changed files with 20 additions and 80 deletions

View file

@ -7,7 +7,6 @@ ARG H2ORESTART_CHECKSUM=5db816a1e57b510456633f55e693cb5ef3675ef8b35df4f31c90ab9d
RUN apk --no-cache -U upgrade && \ RUN apk --no-cache -U upgrade && \
apk --no-cache add \ apk --no-cache add \
ghostscript \ ghostscript \
graphicsmagick \
libreoffice \ libreoffice \
openjdk8 \ openjdk8 \
poppler-utils \ poppler-utils \
@ -17,6 +16,10 @@ RUN apk --no-cache -U upgrade && \
tesseract-ocr \ tesseract-ocr \
font-noto-cjk font-noto-cjk
RUN apk --no-cache add --virtual .builddeps g++ gcc make python3-dev py3-pip \
&& pip install --upgrade PyMuPDF \
&& apk del .builddeps # FIXME freeze w/ hashes
# Download the trained models from the latest GitHub release of Tesseract, and # Download the trained models from the latest GitHub release of Tesseract, and
# store them under /usr/share/tessdata. This is basically what distro packages # store them under /usr/share/tessdata. This is basically what distro packages
# do under the hood. # do under the hood.

View file

@ -15,6 +15,7 @@ import shutil
import sys import sys
from typing import Dict, List, Optional from typing import Dict, List, Optional
import fitz
import magic import magic
from . import errors from . import errors
@ -267,87 +268,23 @@ class DocumentToPixels(DangerzoneConverter):
# Get a more precise timeout, based on the number of pages # Get a more precise timeout, based on the number of pages
timeout = self.calculate_timeout(size, num_pages) timeout = self.calculate_timeout(size, num_pages)
percentage_per_page = 45.0 / num_pages
async def pdftoppm_progress_callback(line: bytes) -> None:
"""Function called for every line the 'pdftoppm' command outputs
Sample pdftoppm output:
$ pdftoppm sample.pdf /tmp/safe -progress
1 4 /tmp/safe-1.ppm
2 4 /tmp/safe-2.ppm
3 4 /tmp/safe-3.ppm
4 4 /tmp/safe-4.ppm
Each successful line is in the format "{page} {page_num} {ppm_filename}"
"""
try:
(page_str, num_pages_str, _) = line.decode().split()
num_pages = int(num_pages_str)
page = int(page_str)
except ValueError as e:
# Ignore all non-progress related output, since pdftoppm sends
# everything to stderr and thus, errors can't be distinguished
# easily. We rely instead on the exit code.
return
percentage_per_page = 45.0 / num_pages
self.percentage += percentage_per_page
self.update_progress(f"Converting page {page}/{num_pages} to pixels")
zero_padding = "0" * (len(num_pages_str) - len(page_str))
ppm_filename = f"{page_base}-{zero_padding}{page}.ppm"
rgb_filename = f"{page_base}-{page}.rgb"
width_filename = f"{page_base}-{page}.width"
height_filename = f"{page_base}-{page}.height"
filename_base = f"{page_base}-{page}"
with open(ppm_filename, "rb") as f:
# NOTE: PPM files have multiple ways of writing headers.
# For our specific case we parse it expecting the header format that pdftoppm produces
# More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html
# Read the header
header = f.readline().decode().strip()
if header != "P6":
raise errors.PDFtoPPMInvalidHeader()
# Save the width and height
dims = f.readline().decode().strip()
width, height = dims.split()
await self.write_page_width(int(width), width_filename)
await self.write_page_height(int(height), height_filename)
maxval = int(f.readline().decode().strip())
# Check that the depth is 8
if maxval != 255:
raise errors.PDFtoPPMInvalidDepth()
data = f.read()
# Save pixel data
await self.write_page_data(data, rgb_filename)
# Delete the ppm file
os.remove(ppm_filename)
page_base = "/tmp/page" page_base = "/tmp/page"
doc = fitz.open(pdf_filename)
for page in doc:
# TODO check if page.number is doc-controlled
page_num = page.number + 1 # pages start in 1
rgb_filename = f"{page_base}-{page_num}.rgb"
width_filename = f"{page_base}-{page_num}.width"
height_filename = f"{page_base}-{page_num}.height"
await self.run_command( self.percentage += percentage_per_page
[ self.update_progress(f"Converting page {page_num}/{num_pages} to pixels")
"pdftoppm", pix = page.get_pixmap(dpi=150)
pdf_filename, rgb_buf = pix.samples_mv
page_base, await self.write_page_width(pix.width, width_filename)
"-progress", await self.write_page_height(pix.height, height_filename)
], await self.write_page_data(rgb_buf, rgb_filename)
error_message="Conversion from PDF to PPM failed",
timeout_message=(
f"Error converting from PDF to PPM, pdftoppm timed out after {timeout}"
" seconds"
),
stderr_callback=pdftoppm_progress_callback,
timeout=timeout,
)
final_files = ( final_files = (
glob.glob("/tmp/page-*.rgb") glob.glob("/tmp/page-*.rgb")