Add PyMuPDF in pixels_to_pdf replacing old logic

Adding PyMuPDF essentially makes the code much simpler, since it can do
everything that we previously needed multiple programs for. It also includes
Tesseract OCR integration, which this commit makes use of.
This commit is contained in:
deeplow 2023-11-16 08:35:03 +00:00
parent ba17016643
commit 77d5ea5940
No known key found for this signature in database
GPG key ID: 577982871529A52A
6 changed files with 43 additions and 168 deletions

View file

@ -44,6 +44,13 @@ def calculate_timeout(size: float, pages: Optional[float] = None) -> float:
return timeout
def get_tessdata_dir() -> str:
    """Return the tessdata directory path for the current environment.

    One path is returned when ``running_on_qubes()`` is truthy and another
    otherwise. Both values keep their trailing slash.
    """
    qubes_dir = "/usr/share/tesseract/tessdata/"
    default_dir = "/usr/share/tessdata/"
    return qubes_dir if running_on_qubes() else default_dir
class DangerzoneConverter:
def __init__(self, progress_callback: Optional[Callable] = None) -> None:
self.percentage: float = 0.0

View file

@ -13,7 +13,9 @@ import shutil
import sys
from typing import Optional
from .common import DangerzoneConverter, running_on_qubes
import fitz
from .common import DangerzoneConverter, get_tessdata_dir, running_on_qubes
class PixelsToPDF(DangerzoneConverter):
@ -27,90 +29,45 @@ class PixelsToPDF(DangerzoneConverter):
num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb"))
total_size = 0.0
safe_doc = fitz.Document()
# Convert RGB files to PDF files
percentage_per_page = 45.0 / num_pages
for page in range(1, num_pages + 1):
filename_base = f"{tempdir}/dangerzone/page-{page}"
for page_num in range(1, num_pages + 1):
filename_base = f"{tempdir}/dangerzone/page-{page_num}"
rgb_filename = f"{filename_base}.rgb"
width_filename = f"{filename_base}.width"
height_filename = f"{filename_base}.height"
png_filename = f"{tempdir}/page-{page}.png"
ocr_filename = f"{tempdir}/page-{page}"
pdf_filename = f"{tempdir}/page-{page}.pdf"
with open(width_filename) as f:
width = f.read().strip()
width = int(f.read().strip())
with open(height_filename) as f:
height = f.read().strip()
height = int(f.read().strip())
with open(rgb_filename, "rb") as rgb_f:
untrusted_rgb_data = rgb_f.read()
# The first few operations happen on a per-page basis.
page_size = os.path.getsize(filename_base + ".rgb") / 1024**2
page_size = len(untrusted_rgb_data)
total_size += page_size
timeout = self.calculate_timeout(page_size, 1)
pixmap = fitz.Pixmap(
fitz.Colorspace(fitz.CS_RGB), width, height, untrusted_rgb_data, False
)
if ocr_lang: # OCR the document
self.update_progress(
f"Converting page {page}/{num_pages} from pixels to searchable PDF"
f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"
)
await self.run_command(
[
"gm",
"convert",
"-size",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"png:{png_filename}",
],
error_message=f"Page {page}/{num_pages} conversion to PNG failed",
timeout_message=(
"Error converting pixels to PNG, convert timed out after"
f" {timeout} seconds"
),
timeout=timeout,
ocr_pdf_bytes = pixmap.pdfocr_tobytes(
compress=True,
language=ocr_lang,
tessdata=get_tessdata_dir(),
)
await self.run_command(
[
"tesseract",
png_filename,
ocr_filename,
"-l",
ocr_lang,
"--dpi",
"70",
"pdf",
],
error_message=f"Page {page}/{num_pages} OCR failed",
timeout_message=(
"Error converting PNG to searchable PDF, tesseract timed out"
f" after {timeout} seconds"
),
timeout=timeout,
)
ocr_pdf = fitz.open("pdf", ocr_pdf_bytes)
safe_doc.insert_pdf(ocr_pdf)
else: # Don't OCR
self.update_progress(
f"Converting page {page}/{num_pages} from pixels to PDF"
)
await self.run_command(
[
"gm",
"convert",
"-size",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"pdf:{pdf_filename}",
],
error_message=f"Page {page}/{num_pages} conversion to PDF failed",
timeout_message=(
"Error converting RGB to PDF, convert timed out after"
f" {timeout} seconds"
),
timeout=timeout,
f"Converting page {page_num}/{num_pages} from pixels to PDF"
)
safe_doc.insert_file(pixmap)
self.percentage += percentage_per_page
@ -118,46 +75,16 @@ class PixelsToPDF(DangerzoneConverter):
# timeout.
timeout = self.calculate_timeout(total_size, num_pages)
# Merge pages into a single PDF
self.update_progress(f"Merging {num_pages} pages into a single PDF")
args = ["pdfunite"]
for page in range(1, num_pages + 1):
args.append(f"{tempdir}/page-{page}.pdf")
args.append(f"{tempdir}/safe-output.pdf")
await self.run_command(
args,
error_message="Merging pages into a single PDF failed",
timeout_message=(
"Error merging pages into a single PDF, pdfunite timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)
self.percentage += 2
# Compress
self.update_progress("Compressing PDF")
await self.run_command(
[
"ps2pdf",
f"{tempdir}/safe-output.pdf",
f"{tempdir}/safe-output-compressed.pdf",
],
error_message="Compressing PDF failed",
timeout_message=(
f"Error compressing PDF, ps2pdf timed out after {timeout} seconds"
),
timeout=timeout,
)
self.percentage = 100.0
self.update_progress("Safe PDF created")
# Move converted files into /safezone
if not running_on_qubes():
shutil.move(f"{tempdir}/safe-output.pdf", "/safezone")
shutil.move(f"{tempdir}/safe-output-compressed.pdf", "/safezone")
if running_on_qubes():
safe_pdf_path = f"{tempdir}/safe-output-compressed.pdf"
else:
safe_pdf_path = f"/safezone/safe-output-compressed.pdf"
safe_doc.save(safe_pdf_path, deflate_images=True)
async def main() -> int:

View file

@ -324,6 +324,8 @@ class Container(IsolationProvider):
"-v",
f"{safe_dir}:/safezone:Z",
"-e",
"TESSDATA_PREFIX=/usr/share/tessdata",
"-e",
f"OCR={ocr}",
"-e",
f"OCR_LANGUAGE={ocr_lang}",

View file

@ -72,6 +72,7 @@ BuildRequires: python3-devel
%if 0%{?_qubes}
# Qubes-only requirements (server-side)
Requires: python3-magic
Requires: python3-PyMuPDF
Requires: libreoffice
# Qubes-only requirements (client-side)
Requires: GraphicsMagick

66
poetry.lock generated
View file

@ -63,7 +63,6 @@ packaging = ">=22.0"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\" and implementation_name == \"cpython\""}
typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
[package.extras]
@ -195,7 +194,6 @@ files = [
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
[[package]]
name = "colorama"
@ -406,7 +404,6 @@ files = [
]
[package.dependencies]
typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""}
zipp = ">=0.5"
[package.extras]
@ -555,7 +552,6 @@ files = [
[package.dependencies]
mypy-extensions = ">=1.0.0"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""}
typing-extensions = ">=4.1.0"
[package.extras]
@ -608,9 +604,6 @@ files = [
{file = "platformdirs-4.0.0.tar.gz", hash = "sha256:cb633b2bcf10c51af60beb0ab06d2f1d69064b43abf4c185ca6b28865f3f9731"},
]
[package.dependencies]
typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.8\""}
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"]
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"]
@ -626,9 +619,6 @@ files = [
{file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"},
]
[package.dependencies]
importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
[package.extras]
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
@ -656,7 +646,6 @@ files = [
[package.dependencies]
altgraph = "*"
importlib-metadata = {version = ">=1.4", markers = "python_version < \"3.8\""}
macholib = {version = ">=1.8", markers = "sys_platform == \"darwin\""}
pyinstaller-hooks-contrib = ">=2021.4"
setuptools = ">=42.0.0"
@ -741,7 +730,6 @@ files = [
[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<2.0"
@ -909,56 +897,6 @@ files = [
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]
[[package]]
name = "typed-ast"
version = "1.5.5"
description = "a fork of Python 2 and 3 ast modules with type comment support"
optional = false
python-versions = ">=3.6"
files = [
{file = "typed_ast-1.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4bc1efe0ce3ffb74784e06460f01a223ac1f6ab31c6bc0376a21184bf5aabe3b"},
{file = "typed_ast-1.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5f7a8c46a8b333f71abd61d7ab9255440d4a588f34a21f126bbfc95f6049e686"},
{file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:597fc66b4162f959ee6a96b978c0435bd63791e31e4f410622d19f1686d5e769"},
{file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d41b7a686ce653e06c2609075d397ebd5b969d821b9797d029fccd71fdec8e04"},
{file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5fe83a9a44c4ce67c796a1b466c270c1272e176603d5e06f6afbc101a572859d"},
{file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5c0c112a74c0e5db2c75882a0adf3133adedcdbfd8cf7c9d6ed77365ab90a1d"},
{file = "typed_ast-1.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:e1a976ed4cc2d71bb073e1b2a250892a6e968ff02aa14c1f40eba4f365ffec02"},
{file = "typed_ast-1.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c631da9710271cb67b08bd3f3813b7af7f4c69c319b75475436fcab8c3d21bee"},
{file = "typed_ast-1.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b445c2abfecab89a932b20bd8261488d574591173d07827c1eda32c457358b18"},
{file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc95ffaaab2be3b25eb938779e43f513e0e538a84dd14a5d844b8f2932593d88"},
{file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61443214d9b4c660dcf4b5307f15c12cb30bdfe9588ce6158f4a005baeb167b2"},
{file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6eb936d107e4d474940469e8ec5b380c9b329b5f08b78282d46baeebd3692dc9"},
{file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e48bf27022897577d8479eaed64701ecaf0467182448bd95759883300ca818c8"},
{file = "typed_ast-1.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:83509f9324011c9a39faaef0922c6f720f9623afe3fe220b6d0b15638247206b"},
{file = "typed_ast-1.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:44f214394fc1af23ca6d4e9e744804d890045d1643dd7e8229951e0ef39429b5"},
{file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:118c1ce46ce58fda78503eae14b7664163aa735b620b64b5b725453696f2a35c"},
{file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be4919b808efa61101456e87f2d4c75b228f4e52618621c77f1ddcaae15904fa"},
{file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:fc2b8c4e1bc5cd96c1a823a885e6b158f8451cf6f5530e1829390b4d27d0807f"},
{file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:16f7313e0a08c7de57f2998c85e2a69a642e97cb32f87eb65fbfe88381a5e44d"},
{file = "typed_ast-1.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:2b946ef8c04f77230489f75b4b5a4a6f24c078be4aed241cfabe9cbf4156e7e5"},
{file = "typed_ast-1.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2188bc33d85951ea4ddad55d2b35598b2709d122c11c75cffd529fbc9965508e"},
{file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0635900d16ae133cab3b26c607586131269f88266954eb04ec31535c9a12ef1e"},
{file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57bfc3cf35a0f2fdf0a88a3044aafaec1d2f24d8ae8cd87c4f58d615fb5b6311"},
{file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:fe58ef6a764de7b4b36edfc8592641f56e69b7163bba9f9c8089838ee596bfb2"},
{file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d09d930c2d1d621f717bb217bf1fe2584616febb5138d9b3e8cdd26506c3f6d4"},
{file = "typed_ast-1.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:d40c10326893ecab8a80a53039164a224984339b2c32a6baf55ecbd5b1df6431"},
{file = "typed_ast-1.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd946abf3c31fb50eee07451a6aedbfff912fcd13cf357363f5b4e834cc5e71a"},
{file = "typed_ast-1.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ed4a1a42df8a3dfb6b40c3d2de109e935949f2f66b19703eafade03173f8f437"},
{file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:045f9930a1550d9352464e5149710d56a2aed23a2ffe78946478f7b5416f1ede"},
{file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:381eed9c95484ceef5ced626355fdc0765ab51d8553fec08661dce654a935db4"},
{file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bfd39a41c0ef6f31684daff53befddae608f9daf6957140228a08e51f312d7e6"},
{file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8c524eb3024edcc04e288db9541fe1f438f82d281e591c548903d5b77ad1ddd4"},
{file = "typed_ast-1.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:7f58fabdde8dcbe764cef5e1a7fcb440f2463c1bbbec1cf2a86ca7bc1f95184b"},
{file = "typed_ast-1.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:042eb665ff6bf020dd2243307d11ed626306b82812aba21836096d229fdc6a10"},
{file = "typed_ast-1.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:622e4a006472b05cf6ef7f9f2636edc51bda670b7bbffa18d26b255269d3d814"},
{file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1efebbbf4604ad1283e963e8915daa240cb4bf5067053cf2f0baadc4d4fb51b8"},
{file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0aefdd66f1784c58f65b502b6cf8b121544680456d1cebbd300c2c813899274"},
{file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:48074261a842acf825af1968cd912f6f21357316080ebaca5f19abbb11690c8a"},
{file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:429ae404f69dc94b9361bb62291885894b7c6fb4640d561179548c849f8492ba"},
{file = "typed_ast-1.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:335f22ccb244da2b5c296e6f96b06ee9bed46526db0de38d2f0e5a6597b81155"},
{file = "typed_ast-1.5.5.tar.gz", hash = "sha256:94282f7a354f36ef5dbce0ef3467ebf6a258e370ab33d5b40c249fa996e590dd"},
]
[[package]]
name = "types-markdown"
version = "3.5.0.3"
@ -1040,5 +978,5 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
[metadata]
lock-version = "2.0"
python-versions = ">=3.7,<3.12"
content-hash = "fd4f4e9362f23cf48a16bfe8c11ed4eb68a4ae7d515c33f5e98b42ef270cac57"
python-versions = ">=3.8,<3.12"
content-hash = "7afa934aee6e88893523238bd61166393f04956ebd3c3185449335411e91215a"

View file

@ -12,7 +12,7 @@ include = [
]
[tool.poetry.dependencies]
python = ">=3.7,<3.12"
python = ">=3.8,<3.12"
click = "*"
appdirs = "*"
PySide6 = "^6.4.1"