diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index 4e8e9f3..94f9580 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -44,6 +44,13 @@ def calculate_timeout(size: float, pages: Optional[float] = None) -> float: return timeout +def get_tessdata_dir() -> str: + if running_on_qubes(): + return "/usr/share/tesseract/tessdata/" + else: + return "/usr/share/tessdata/" + + class DangerzoneConverter: def __init__(self, progress_callback: Optional[Callable] = None) -> None: self.percentage: float = 0.0 diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 56b1d66..13516e0 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -13,7 +13,9 @@ import shutil import sys from typing import Optional -from .common import DangerzoneConverter, running_on_qubes +import fitz + +from .common import DangerzoneConverter, get_tessdata_dir, running_on_qubes class PixelsToPDF(DangerzoneConverter): @@ -27,90 +29,45 @@ class PixelsToPDF(DangerzoneConverter): num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb")) total_size = 0.0 + safe_doc = fitz.Document() + # Convert RGB files to PDF files percentage_per_page = 45.0 / num_pages - for page in range(1, num_pages + 1): - filename_base = f"{tempdir}/dangerzone/page-{page}" + for page_num in range(1, num_pages + 1): + filename_base = f"{tempdir}/dangerzone/page-{page_num}" rgb_filename = f"{filename_base}.rgb" width_filename = f"{filename_base}.width" height_filename = f"{filename_base}.height" - png_filename = f"{tempdir}/page-{page}.png" - ocr_filename = f"{tempdir}/page-{page}" - pdf_filename = f"{tempdir}/page-{page}.pdf" with open(width_filename) as f: - width = f.read().strip() + width = int(f.read().strip()) with open(height_filename) as f: - height = f.read().strip() - + height = int(f.read().strip()) + with open(rgb_filename, "rb") as rgb_f: + untrusted_rgb_data = rgb_f.read() # The first few operations happen on a per-page basis. - page_size = os.path.getsize(filename_base + ".rgb") / 1024**2 + page_size = len(untrusted_rgb_data) total_size += page_size timeout = self.calculate_timeout(page_size, 1) - + pixmap = fitz.Pixmap( + fitz.Colorspace(fitz.CS_RGB), width, height, untrusted_rgb_data, False + ) if ocr_lang: # OCR the document self.update_progress( - f"Converting page {page}/{num_pages} from pixels to searchable PDF" + f"Converting page {page_num}/{num_pages} from pixels to searchable PDF" ) - await self.run_command( - [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"png:{png_filename}", - ], - error_message=f"Page {page}/{num_pages} conversion to PNG failed", - timeout_message=( - "Error converting pixels to PNG, convert timed out after" - f" {timeout} seconds" - ), - timeout=timeout, + ocr_pdf_bytes = pixmap.pdfocr_tobytes( + compress=True, + language=ocr_lang, + tessdata=get_tessdata_dir(), ) - await self.run_command( - [ - "tesseract", - png_filename, - ocr_filename, - "-l", - ocr_lang, - "--dpi", - "70", - "pdf", - ], - error_message=f"Page {page}/{num_pages} OCR failed", - timeout_message=( - "Error converting PNG to searchable PDF, tesseract timed out" - f" after {timeout} seconds" - ), - timeout=timeout, - ) - + ocr_pdf = fitz.open("pdf", ocr_pdf_bytes) + safe_doc.insert_pdf(ocr_pdf) else: # Don't OCR self.update_progress( - f"Converting page {page}/{num_pages} from pixels to PDF" - ) - await self.run_command( - [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"pdf:{pdf_filename}", - ], - error_message=f"Page {page}/{num_pages} conversion to PDF failed", - timeout_message=( - "Error converting RGB to PDF, convert timed out after" - f" {timeout} seconds" - ), - timeout=timeout, + f"Converting page {page_num}/{num_pages} from pixels to PDF" ) + safe_doc.insert_file(pixmap) self.percentage += percentage_per_page @@ -118,46 +75,16 @@ class PixelsToPDF(DangerzoneConverter): # timeout. timeout = self.calculate_timeout(total_size, num_pages) - # Merge pages into a single PDF - self.update_progress(f"Merging {num_pages} pages into a single PDF") - args = ["pdfunite"] - for page in range(1, num_pages + 1): - args.append(f"{tempdir}/page-{page}.pdf") - args.append(f"{tempdir}/safe-output.pdf") - await self.run_command( - args, - error_message="Merging pages into a single PDF failed", - timeout_message=( - "Error merging pages into a single PDF, pdfunite timed out after" - f" {timeout} seconds" - ), - timeout=timeout, - ) - - self.percentage += 2 - - # Compress - self.update_progress("Compressing PDF") - await self.run_command( - [ - "ps2pdf", - f"{tempdir}/safe-output.pdf", - f"{tempdir}/safe-output-compressed.pdf", - ], - error_message="Compressing PDF failed", - timeout_message=( - f"Error compressing PDF, ps2pdf timed out after {timeout} seconds" - ), - timeout=timeout, - ) - self.percentage = 100.0 self.update_progress("Safe PDF created") # Move converted files into /safezone - if not running_on_qubes(): - shutil.move(f"{tempdir}/safe-output.pdf", "/safezone") - shutil.move(f"{tempdir}/safe-output-compressed.pdf", "/safezone") + if running_on_qubes(): + safe_pdf_path = f"{tempdir}/safe-output-compressed.pdf" + else: + safe_pdf_path = f"/safezone/safe-output-compressed.pdf" + + safe_doc.save(safe_pdf_path, deflate_images=True) async def main() -> int: diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 61d641b..2d51e58 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -324,6 +324,8 @@ class Container(IsolationProvider): "-v", f"{safe_dir}:/safezone:Z", "-e", + "TESSDATA_PREFIX=/usr/share/tessdata", + "-e", f"OCR={ocr}", "-e", f"OCR_LANGUAGE={ocr_lang}", diff --git a/install/linux/dangerzone.spec b/install/linux/dangerzone.spec index c93c888..453c42f 100644 --- a/install/linux/dangerzone.spec +++ b/install/linux/dangerzone.spec @@ -72,6 +72,7 @@ BuildRequires: python3-devel %if 0%{?_qubes} # Qubes-only requirements (server-side) Requires: python3-magic +Requires: python3-PyMuPDF Requires: libreoffice # Qubes-only requirements (client-side) Requires: GraphicsMagick diff --git a/poetry.lock b/poetry.lock index e69388e..5945f7a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -63,7 +63,6 @@ packaging = ">=22.0" pathspec = ">=0.9.0" platformdirs = ">=2" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\" and implementation_name == \"cpython\""} typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] @@ -195,7 +194,6 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} [[package]] name = "colorama" @@ -406,7 +404,6 @@ files = [ ] [package.dependencies] -typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" [package.extras] @@ -555,7 +552,6 @@ files = [ [package.dependencies] mypy-extensions = ">=1.0.0" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""} typing-extensions = ">=4.1.0" [package.extras] @@ -608,9 +604,6 @@ files = [ {file = "platformdirs-4.0.0.tar.gz", hash = "sha256:cb633b2bcf10c51af60beb0ab06d2f1d69064b43abf4c185ca6b28865f3f9731"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.8\""} - [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] @@ -626,9 +619,6 @@ files = [ {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] -[package.dependencies] -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} - [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] @@ -656,7 +646,6 @@ files = [ [package.dependencies] altgraph = "*" -importlib-metadata = {version = ">=1.4", markers = "python_version < \"3.8\""} macholib = {version = ">=1.8", markers = "sys_platform == \"darwin\""} pyinstaller-hooks-contrib = ">=2021.4" setuptools = ">=42.0.0" @@ -741,7 +730,6 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" @@ -909,56 +897,6 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] -[[package]] -name = "typed-ast" -version = "1.5.5" -description = "a fork of Python 2 and 3 ast modules with type comment support" -optional = false -python-versions = ">=3.6" -files = [ - {file = "typed_ast-1.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4bc1efe0ce3ffb74784e06460f01a223ac1f6ab31c6bc0376a21184bf5aabe3b"}, - {file = "typed_ast-1.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5f7a8c46a8b333f71abd61d7ab9255440d4a588f34a21f126bbfc95f6049e686"}, - {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:597fc66b4162f959ee6a96b978c0435bd63791e31e4f410622d19f1686d5e769"}, - {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d41b7a686ce653e06c2609075d397ebd5b969d821b9797d029fccd71fdec8e04"}, - {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5fe83a9a44c4ce67c796a1b466c270c1272e176603d5e06f6afbc101a572859d"}, - {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5c0c112a74c0e5db2c75882a0adf3133adedcdbfd8cf7c9d6ed77365ab90a1d"}, - {file = "typed_ast-1.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:e1a976ed4cc2d71bb073e1b2a250892a6e968ff02aa14c1f40eba4f365ffec02"}, - {file = "typed_ast-1.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c631da9710271cb67b08bd3f3813b7af7f4c69c319b75475436fcab8c3d21bee"}, - {file = "typed_ast-1.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b445c2abfecab89a932b20bd8261488d574591173d07827c1eda32c457358b18"}, - {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc95ffaaab2be3b25eb938779e43f513e0e538a84dd14a5d844b8f2932593d88"}, - {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61443214d9b4c660dcf4b5307f15c12cb30bdfe9588ce6158f4a005baeb167b2"}, - {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6eb936d107e4d474940469e8ec5b380c9b329b5f08b78282d46baeebd3692dc9"}, - {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e48bf27022897577d8479eaed64701ecaf0467182448bd95759883300ca818c8"}, - {file = "typed_ast-1.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:83509f9324011c9a39faaef0922c6f720f9623afe3fe220b6d0b15638247206b"}, - {file = "typed_ast-1.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:44f214394fc1af23ca6d4e9e744804d890045d1643dd7e8229951e0ef39429b5"}, - {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:118c1ce46ce58fda78503eae14b7664163aa735b620b64b5b725453696f2a35c"}, - {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be4919b808efa61101456e87f2d4c75b228f4e52618621c77f1ddcaae15904fa"}, - {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:fc2b8c4e1bc5cd96c1a823a885e6b158f8451cf6f5530e1829390b4d27d0807f"}, - {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:16f7313e0a08c7de57f2998c85e2a69a642e97cb32f87eb65fbfe88381a5e44d"}, - {file = "typed_ast-1.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:2b946ef8c04f77230489f75b4b5a4a6f24c078be4aed241cfabe9cbf4156e7e5"}, - {file = "typed_ast-1.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2188bc33d85951ea4ddad55d2b35598b2709d122c11c75cffd529fbc9965508e"}, - {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0635900d16ae133cab3b26c607586131269f88266954eb04ec31535c9a12ef1e"}, - {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57bfc3cf35a0f2fdf0a88a3044aafaec1d2f24d8ae8cd87c4f58d615fb5b6311"}, - {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:fe58ef6a764de7b4b36edfc8592641f56e69b7163bba9f9c8089838ee596bfb2"}, - {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d09d930c2d1d621f717bb217bf1fe2584616febb5138d9b3e8cdd26506c3f6d4"}, - {file = "typed_ast-1.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:d40c10326893ecab8a80a53039164a224984339b2c32a6baf55ecbd5b1df6431"}, - {file = "typed_ast-1.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd946abf3c31fb50eee07451a6aedbfff912fcd13cf357363f5b4e834cc5e71a"}, - {file = "typed_ast-1.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ed4a1a42df8a3dfb6b40c3d2de109e935949f2f66b19703eafade03173f8f437"}, - {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:045f9930a1550d9352464e5149710d56a2aed23a2ffe78946478f7b5416f1ede"}, - {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:381eed9c95484ceef5ced626355fdc0765ab51d8553fec08661dce654a935db4"}, - {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bfd39a41c0ef6f31684daff53befddae608f9daf6957140228a08e51f312d7e6"}, - {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8c524eb3024edcc04e288db9541fe1f438f82d281e591c548903d5b77ad1ddd4"}, - {file = "typed_ast-1.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:7f58fabdde8dcbe764cef5e1a7fcb440f2463c1bbbec1cf2a86ca7bc1f95184b"}, - {file = "typed_ast-1.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:042eb665ff6bf020dd2243307d11ed626306b82812aba21836096d229fdc6a10"}, - {file = "typed_ast-1.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:622e4a006472b05cf6ef7f9f2636edc51bda670b7bbffa18d26b255269d3d814"}, - {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1efebbbf4604ad1283e963e8915daa240cb4bf5067053cf2f0baadc4d4fb51b8"}, - {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0aefdd66f1784c58f65b502b6cf8b121544680456d1cebbd300c2c813899274"}, - {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:48074261a842acf825af1968cd912f6f21357316080ebaca5f19abbb11690c8a"}, - {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:429ae404f69dc94b9361bb62291885894b7c6fb4640d561179548c849f8492ba"}, - {file = "typed_ast-1.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:335f22ccb244da2b5c296e6f96b06ee9bed46526db0de38d2f0e5a6597b81155"}, - {file = "typed_ast-1.5.5.tar.gz", hash = "sha256:94282f7a354f36ef5dbce0ef3467ebf6a258e370ab33d5b40c249fa996e590dd"}, -] - [[package]] name = "types-markdown" version = "3.5.0.3" @@ -1040,5 +978,5 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" -python-versions = ">=3.7,<3.12" -content-hash = "fd4f4e9362f23cf48a16bfe8c11ed4eb68a4ae7d515c33f5e98b42ef270cac57" +python-versions = ">=3.8,<3.12" +content-hash = "7afa934aee6e88893523238bd61166393f04956ebd3c3185449335411e91215a" diff --git a/pyproject.toml b/pyproject.toml index d0a9ef3..13a20ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ include = [ ] [tool.poetry.dependencies] -python = ">=3.7,<3.12" +python = ">=3.8,<3.12" click = "*" appdirs = "*" PySide6 = "^6.4.1"