Remove dead code

2025-04-28 18:02:38 +02:00 · 2024-03-14 10:46:10 +02:00 · 2024-03-14 10:46:10 +02:00 · 7ea7c8a0cc
commit 7ea7c8a0cc
parent f42bb23229
8 changed files with 16 additions and 448 deletions
--- a/25
+++ b/25
@@ -21,30 +21,6 @@ RUN case "$ARCH" in \
 RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt


-###########################################
-# Download Tesseract data
-
-FROM alpine:latest as tessdata-dl
-ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9
-
-# Download the trained models from the latest GitHub release of Tesseract, and
-# store them under /usr/share/tessdata. This is basically what distro packages
-# do under the hood.
-#
-# Because the GitHub release contains more files than just the trained models,
-# we use `find` to fetch only the '*.traineddata' files in the top directory.
-#
-# Before we untar the models, we also check if the checksum is the expected one.
-RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \
-    && TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
-        | sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
-    && wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
-    && echo "$TESSDATA_CHECKSUM  tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
-    && tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
-    && find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata/ \; \
-    && cd .. && rm -r tessdata
-
-
 ###########################################
 # Download H2ORestart
 FROM alpine:latest as h2orestart-dl
@@ -74,7 +50,6 @@ RUN apk --no-cache -U upgrade && \
 COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz
 COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf
 COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs
-COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata
 COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext

 RUN install -dm777 "/usr/lib/libreoffice/share/extensions/"
--- a/dangerzone/conversion/common.py
+++ b/dangerzone/conversion/common.py
@@ -13,15 +13,6 @@ def running_on_qubes() -> bool:
    return os.path.exists("/usr/share/qubes/marker-vm")


-def get_tessdata_dir() -> str:
-    if os.environ.get("TESSDATA_PREFIX"):
-        return os.environ["TESSDATA_PREFIX"]
-    elif running_on_qubes():
-        return "/usr/share/tesseract/tessdata/"
-    else:
-        return "/usr/share/tessdata/"
-
-
 class DangerzoneConverter:
    def __init__(self, progress_callback: Optional[Callable] = None) -> None:
        self.percentage: float = 0.0
--- a/dangerzone/conversion/pixels_to_pdf.py
+++ b/dangerzone/conversion/pixels_to_pdf.py
@@ -1,161 +0,0 @@
-"""
-Here are the steps, with progress bar percentages:
-
- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages)
- 95%-100%: Compress the final PDF
-"""
-
-import asyncio
-import contextlib
-import glob
-import io
-import json
-import os
-import sys
-from typing import Optional
-
-from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes
-
-# XXX: PyMUPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to
-# stderr, but it's based on environment variables. These envvars are consulted at import
-# time [3], so we have to set them here, before we import `fitz`.
-#
-# [1] https://github.com/freedomofpress/dangerzone/issues/877
-# [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724
-# [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63
-os.environ["PYMUPDF_MESSAGE"] = "fd:2"
-os.environ["PYMUPDF_LOG"] = "fd:2"
-
-
-class PixelsToPDF(DangerzoneConverter):
-    async def convert(
-        self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None
-    ) -> None:
-        self.percentage = 50.0
-        if tempdir is None:
-            tempdir = "/safezone"
-
-        # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems
-        import fitz
-
-        num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb"))
-        total_size = 0.0
-
-        safe_doc = fitz.Document()
-
-        # Convert RGB files to PDF files
-        percentage_per_page = 45.0 / num_pages
-        for page_num in range(1, num_pages + 1):
-            filename_base = f"{tempdir}/pixels/page-{page_num}"
-            rgb_filename = f"{filename_base}.rgb"
-            width_filename = f"{filename_base}.width"
-            height_filename = f"{filename_base}.height"
-
-            with open(width_filename) as f:
-                width = int(f.read().strip())
-            with open(height_filename) as f:
-                height = int(f.read().strip())
-            with open(rgb_filename, "rb") as rgb_f:
-                untrusted_rgb_data = rgb_f.read()
-            # The first few operations happen on a per-page basis.
-            page_size = len(untrusted_rgb_data)
-            total_size += page_size
-            pixmap = fitz.Pixmap(
-                fitz.Colorspace(fitz.CS_RGB),
-                width,
-                height,
-                untrusted_rgb_data,
-                False,
-            )
-            pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)
-            if ocr_lang:  # OCR the document
-                self.update_progress(
-                    f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"
-                )
-                if int(fitz.version[2]) >= 20230621000001:
-                    page_pdf_bytes = pixmap.pdfocr_tobytes(
-                        compress=True,
-                        language=ocr_lang,
-                        tessdata=get_tessdata_dir(),
-                    )
-                else:
-                    # XXX: In PyMuPDF v1.22.5, the function signature of
-                    # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument
-                    # to explicitly set the Tesseract data dir [1].
-                    #
-                    # In earlier versions, the PyMuPDF developers recommend setting this
-                    # path via the TESSDATA_PREFIX environment variable. In practice,
-                    # this environment variable is read at import time, so subsequent
-                    # changes to the environment variable are not tracked [2].
-                    #
-                    # To make things worse, any attempt to alter the internal attribute
-                    # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using
-                    # the OCR functions. That's due to the way imports work in `fitz`,
-                    # where somehow the internal `fitz.fitz` module is shadowed.
-                    #
-                    # A hacky solution is to grab the `fitz.fitz` module from
-                    # `sys.modules`, and set there the TESSDATA_PREFIX variable. We can
-                    # get away with this hack because we have a proper solution for
-                    # subsequent PyMuPDF versions, and we know that nothing will change
-                    # in older versions.
-                    #
-                    # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5
-                    #
-                    # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save
-                    # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308
-                    sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir()  # type: ignore [attr-defined]
-
-                    page_pdf_bytes = pixmap.pdfocr_tobytes(
-                        compress=True,
-                        language=ocr_lang,
-                    )
-                ocr_pdf = fitz.open("pdf", page_pdf_bytes)
-            else:  # Don't OCR
-                self.update_progress(
-                    f"Converting page {page_num}/{num_pages} from pixels to PDF"
-                )
-                page_doc = fitz.Document()
-                page_doc.insert_file(pixmap)
-                page_pdf_bytes = page_doc.tobytes(deflate_images=True)
-
-            safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes))
-            self.percentage += percentage_per_page
-
-        self.percentage = 100.0
-        self.update_progress("Safe PDF created")
-
-        # Move converted files into /safezone
-        if running_on_qubes():
-            safe_pdf_path = f"{tempdir}/safe-output-compressed.pdf"
-        else:
-            safe_pdf_path = f"/safezone/safe-output-compressed.pdf"
-
-        safe_doc.save(safe_pdf_path, deflate_images=True)
-
-    def update_progress(self, text: str, *, error: bool = False) -> None:
-        if running_on_qubes():
-            if self.progress_callback:
-                self.progress_callback(error, text, self.percentage)
-        else:
-            print(
-                json.dumps(
-                    {"error": error, "text": text, "percentage": self.percentage}
-                )
-            )
-            sys.stdout.flush()
-
-
-async def main() -> int:
-    ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None
-    converter = PixelsToPDF()
-
-    try:
-        await converter.convert(ocr_lang)
-        return 0
-    except (RuntimeError, ValueError) as e:
-        converter.update_progress(str(e), error=True)
-        return 1
-
-
-if __name__ == "__main__":
-    sys.exit(asyncio.run(main()))
--- a/dangerzone/isolation_provider/base.py
+++ b/dangerzone/isolation_provider/base.py
@@ -220,12 +220,6 @@ class IsolationProvider(ABC):
        text = "Successfully converted document"
        self.print_progress(document, False, text, 100)

-    @abstractmethod
-    def pixels_to_pdf(
-        self, document: Document, tempdir: str, ocr_lang: Optional[str]
-    ) -> None:
-        pass
-
    def print_progress(
        self, document: Document, error: bool, text: str, percentage: float
    ) -> None:
@@ -352,74 +346,3 @@ class IsolationProvider(ABC):
                    f"{debug_log}"  # no need for an extra newline here
                    f"{DOC_TO_PIXELS_LOG_END}"
                )
-
-# From global_common:
-
-# def validate_convert_to_pixel_output(self, common, output):
-#     """
-#     Take the output from the convert to pixels tasks and validate it. Returns
-#     a tuple like: (success (boolean), error_message (str))
-#     """
-#     max_image_width = 10000
-#     max_image_height = 10000
-
-#     # Did we hit an error?
-#     for line in output.split("\n"):
-#         if (
-#             "failed:" in line
-#             or "The document format is not supported" in line
-#             or "Error" in line
-#         ):
-#             return False, output
-
-#     # How many pages was that?
-#     num_pages = None
-#     for line in output.split("\n"):
-#         if line.startswith("Document has "):
-#             num_pages = line.split(" ")[2]
-#             break
-#     if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0:
-#         return False, "Invalid number of pages returned"
-#     num_pages = int(num_pages)
-
-#     # Make sure we have the files we expect
-#     expected_filenames = []
-#     for i in range(1, num_pages + 1):
-#         expected_filenames += [
-#             f"page-{i}.rgb",
-#             f"page-{i}.width",
-#             f"page-{i}.height",
-#         ]
-#     expected_filenames.sort()
-#     actual_filenames = os.listdir(common.pixel_dir.name)
-#     actual_filenames.sort()
-
-#     if expected_filenames != actual_filenames:
-#         return (
-#             False,
-#             f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}",
-#         )
-
-#     # Make sure the files are the correct sizes
-#     for i in range(1, num_pages + 1):
-#         with open(f"{common.pixel_dir.name}/page-{i}.width") as f:
-#             w_str = f.read().strip()
-#         with open(f"{common.pixel_dir.name}/page-{i}.height") as f:
-#             h_str = f.read().strip()
-#         w = int(w_str)
-#         h = int(h_str)
-#         if (
-#             not w_str.isdigit()
-#             or not h_str.isdigit()
-#             or w <= 0
-#             or w > max_image_width
-#             or h <= 0
-#             or h > max_image_height
-#         ):
-#             return False, f"Page {i} has invalid geometry"
-
-#         # Make sure the RGB file is the correct size
-#         if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3:
-#             return False, f"Page {i} has an invalid RGB file size"
-
-#     return True, True
--- a/dangerzone/isolation_provider/container.py
+++ b/dangerzone/isolation_provider/container.py
@@ -1,24 +1,15 @@
 import gzip
-import json
 import logging
 import os
 import platform
 import shlex
 import shutil
 import subprocess
-import sys
-from typing import Any, List, Optional, Tuple
+from typing import List, Tuple

-from ..conversion import errors
 from ..document import Document
-from ..util import get_tmp_dir  # NOQA : required for mocking in our tests.
 from ..util import get_resource_path, get_subprocess_startupinfo
-from .base import (
-    PIXELS_TO_PDF_LOG_END,
-    PIXELS_TO_PDF_LOG_START,
-    IsolationProvider,
-    terminate_process_group,
-)
+from .base import IsolationProvider, terminate_process_group

 TIMEOUT_KILL = 5  # Timeout in seconds until the kill command returns.

@@ -234,31 +225,6 @@ class Container(IsolationProvider):
        """Unique container name for the pixels-to-pdf phase."""
        return f"dangerzone-pixels-to-pdf-{document.id}"

-    def assert_field_type(self, val: Any, _type: object) -> None:
-        # XXX: Use a stricter check than isinstance because `bool` is a subclass of
-        # `int`.
-        #
-        # See https://stackoverflow.com/a/37888668
-        if type(val) is not _type:
-            raise ValueError("Status field has incorrect type")
-
-    def parse_progress_trusted(self, document: Document, line: str) -> None:
-        """
-        Parses a line returned by the container.
-        """
-        try:
-            status = json.loads(line)
-            text = status["text"]
-            self.assert_field_type(text, str)
-            error = status["error"]
-            self.assert_field_type(error, bool)
-            percentage = status["percentage"]
-            self.assert_field_type(percentage, float)
-            self.print_progress(document, error, text, percentage)
-        except Exception:
-            error_message = f"Invalid JSON returned from container:\n\n\t {line}"
-            self.print_progress(document, True, error_message, -1)
-
    def exec(
        self,
        args: List[str],
@@ -337,84 +303,6 @@ class Container(IsolationProvider):
                f"Unexpected error occurred while killing container '{name}': {str(e)}"
            )

-    def pixels_to_pdf(
-        self, document: Document, tempdir: str, ocr_lang: Optional[str]
-    ) -> None:
-        # Convert pixels to safe PDF
-        command = [
-            "/usr/bin/python3",
-            "-m",
-            "dangerzone.conversion.pixels_to_pdf",
-        ]
-        extra_args = [
-            "-v",
-            f"{tempdir}:/safezone:Z",
-            "-e",
-            f"OCR={0 if ocr_lang is None else 1}",
-            "-e",
-            f"OCR_LANGUAGE={ocr_lang}",
-        ]
-        # XXX: Until #748 gets merged, we have to run our pixels to PDF phase in a
-        # container, which involves mounting two temp dirs. This does not bode well with
-        # gVisor for two reasons:
-        #
-        # 1. Our gVisor integration chroot()s into /home/dangerzone/dangerzone-image/rootfs,
-        #    meaning that the location of the temp dirs must be relevant to that path.
-        # 2. Reading and writing to these temp dirs requires permissions which are not
-        #    available to the user within gVisor's user namespace.
-        #
-        # For these reasons, and because the pixels to PDF phase is more trusted (and
-        # will soon stop being containerized), we circumvent gVisor support by doing the
-        # following:
-        #
-        # 1. Override our entrypoint script with a no-op command (/usr/bin/env).
-        # 2. Set the PYTHONPATH so that we can import the Python code within
-        #    /home/dangerzone/dangerzone-image/rootfs
-        # 3. Run the container as the root user, so that it can always write to the
-        #    mounted directories. This container is trusted, so running as root has no
-        #    impact to the security of Dangerzone.
-        img_root = "/home/dangerzone/dangerzone-image/rootfs"
-        extra_args += [
-            "--entrypoint",
-            "/usr/bin/env",
-            "-e",
-            f"PYTHONPATH={img_root}/opt/dangerzone:{img_root}/usr/lib/python3.12/site-packages",
-            "-e",
-            f"TESSDATA_PREFIX={img_root}/usr/share/tessdata",
-            "-u",
-            "root",
-        ]
-
-        name = self.pixels_to_pdf_container_name(document)
-        pixels_to_pdf_proc = self.exec_container(command, name, extra_args)
-        if pixels_to_pdf_proc.stdout:
-            for line in pixels_to_pdf_proc.stdout:
-                self.parse_progress_trusted(document, line.decode())
-        error_code = pixels_to_pdf_proc.wait()
-
-        # In case of a dev run, log everything from the second container.
-        if getattr(sys, "dangerzone_dev", False):
-            assert pixels_to_pdf_proc.stderr
-            out = pixels_to_pdf_proc.stderr.read().decode()
-            text = (
-                f"Conversion output: (pixels to PDF)\n"
-                f"{PIXELS_TO_PDF_LOG_START}\n{out}\n{PIXELS_TO_PDF_LOG_END}"
-            )
-            log.info(text)
-
-        if error_code != 0:
-            log.error("pixels-to-pdf failed")
-            raise errors.exception_from_error_code(error_code)
-        else:
-            # Move the final file to the right place
-            if os.path.exists(document.output_filename):
-                os.remove(document.output_filename)
-
-            container_output_filename = os.path.join(
-                tempdir, "safe-output-compressed.pdf"
-            )
-            shutil.move(container_output_filename, document.output_filename)
-
    def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen:
        # Convert document to pixels
        command = [
--- a/dangerzone/isolation_provider/qubes.py
+++ b/dangerzone/isolation_provider/qubes.py
@@ -1,20 +1,16 @@
-import asyncio
 import io
 import logging
 import os
-import shutil
 import subprocess
 import sys
 import zipfile
 from pathlib import Path
-from typing import IO, Optional
+from typing import IO

-from ..conversion import errors
 from ..conversion.common import running_on_qubes
-from ..conversion.pixels_to_pdf import PixelsToPDF
 from ..document import Document
 from ..util import get_resource_path
-from .base import PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, IsolationProvider
+from .base import IsolationProvider

 log = logging.getLogger(__name__)

@@ -25,28 +21,6 @@ class Qubes(IsolationProvider):
    def install(self) -> bool:
        return True

-    def pixels_to_pdf(
-        self, document: Document, tempdir: str, ocr_lang: Optional[str]
-    ) -> None:
-        def print_progress_wrapper(error: bool, text: str, percentage: float) -> None:
-            self.print_progress(document, error, text, percentage)
-
-        converter = PixelsToPDF(progress_callback=print_progress_wrapper)
-        try:
-            asyncio.run(converter.convert(ocr_lang, tempdir))
-        except (RuntimeError, ValueError) as e:
-            raise errors.UnexpectedConversionError(str(e))
-        finally:
-            if getattr(sys, "dangerzone_dev", False):
-                out = converter.captured_output.decode()
-                text = (
-                    f"Conversion output: (pixels to PDF)\n"
-                    f"{PIXELS_TO_PDF_LOG_START}\n{out}{PIXELS_TO_PDF_LOG_END}"
-                )
-                log.info(text)
-
-        shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename)
-
    def get_max_parallel_conversions(self) -> int:
        return 1

--- a/dangerzone/util.py
+++ b/dangerzone/util.py
@@ -3,7 +3,6 @@ import platform
 import subprocess
 import sys
 import unicodedata
-from typing import Optional

 import appdirs

@@ -12,17 +11,6 @@ def get_config_dir() -> str:
    return appdirs.user_config_dir("dangerzone")


-def get_tmp_dir() -> Optional[str]:
-    """Get the parent dir for the Dangerzone temporary dirs.
-
-    This function returns the parent directory where Dangerzone will store its temporary
-    directories. The default behavior is to let Python choose for us (e.g., in `/tmp`
-    for Linux), which is why we return None. However, we still need to define this
-    function in order to be able to set this dir via mocking in our tests.
-    """
-    return None
-
-
 def get_resource_path(filename: str) -> str:
    if getattr(sys, "dangerzone_dev", False):
        # Look for resources directory relative to python file
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -134,29 +134,19 @@ class TestCli:
        if os.environ.get("DUMMY_CONVERSION", False):
            args = ("--unsafe-dummy-conversion", *args)

-        with tempfile.TemporaryDirectory() as t:
-            tmp_dir = Path(t)
-            # TODO: Replace this with `contextlib.chdir()` [1], which was added in
-            # Python 3.11.
-            #
-            # [1]: https://docs.python.org/3/library/contextlib.html#contextlib.chdir
-            try:
-                if tmp_path is not None:
-                    cwd = os.getcwd()
-                    os.chdir(tmp_path)
+        # TODO: Replace this with `contextlib.chdir()` [1], which was added in
+        # Python 3.11.
+        #
+        # [1]: https://docs.python.org/3/library/contextlib.html#contextlib.chdir
+        try:
+            if tmp_path is not None:
+                cwd = os.getcwd()
+                os.chdir(tmp_path)

-                with mock.patch(
-                    "dangerzone.isolation_provider.container.get_tmp_dir",
-                    return_value=t,
-                ):
-                    result = CliRunner().invoke(cli_main, args)
-            finally:
-                if tmp_path is not None:
-                    os.chdir(cwd)
-
-                if tmp_dir.exists():
-                    stale_files = list(tmp_dir.iterdir())
-                    assert not stale_files
+            result = CliRunner().invoke(cli_main, args)
+        finally:
+            if tmp_path is not None:
+                os.chdir(cwd)

        # XXX Print stdout so that junitXML exports with output capturing
        # actually include the stdout + stderr (they are combined into stdout)