Remove dead code

2025-04-28 18:02:38 +02:00 · 2024-03-14 10:46:10 +02:00 · 2024-03-14 10:46:10 +02:00 · 7ea7c8a0cc
commit 7ea7c8a0cc
parent f42bb23229
8 changed files with 16 additions and 448 deletions
--- a/25
+++ b/25
@ -21,30 +21,6 @@ RUN case "$ARCH" in \
 RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt
 ###########################################
 # Download Tesseract data
 FROM alpine:latest as tessdata-dl
 ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9
 # Download the trained models from the latest GitHub release of Tesseract, and
 # store them under /usr/share/tessdata. This is basically what distro packages
 # do under the hood.
 #
 # Because the GitHub release contains more files than just the trained models,
 # we use `find` to fetch only the '*.traineddata' files in the top directory.
 #
 # Before we untar the models, we also check if the checksum is the expected one.
 RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \
    && TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
        | sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
    && wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
    && echo "$TESSDATA_CHECKSUM  tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
    && tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
    && find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata/ \; \
    && cd .. && rm -r tessdata
 ###########################################
 # Download H2ORestart
 FROM alpine:latest as h2orestart-dl
@ -74,7 +50,6 @@ RUN apk --no-cache -U upgrade && \
 COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz
 COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf
 COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs
 COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata
 COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext
 RUN install -dm777 "/usr/lib/libreoffice/share/extensions/"
--- a/dangerzone/conversion/common.py
+++ b/dangerzone/conversion/common.py
@ -13,15 +13,6 @@ def running_on_qubes() -> bool:
    return os.path.exists("/usr/share/qubes/marker-vm")
 def get_tessdata_dir() -> str:
    if os.environ.get("TESSDATA_PREFIX"):
        return os.environ["TESSDATA_PREFIX"]
    elif running_on_qubes():
        return "/usr/share/tesseract/tessdata/"
    else:
        return "/usr/share/tessdata/"
 class DangerzoneConverter:
    def __init__(self, progress_callback: Optional[Callable] = None) -> None:
        self.percentage: float = 0.0
--- a/dangerzone/conversion/pixels_to_pdf.py
+++ b/dangerzone/conversion/pixels_to_pdf.py
@ -1,161 +0,0 @@
 """
 Here are the steps, with progress bar percentages:
 - 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages)
 - 95%-100%: Compress the final PDF
 """
 import asyncio
 import contextlib
 import glob
 import io
 import json
 import os
 import sys
 from typing import Optional
 from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes
 # XXX: PyMUPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to
 # stderr, but it's based on environment variables. These envvars are consulted at import
 # time [3], so we have to set them here, before we import `fitz`.
 #
 # [1] https://github.com/freedomofpress/dangerzone/issues/877
 # [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724
 # [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63
 os.environ["PYMUPDF_MESSAGE"] = "fd:2"
 os.environ["PYMUPDF_LOG"] = "fd:2"
 class PixelsToPDF(DangerzoneConverter):
    async def convert(
        self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None
    ) -> None:
        self.percentage = 50.0
        if tempdir is None:
            tempdir = "/safezone"
        # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems
        import fitz
        num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb"))
        total_size = 0.0
        safe_doc = fitz.Document()
        # Convert RGB files to PDF files
        percentage_per_page = 45.0 / num_pages
        for page_num in range(1, num_pages + 1):
            filename_base = f"{tempdir}/pixels/page-{page_num}"
            rgb_filename = f"{filename_base}.rgb"
            width_filename = f"{filename_base}.width"
            height_filename = f"{filename_base}.height"
            with open(width_filename) as f:
                width = int(f.read().strip())
            with open(height_filename) as f:
                height = int(f.read().strip())
            with open(rgb_filename, "rb") as rgb_f:
                untrusted_rgb_data = rgb_f.read()
            # The first few operations happen on a per-page basis.
            page_size = len(untrusted_rgb_data)
            total_size += page_size
            pixmap = fitz.Pixmap(
                fitz.Colorspace(fitz.CS_RGB),
                width,
                height,
                untrusted_rgb_data,
                False,
            )
            pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)
            if ocr_lang:  # OCR the document
                self.update_progress(
                    f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"
                )
                if int(fitz.version[2]) >= 20230621000001:
                    page_pdf_bytes = pixmap.pdfocr_tobytes(
                        compress=True,
                        language=ocr_lang,
                        tessdata=get_tessdata_dir(),
                    )
                else:
                    # XXX: In PyMuPDF v1.22.5, the function signature of
                    # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument
                    # to explicitly set the Tesseract data dir [1].
                    #
                    # In earlier versions, the PyMuPDF developers recommend setting this
                    # path via the TESSDATA_PREFIX environment variable. In practice,
                    # this environment variable is read at import time, so subsequent
                    # changes to the environment variable are not tracked [2].
                    #
                    # To make things worse, any attempt to alter the internal attribute
                    # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using
                    # the OCR functions. That's due to the way imports work in `fitz`,
                    # where somehow the internal `fitz.fitz` module is shadowed.
                    #
                    # A hacky solution is to grab the `fitz.fitz` module from
                    # `sys.modules`, and set there the TESSDATA_PREFIX variable. We can
                    # get away with this hack because we have a proper solution for
                    # subsequent PyMuPDF versions, and we know that nothing will change
                    # in older versions.
                    #
                    # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5
                    #
                    # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save
                    # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308
                    sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir()  # type: ignore [attr-defined]
                    page_pdf_bytes = pixmap.pdfocr_tobytes(
                        compress=True,
                        language=ocr_lang,
                    )
                ocr_pdf = fitz.open("pdf", page_pdf_bytes)
            else:  # Don't OCR
                self.update_progress(
                    f"Converting page {page_num}/{num_pages} from pixels to PDF"
                )
                page_doc = fitz.Document()
                page_doc.insert_file(pixmap)
                page_pdf_bytes = page_doc.tobytes(deflate_images=True)
            safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes))
            self.percentage += percentage_per_page
        self.percentage = 100.0
        self.update_progress("Safe PDF created")
        # Move converted files into /safezone
        if running_on_qubes():
            safe_pdf_path = f"{tempdir}/safe-output-compressed.pdf"
        else:
            safe_pdf_path = f"/safezone/safe-output-compressed.pdf"
        safe_doc.save(safe_pdf_path, deflate_images=True)
    def update_progress(self, text: str, *, error: bool = False) -> None:
        if running_on_qubes():
            if self.progress_callback:
                self.progress_callback(error, text, self.percentage)
        else:
            print(
                json.dumps(
                    {"error": error, "text": text, "percentage": self.percentage}
                )
            )
            sys.stdout.flush()
 async def main() -> int:
    ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None
    converter = PixelsToPDF()
    try:
        await converter.convert(ocr_lang)
        return 0
    except (RuntimeError, ValueError) as e:
        converter.update_progress(str(e), error=True)
        return 1
 if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
--- a/dangerzone/isolation_provider/base.py
+++ b/dangerzone/isolation_provider/base.py
@ -220,12 +220,6 @@ class IsolationProvider(ABC):
        text = "Successfully converted document"
        self.print_progress(document, False, text, 100)
    @abstractmethod
    def pixels_to_pdf(
        self, document: Document, tempdir: str, ocr_lang: Optional[str]
    ) -> None:
        pass
    def print_progress(
        self, document: Document, error: bool, text: str, percentage: float
    ) -> None:
@ -352,74 +346,3 @@ class IsolationProvider(ABC):
                    f"{debug_log}"  # no need for an extra newline here
                    f"{DOC_TO_PIXELS_LOG_END}"
                )
 # From global_common:
 # def validate_convert_to_pixel_output(self, common, output):
 #     """
 #     Take the output from the convert to pixels tasks and validate it. Returns
 #     a tuple like: (success (boolean), error_message (str))
 #     """
 #     max_image_width = 10000
 #     max_image_height = 10000
 #     # Did we hit an error?
 #     for line in output.split("\n"):
 #         if (
 #             "failed:" in line
 #             or "The document format is not supported" in line
 #             or "Error" in line
 #         ):
 #             return False, output
 #     # How many pages was that?
 #     num_pages = None
 #     for line in output.split("\n"):
 #         if line.startswith("Document has "):
 #             num_pages = line.split(" ")[2]
 #             break
 #     if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0:
 #         return False, "Invalid number of pages returned"
 #     num_pages = int(num_pages)
 #     # Make sure we have the files we expect
 #     expected_filenames = []
 #     for i in range(1, num_pages + 1):
 #         expected_filenames += [
 #             f"page-{i}.rgb",
 #             f"page-{i}.width",
 #             f"page-{i}.height",
 #         ]
 #     expected_filenames.sort()
 #     actual_filenames = os.listdir(common.pixel_dir.name)
 #     actual_filenames.sort()
 #     if expected_filenames != actual_filenames:
 #         return (
 #             False,
 #             f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}",
 #         )
 #     # Make sure the files are the correct sizes
 #     for i in range(1, num_pages + 1):
 #         with open(f"{common.pixel_dir.name}/page-{i}.width") as f:
 #             w_str = f.read().strip()
 #         with open(f"{common.pixel_dir.name}/page-{i}.height") as f:
 #             h_str = f.read().strip()
 #         w = int(w_str)
 #         h = int(h_str)
 #         if (
 #             not w_str.isdigit()
 #             or not h_str.isdigit()
 #             or w <= 0
 #             or w > max_image_width
 #             or h <= 0
 #             or h > max_image_height
 #         ):
 #             return False, f"Page {i} has invalid geometry"
 #         # Make sure the RGB file is the correct size
 #         if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3:
 #             return False, f"Page {i} has an invalid RGB file size"
 #     return True, True
--- a/dangerzone/isolation_provider/container.py
+++ b/dangerzone/isolation_provider/container.py
@ -1,24 +1,15 @@
 import gzip
 import json
 import logging
 import os
 import platform
 import shlex
 import shutil
 import subprocess
-import sys
+from typing import List, Tuple
 from typing import Any, List, Optional, Tuple
 from ..conversion import errors
 from ..document import Document
 from ..util import get_tmp_dir  # NOQA : required for mocking in our tests.
 from ..util import get_resource_path, get_subprocess_startupinfo
-from .base import (
+from .base import IsolationProvider, terminate_process_group
    PIXELS_TO_PDF_LOG_END,
    PIXELS_TO_PDF_LOG_START,
    IsolationProvider,
    terminate_process_group,
 )
 TIMEOUT_KILL = 5  # Timeout in seconds until the kill command returns.
@ -234,31 +225,6 @@ class Container(IsolationProvider):
        """Unique container name for the pixels-to-pdf phase."""
        return f"dangerzone-pixels-to-pdf-{document.id}"
    def assert_field_type(self, val: Any, _type: object) -> None:
        # XXX: Use a stricter check than isinstance because `bool` is a subclass of
        # `int`.
        #
        # See https://stackoverflow.com/a/37888668
        if type(val) is not _type:
            raise ValueError("Status field has incorrect type")
    def parse_progress_trusted(self, document: Document, line: str) -> None:
        """
        Parses a line returned by the container.
        """
        try:
            status = json.loads(line)
            text = status["text"]
            self.assert_field_type(text, str)
            error = status["error"]
            self.assert_field_type(error, bool)
            percentage = status["percentage"]
            self.assert_field_type(percentage, float)
            self.print_progress(document, error, text, percentage)
        except Exception:
            error_message = f"Invalid JSON returned from container:\n\n\t {line}"
            self.print_progress(document, True, error_message, -1)
    def exec(
        self,
        args: List[str],
@ -337,84 +303,6 @@ class Container(IsolationProvider):
                f"Unexpected error occurred while killing container '{name}': {str(e)}"
            )
    def pixels_to_pdf(
        self, document: Document, tempdir: str, ocr_lang: Optional[str]
    ) -> None:
        # Convert pixels to safe PDF
        command = [
            "/usr/bin/python3",
            "-m",
            "dangerzone.conversion.pixels_to_pdf",
        ]
        extra_args = [
            "-v",
            f"{tempdir}:/safezone:Z",
            "-e",
            f"OCR={0 if ocr_lang is None else 1}",
            "-e",
            f"OCR_LANGUAGE={ocr_lang}",
        ]
        # XXX: Until #748 gets merged, we have to run our pixels to PDF phase in a
        # container, which involves mounting two temp dirs. This does not bode well with
        # gVisor for two reasons:
        #
        # 1. Our gVisor integration chroot()s into /home/dangerzone/dangerzone-image/rootfs,
        #    meaning that the location of the temp dirs must be relevant to that path.
        # 2. Reading and writing to these temp dirs requires permissions which are not
        #    available to the user within gVisor's user namespace.
        #
        # For these reasons, and because the pixels to PDF phase is more trusted (and
        # will soon stop being containerized), we circumvent gVisor support by doing the
        # following:
        #
        # 1. Override our entrypoint script with a no-op command (/usr/bin/env).
        # 2. Set the PYTHONPATH so that we can import the Python code within
        #    /home/dangerzone/dangerzone-image/rootfs
        # 3. Run the container as the root user, so that it can always write to the
        #    mounted directories. This container is trusted, so running as root has no
        #    impact to the security of Dangerzone.
        img_root = "/home/dangerzone/dangerzone-image/rootfs"
        extra_args += [
            "--entrypoint",
            "/usr/bin/env",
            "-e",
            f"PYTHONPATH={img_root}/opt/dangerzone:{img_root}/usr/lib/python3.12/site-packages",
            "-e",
            f"TESSDATA_PREFIX={img_root}/usr/share/tessdata",
            "-u",
            "root",
        ]
        name = self.pixels_to_pdf_container_name(document)
        pixels_to_pdf_proc = self.exec_container(command, name, extra_args)
        if pixels_to_pdf_proc.stdout:
            for line in pixels_to_pdf_proc.stdout:
                self.parse_progress_trusted(document, line.decode())
        error_code = pixels_to_pdf_proc.wait()
        # In case of a dev run, log everything from the second container.
        if getattr(sys, "dangerzone_dev", False):
            assert pixels_to_pdf_proc.stderr
            out = pixels_to_pdf_proc.stderr.read().decode()
            text = (
                f"Conversion output: (pixels to PDF)\n"
                f"{PIXELS_TO_PDF_LOG_START}\n{out}\n{PIXELS_TO_PDF_LOG_END}"
            )
            log.info(text)
        if error_code != 0:
            log.error("pixels-to-pdf failed")
            raise errors.exception_from_error_code(error_code)
        else:
            # Move the final file to the right place
            if os.path.exists(document.output_filename):
                os.remove(document.output_filename)
            container_output_filename = os.path.join(
                tempdir, "safe-output-compressed.pdf"
            )
            shutil.move(container_output_filename, document.output_filename)
    def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen:
        # Convert document to pixels
        command = [
--- a/dangerzone/isolation_provider/qubes.py
+++ b/dangerzone/isolation_provider/qubes.py
@ -1,20 +1,16 @@
 import asyncio
 import io
 import logging
 import os
 import shutil
 import subprocess
 import sys
 import zipfile
 from pathlib import Path
-from typing import IO, Optional
+from typing import IO
 from ..conversion import errors
 from ..conversion.common import running_on_qubes
 from ..conversion.pixels_to_pdf import PixelsToPDF
 from ..document import Document
 from ..util import get_resource_path
-from .base import PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, IsolationProvider
+from .base import IsolationProvider
 log = logging.getLogger(__name__)
@ -25,28 +21,6 @@ class Qubes(IsolationProvider):
    def install(self) -> bool:
        return True
    def pixels_to_pdf(
        self, document: Document, tempdir: str, ocr_lang: Optional[str]
    ) -> None:
        def print_progress_wrapper(error: bool, text: str, percentage: float) -> None:
            self.print_progress(document, error, text, percentage)
        converter = PixelsToPDF(progress_callback=print_progress_wrapper)
        try:
            asyncio.run(converter.convert(ocr_lang, tempdir))
        except (RuntimeError, ValueError) as e:
            raise errors.UnexpectedConversionError(str(e))
        finally:
            if getattr(sys, "dangerzone_dev", False):
                out = converter.captured_output.decode()
                text = (
                    f"Conversion output: (pixels to PDF)\n"
                    f"{PIXELS_TO_PDF_LOG_START}\n{out}{PIXELS_TO_PDF_LOG_END}"
                )
                log.info(text)
        shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename)
    def get_max_parallel_conversions(self) -> int:
        return 1
--- a/dangerzone/util.py
+++ b/dangerzone/util.py
@ -3,7 +3,6 @@ import platform
 import subprocess
 import sys
 import unicodedata
 from typing import Optional
 import appdirs
@ -12,17 +11,6 @@ def get_config_dir() -> str:
    return appdirs.user_config_dir("dangerzone")
 def get_tmp_dir() -> Optional[str]:
    """Get the parent dir for the Dangerzone temporary dirs.
    This function returns the parent directory where Dangerzone will store its temporary
    directories. The default behavior is to let Python choose for us (e.g., in `/tmp`
    for Linux), which is why we return None. However, we still need to define this
    function in order to be able to set this dir via mocking in our tests.
    """
    return None
 def get_resource_path(filename: str) -> str:
    if getattr(sys, "dangerzone_dev", False):
        # Look for resources directory relative to python file
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -134,8 +134,6 @@ class TestCli:
        if os.environ.get("DUMMY_CONVERSION", False):
            args = ("--unsafe-dummy-conversion", *args)
        with tempfile.TemporaryDirectory() as t:
            tmp_dir = Path(t)
        # TODO: Replace this with `contextlib.chdir()` [1], which was added in
        # Python 3.11.
        #
@ -145,19 +143,11 @@ class TestCli:
                cwd = os.getcwd()
                os.chdir(tmp_path)
                with mock.patch(
                    "dangerzone.isolation_provider.container.get_tmp_dir",
                    return_value=t,
                ):
            result = CliRunner().invoke(cli_main, args)
        finally:
            if tmp_path is not None:
                os.chdir(cwd)
                if tmp_dir.exists():
                    stale_files = list(tmp_dir.iterdir())
                    assert not stale_files
        # XXX Print stdout so that junitXML exports with output capturing
        # actually include the stdout + stderr (they are combined into stdout)
        print(result.stdout)