Remove dead code

This commit is contained in:
Alex Pyrgiotis 2024-03-14 10:46:10 +02:00
parent f42bb23229
commit 7ea7c8a0cc
No known key found for this signature in database
GPG key ID: B6C15EBA0357C9AA
8 changed files with 16 additions and 448 deletions

View file

@ -21,30 +21,6 @@ RUN case "$ARCH" in \
RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt
###########################################
# Download Tesseract data
FROM alpine:latest as tessdata-dl
# SHA-256 that the downloaded release tarball must match; it is verified with
# `sha256sum -c` in the RUN step below, before the archive is extracted.
# NOTE(review): the version is resolved from the *latest* GitHub release at
# build time while this checksum is fixed, so a new upstream release will
# presumably fail the check until this value is updated -- confirm intended.
ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9
# Download the trained models from the latest GitHub release of Tesseract, and
# store them under /usr/share/tessdata. This is basically what distro packages
# do under the hood.
#
# Because the GitHub release contains more files than just the trained models,
# we use `find` to fetch only the '*.traineddata' files in the top directory.
#
# Before we untar the models, we also check if the checksum is the expected one.
RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \
&& TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
| sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
&& wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
&& echo "$TESSDATA_CHECKSUM tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
&& tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
&& find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata/ \; \
&& cd .. && rm -r tessdata
###########################################
# Download H2ORestart
FROM alpine:latest as h2orestart-dl
@ -74,7 +50,6 @@ RUN apk --no-cache -U upgrade && \
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs
COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata
COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext
RUN install -dm777 "/usr/lib/libreoffice/share/extensions/"

View file

@ -13,15 +13,6 @@ def running_on_qubes() -> bool:
return os.path.exists("/usr/share/qubes/marker-vm")
def get_tessdata_dir() -> str:
    """Return the directory that holds the Tesseract trained-data files.

    A non-empty ``TESSDATA_PREFIX`` environment variable takes precedence.
    Otherwise the path depends on whether we are running on Qubes.
    """
    prefix = os.environ.get("TESSDATA_PREFIX")
    if prefix:
        return prefix
    if running_on_qubes():
        return "/usr/share/tesseract/tessdata/"
    return "/usr/share/tessdata/"
class DangerzoneConverter:
def __init__(self, progress_callback: Optional[Callable] = None) -> None:
self.percentage: float = 0.0

View file

@ -1,161 +0,0 @@
"""
Here are the steps, with progress bar percentages:
- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages)
- 95%-100%: Compress the final PDF
"""
import asyncio
import contextlib
import glob
import io
import json
import os
import sys
from typing import Optional
from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes
# XXX: PyMUPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to
# stderr, but it's based on environment variables. These envvars are consulted at import
# time [3], so we have to set them here, before we import `fitz`.
#
# [1] https://github.com/freedomofpress/dangerzone/issues/877
# [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724
# [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63
os.environ["PYMUPDF_MESSAGE"] = "fd:2"
os.environ["PYMUPDF_LOG"] = "fd:2"
class PixelsToPDF(DangerzoneConverter):
    """Assemble per-page RGB pixel dumps into one PDF, optionally OCRed."""

    async def convert(
        self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None
    ) -> None:
        """Build the safe PDF from the pages found under ``{tempdir}/pixels``.

        Every page N is read from three files: ``page-N.rgb`` (raw pixel
        data), ``page-N.width`` and ``page-N.height``.

        Args:
            ocr_lang: Language passed to the OCR call. A falsy value skips
                OCR and embeds the page image as-is.
            tempdir: Directory that contains the ``pixels`` subdirectory.
                Defaults to ``/safezone``.

        Raises:
            ValueError: If no ``page-*.rgb`` file is found, or if a width or
                height file does not hold an integer.
        """
        self.percentage = 50.0
        if tempdir is None:
            tempdir = "/safezone"

        # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems
        import fitz

        num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb"))
        if num_pages == 0:
            # The per-page percentage below divides by `num_pages`. Raising a
            # ValueError here (instead of letting a ZeroDivisionError escape)
            # keeps the failure within the exception types that the callers
            # of convert() handle.
            raise ValueError("No pages found to convert")

        safe_doc = fitz.Document()

        # Convert RGB files to PDF files
        percentage_per_page = 45.0 / num_pages
        for page_num in range(1, num_pages + 1):
            filename_base = f"{tempdir}/pixels/page-{page_num}"
            rgb_filename = f"{filename_base}.rgb"
            width_filename = f"{filename_base}.width"
            height_filename = f"{filename_base}.height"

            with open(width_filename) as f:
                width = int(f.read().strip())
            with open(height_filename) as f:
                height = int(f.read().strip())
            with open(rgb_filename, "rb") as rgb_f:
                untrusted_rgb_data = rgb_f.read()

            pixmap = fitz.Pixmap(
                fitz.Colorspace(fitz.CS_RGB),
                width,
                height,
                untrusted_rgb_data,
                False,
            )
            pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)

            if ocr_lang:  # OCR the document
                self.update_progress(
                    f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"
                )
                if int(fitz.version[2]) >= 20230621000001:
                    page_pdf_bytes = pixmap.pdfocr_tobytes(
                        compress=True,
                        language=ocr_lang,
                        tessdata=get_tessdata_dir(),
                    )
                else:
                    # XXX: In PyMuPDF v1.22.5, the function signature of
                    # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument
                    # to explicitly set the Tesseract data dir [1].
                    #
                    # In earlier versions, the PyMuPDF developers recommend setting this
                    # path via the TESSDATA_PREFIX environment variable. In practice,
                    # this environment variable is read at import time, so subsequent
                    # changes to the environment variable are not tracked [2].
                    #
                    # To make things worse, any attempt to alter the internal attribute
                    # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using
                    # the OCR functions. That's due to the way imports work in `fitz`,
                    # where somehow the internal `fitz.fitz` module is shadowed.
                    #
                    # A hacky solution is to grab the `fitz.fitz` module from
                    # `sys.modules`, and set there the TESSDATA_PREFIX variable. We can
                    # get away with this hack because we have a proper solution for
                    # subsequent PyMuPDF versions, and we know that nothing will change
                    # in older versions.
                    #
                    # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5
                    #
                    # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save
                    # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308
                    sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir()  # type: ignore [attr-defined]
                    page_pdf_bytes = pixmap.pdfocr_tobytes(
                        compress=True,
                        language=ocr_lang,
                    )
            else:  # Don't OCR
                self.update_progress(
                    f"Converting page {page_num}/{num_pages} from pixels to PDF"
                )
                page_doc = fitz.Document()
                page_doc.insert_file(pixmap)
                page_pdf_bytes = page_doc.tobytes(deflate_images=True)

            # Append this page's single-page PDF to the output document.
            safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes))
            self.percentage += percentage_per_page

        self.percentage = 100.0
        self.update_progress("Safe PDF created")

        # Move converted files into /safezone
        if running_on_qubes():
            safe_pdf_path = f"{tempdir}/safe-output-compressed.pdf"
        else:
            safe_pdf_path = "/safezone/safe-output-compressed.pdf"

        safe_doc.save(safe_pdf_path, deflate_images=True)

    def update_progress(self, text: str, *, error: bool = False) -> None:
        """Report progress, using the current value of ``self.percentage``.

        On Qubes the report goes to ``self.progress_callback`` (when one is
        set); otherwise it is printed to stdout as a single JSON object with
        the keys ``error``, ``text`` and ``percentage``.

        Args:
            text: Human-readable status message.
            error: Whether the message describes a failure.
        """
        if running_on_qubes():
            if self.progress_callback:
                self.progress_callback(error, text, self.percentage)
        else:
            print(
                json.dumps(
                    {"error": error, "text": text, "percentage": self.percentage}
                )
            )
            sys.stdout.flush()
async def main() -> int:
    """Run the pixels-to-PDF conversion, configured via environment variables.

    OCR is requested only when ``OCR`` equals ``"1"``; the language is then
    read from ``OCR_LANGUAGE``.

    Returns:
        0 on success, or 1 when the conversion raised a RuntimeError or
        ValueError (the message is reported as an error progress update).
    """
    ocr_lang = None
    if os.environ.get("OCR") == "1":
        ocr_lang = os.environ.get("OCR_LANGUAGE")

    converter = PixelsToPDF()
    try:
        await converter.convert(ocr_lang)
    except (RuntimeError, ValueError) as e:
        converter.update_progress(str(e), error=True)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))

View file

@ -220,12 +220,6 @@ class IsolationProvider(ABC):
text = "Successfully converted document"
self.print_progress(document, False, text, 100)
@abstractmethod
def pixels_to_pdf(
    self, document: Document, tempdir: str, ocr_lang: Optional[str]
) -> None:
    """Run the pixels-to-PDF phase for `document`.

    Abstract hook with no behavior of its own; each isolation provider
    supplies the implementation.
    """
def print_progress(
self, document: Document, error: bool, text: str, percentage: float
) -> None:
@ -352,74 +346,3 @@ class IsolationProvider(ABC):
f"{debug_log}" # no need for an extra newline here
f"{DOC_TO_PIXELS_LOG_END}"
)
# From global_common:
# def validate_convert_to_pixel_output(self, common, output):
# """
# Take the output from the convert to pixels tasks and validate it. Returns
# a tuple like: (success (boolean), error_message (str))
# """
# max_image_width = 10000
# max_image_height = 10000
# # Did we hit an error?
# for line in output.split("\n"):
# if (
# "failed:" in line
# or "The document format is not supported" in line
# or "Error" in line
# ):
# return False, output
# # How many pages was that?
# num_pages = None
# for line in output.split("\n"):
# if line.startswith("Document has "):
# num_pages = line.split(" ")[2]
# break
# if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0:
# return False, "Invalid number of pages returned"
# num_pages = int(num_pages)
# # Make sure we have the files we expect
# expected_filenames = []
# for i in range(1, num_pages + 1):
# expected_filenames += [
# f"page-{i}.rgb",
# f"page-{i}.width",
# f"page-{i}.height",
# ]
# expected_filenames.sort()
# actual_filenames = os.listdir(common.pixel_dir.name)
# actual_filenames.sort()
# if expected_filenames != actual_filenames:
# return (
# False,
# f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}",
# )
# # Make sure the files are the correct sizes
# for i in range(1, num_pages + 1):
# with open(f"{common.pixel_dir.name}/page-{i}.width") as f:
# w_str = f.read().strip()
# with open(f"{common.pixel_dir.name}/page-{i}.height") as f:
# h_str = f.read().strip()
# w = int(w_str)
# h = int(h_str)
# if (
# not w_str.isdigit()
# or not h_str.isdigit()
# or w <= 0
# or w > max_image_width
# or h <= 0
# or h > max_image_height
# ):
# return False, f"Page {i} has invalid geometry"
# # Make sure the RGB file is the correct size
# if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3:
# return False, f"Page {i} has an invalid RGB file size"
# return True, True

View file

@ -1,24 +1,15 @@
import gzip
import json
import logging
import os
import platform
import shlex
import shutil
import subprocess
import sys
from typing import Any, List, Optional, Tuple
from typing import List, Tuple
from ..conversion import errors
from ..document import Document
from ..util import get_tmp_dir # NOQA : required for mocking in our tests.
from ..util import get_resource_path, get_subprocess_startupinfo
from .base import (
PIXELS_TO_PDF_LOG_END,
PIXELS_TO_PDF_LOG_START,
IsolationProvider,
terminate_process_group,
)
from .base import IsolationProvider, terminate_process_group
TIMEOUT_KILL = 5 # Timeout in seconds until the kill command returns.
@ -234,31 +225,6 @@ class Container(IsolationProvider):
"""Unique container name for the pixels-to-pdf phase."""
return f"dangerzone-pixels-to-pdf-{document.id}"
def assert_field_type(self, val: Any, _type: object) -> None:
    """Raise ValueError unless `val` is exactly of type `_type`.

    The comparison is on the exact type rather than `isinstance()`, because
    `bool` is a subclass of `int` and would otherwise pass as one.

    See https://stackoverflow.com/a/37888668
    """
    if type(val) is _type:
        return
    raise ValueError("Status field has incorrect type")
def parse_progress_trusted(self, document: Document, line: str) -> None:
    """
    Parse one JSON status line returned by the container and report it.

    The line must decode to a mapping with a `text` (str), an `error` (bool)
    and a `percentage` (float) field. Any failure along the way is reported
    as an error progress update with a percentage of -1.
    """
    expected_fields = (("text", str), ("error", bool), ("percentage", float))
    try:
        status = json.loads(line)
        values = {}
        for field_name, field_type in expected_fields:
            values[field_name] = status[field_name]
            self.assert_field_type(values[field_name], field_type)
        self.print_progress(
            document, values["error"], values["text"], values["percentage"]
        )
    except Exception:
        error_message = f"Invalid JSON returned from container:\n\n\t {line}"
        self.print_progress(document, True, error_message, -1)
def exec(
self,
args: List[str],
@ -337,84 +303,6 @@ class Container(IsolationProvider):
f"Unexpected error occurred while killing container '{name}': {str(e)}"
)
def pixels_to_pdf(
    self, document: Document, tempdir: str, ocr_lang: Optional[str]
) -> None:
    """Run the pixels-to-PDF phase in a container and collect the safe PDF.

    Starts the `dangerzone.conversion.pixels_to_pdf` module in a container
    with `tempdir` mounted at /safezone, forwards every stdout line to
    `parse_progress_trusted()`, and on success moves
    `safe-output-compressed.pdf` from `tempdir` to
    `document.output_filename`, replacing any existing file.

    Raises:
        The exception built by `errors.exception_from_error_code()` when the
        container process exits with a non-zero code.
    """
    # XXX: Until #748 gets merged, the pixels to PDF phase has to run in a
    # container, which involves mounting two temp dirs. That does not work well
    # with gVisor, for two reasons:
    #
    # 1. Our gVisor integration chroot()s into
    #    /home/dangerzone/dangerzone-image/rootfs, so the location of the temp
    #    dirs would have to be relative to that path.
    # 2. Reading and writing to these temp dirs requires permissions that the
    #    user within gVisor's user namespace does not have.
    #
    # Because of the above, and because the pixels to PDF phase is more trusted
    # (and will soon stop being containerized), gVisor is circumvented here:
    #
    # 1. The entrypoint script is overridden with a no-op command (/usr/bin/env).
    # 2. PYTHONPATH is set so that the Python code within
    #    /home/dangerzone/dangerzone-image/rootfs can be imported.
    # 3. The container runs as the root user, so that it can always write to the
    #    mounted directories. This container is trusted, so running as root has
    #    no impact on the security of Dangerzone.
    img_root = "/home/dangerzone/dangerzone-image/rootfs"
    ocr_flag = 0 if ocr_lang is None else 1
    container_args = [
        "-v",
        f"{tempdir}:/safezone:Z",
        "-e",
        f"OCR={ocr_flag}",
        "-e",
        f"OCR_LANGUAGE={ocr_lang}",
        "--entrypoint",
        "/usr/bin/env",
        "-e",
        f"PYTHONPATH={img_root}/opt/dangerzone:{img_root}/usr/lib/python3.12/site-packages",
        "-e",
        f"TESSDATA_PREFIX={img_root}/usr/share/tessdata",
        "-u",
        "root",
    ]
    # Convert pixels to safe PDF
    entry_command = ["/usr/bin/python3", "-m", "dangerzone.conversion.pixels_to_pdf"]

    container_name = self.pixels_to_pdf_container_name(document)
    proc = self.exec_container(entry_command, container_name, container_args)
    if proc.stdout:
        for raw_line in proc.stdout:
            self.parse_progress_trusted(document, raw_line.decode())
    exit_code = proc.wait()

    # In case of a dev run, log everything from the second container.
    if getattr(sys, "dangerzone_dev", False):
        assert proc.stderr
        captured = proc.stderr.read().decode()
        log.info(
            "Conversion output: (pixels to PDF)\n"
            f"{PIXELS_TO_PDF_LOG_START}\n{captured}\n{PIXELS_TO_PDF_LOG_END}"
        )

    if exit_code != 0:
        log.error("pixels-to-pdf failed")
        raise errors.exception_from_error_code(exit_code)

    # Move the final file to the right place
    if os.path.exists(document.output_filename):
        os.remove(document.output_filename)
    produced_pdf = os.path.join(tempdir, "safe-output-compressed.pdf")
    shutil.move(produced_pdf, document.output_filename)
def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen:
# Convert document to pixels
command = [

View file

@ -1,20 +1,16 @@
import asyncio
import io
import logging
import os
import shutil
import subprocess
import sys
import zipfile
from pathlib import Path
from typing import IO, Optional
from typing import IO
from ..conversion import errors
from ..conversion.common import running_on_qubes
from ..conversion.pixels_to_pdf import PixelsToPDF
from ..document import Document
from ..util import get_resource_path
from .base import PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, IsolationProvider
from .base import IsolationProvider
log = logging.getLogger(__name__)
@ -25,28 +21,6 @@ class Qubes(IsolationProvider):
def install(self) -> bool:
    """Return True unconditionally; this provider performs no install step here."""
    return True
def pixels_to_pdf(
    self, document: Document, tempdir: str, ocr_lang: Optional[str]
) -> None:
    """Convert the pixels under `tempdir` to a PDF in-process and store it.

    Progress reported by the converter is relayed to `print_progress()` for
    `document`. On success, `{tempdir}/safe-output-compressed.pdf` is moved
    to `document.output_filename`.

    Raises:
        errors.UnexpectedConversionError: If the converter raises a
            RuntimeError or ValueError.
    """
    converter = PixelsToPDF(
        progress_callback=lambda error, text, percentage: self.print_progress(
            document, error, text, percentage
        )
    )
    try:
        asyncio.run(converter.convert(ocr_lang, tempdir))
    except (RuntimeError, ValueError) as e:
        raise errors.UnexpectedConversionError(str(e))
    finally:
        # In a dev run, log what the converter captured, whether or not the
        # conversion succeeded.
        if getattr(sys, "dangerzone_dev", False):
            captured = converter.captured_output.decode()
            log.info(
                "Conversion output: (pixels to PDF)\n"
                f"{PIXELS_TO_PDF_LOG_START}\n{captured}{PIXELS_TO_PDF_LOG_END}"
            )

    shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename)
def get_max_parallel_conversions(self) -> int:
    """Return the maximum number of parallel conversions, which is always 1."""
    return 1

View file

@ -3,7 +3,6 @@ import platform
import subprocess
import sys
import unicodedata
from typing import Optional
import appdirs
@ -12,17 +11,6 @@ def get_config_dir() -> str:
return appdirs.user_config_dir("dangerzone")
def get_tmp_dir() -> Optional[str]:
    """Return the parent directory for Dangerzone's temporary directories.

    The result is always None, which leaves the choice of location to Python
    (e.g., `/tmp` on Linux). The function is still defined so that tests can
    set this directory by mocking it.
    """
    return None
def get_resource_path(filename: str) -> str:
if getattr(sys, "dangerzone_dev", False):
# Look for resources directory relative to python file

View file

@ -134,29 +134,19 @@ class TestCli:
if os.environ.get("DUMMY_CONVERSION", False):
args = ("--unsafe-dummy-conversion", *args)
with tempfile.TemporaryDirectory() as t:
tmp_dir = Path(t)
# TODO: Replace this with `contextlib.chdir()` [1], which was added in
# Python 3.11.
#
# [1]: https://docs.python.org/3/library/contextlib.html#contextlib.chdir
try:
if tmp_path is not None:
cwd = os.getcwd()
os.chdir(tmp_path)
# TODO: Replace this with `contextlib.chdir()` [1], which was added in
# Python 3.11.
#
# [1]: https://docs.python.org/3/library/contextlib.html#contextlib.chdir
try:
if tmp_path is not None:
cwd = os.getcwd()
os.chdir(tmp_path)
with mock.patch(
"dangerzone.isolation_provider.container.get_tmp_dir",
return_value=t,
):
result = CliRunner().invoke(cli_main, args)
finally:
if tmp_path is not None:
os.chdir(cwd)
if tmp_dir.exists():
stale_files = list(tmp_dir.iterdir())
assert not stale_files
result = CliRunner().invoke(cli_main, args)
finally:
if tmp_path is not None:
os.chdir(cwd)
# XXX Print stdout so that junitXML exports with output capturing
# actually include the stdout + stderr (they are combined into stdout)